From e59292beb2aeb1860734ead992cea38f59f6cab6 Mon Sep 17 00:00:00 2001
From: Jacob Champion <pchampion@vmware.com>
Date: Wed, 21 Jul 2021 10:41:05 -0700
Subject: [PATCH] ucs_wcwidth: update Fullwidth and Wide codepoint set

The hardcoded "wide character" set at the end of ucs_wcwidth() was last
touched around the Unicode 5.0 era.  This led to misalignment on modern
platforms when printing emoji and other codepoints that have since been
designated wide/fullwidth.

Use an interval table for these codepoints, and add a recipe to the
existing update-unicode rule to keep it up to date.

TODO: performance implications
---
 src/common/unicode/.gitignore                 |   1 +
 src/common/unicode/Makefile                   |   9 +-
 .../generate-unicode_east_asian_fw_table.pl   |  76 +++++++++++
 src/common/wchar.c                            |  18 +--
 .../common/unicode_east_asian_fw_table.h      | 120 ++++++++++++++++++
 5 files changed, 208 insertions(+), 16 deletions(-)
 create mode 100644 src/common/unicode/generate-unicode_east_asian_fw_table.pl
 create mode 100644 src/include/common/unicode_east_asian_fw_table.h

diff --git a/src/common/unicode/.gitignore b/src/common/unicode/.gitignore
index 512862e538..46243f701d 100644
--- a/src/common/unicode/.gitignore
+++ b/src/common/unicode/.gitignore
@@ -4,5 +4,6 @@
 # Downloaded files
 /CompositionExclusions.txt
 /DerivedNormalizationProps.txt
+/EastAsianWidth.txt
 /NormalizationTest.txt
 /UnicodeData.txt
diff --git a/src/common/unicode/Makefile b/src/common/unicode/Makefile
index eb14add28a..a3683dd86b 100644
--- a/src/common/unicode/Makefile
+++ b/src/common/unicode/Makefile
@@ -18,14 +18,14 @@ LIBS += $(PTHREAD_LIBS)
 # By default, do nothing.
 all:
 
-update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
+update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
 	mv $^ ../../../src/include/common/
 	$(MAKE) normalization-check
 
 # These files are part of the Unicode Character Database. Download
 # them on demand.  The dependency on Makefile.global is for
 # UNICODE_VERSION.
-UnicodeData.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
+UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
 	$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
 
 # Generation of conversion tables used for string normalization with
@@ -38,6 +38,9 @@ unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt Composition
 unicode_combining_table.h: generate-unicode_combining_table.pl UnicodeData.txt
 	$(PERL) $^ >$@
 
+unicode_east_asian_fw_table.h: generate-unicode_east_asian_fw_table.pl EastAsianWidth.txt
+	$(PERL) $^ >$@
+
 unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizationProps.txt
 	$(PERL) $^ >$@
 
@@ -64,6 +67,6 @@ clean:
 	rm -f $(OBJS) norm_test norm_test.o
 
 distclean: clean
-	rm -f UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
+	rm -f UnicodeData.txt EastAsianWidth.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
 
 maintainer-clean: distclean
diff --git a/src/common/unicode/generate-unicode_east_asian_fw_table.pl b/src/common/unicode/generate-unicode_east_asian_fw_table.pl
new file mode 100644
index 0000000000..d29fdd5157
--- /dev/null
+++ b/src/common/unicode/generate-unicode_east_asian_fw_table.pl
@@ -0,0 +1,76 @@
+#!/usr/bin/perl
+#
+# Generate sorted list of non-overlapping intervals of East Asian Wide (W) and
+# East Asian Fullwidth (F) characters, using Unicode data files as input.  Pass
+# EastAsianWidth.txt as argument.  The output is on stdout.
+#
+# Copyright (c) 2019-2021, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+
+# Start of the wide/fullwidth interval being accumulated (undef when none is
+# open), bounds of the current input line, and upper bound of the previous one.
+my ($range_start, $first, $last, $prev_last);
+
+print
+  "/* generated by src/common/unicode/generate-unicode_east_asian_fw_table.pl, do not edit */\n\n";
+
+print "static const struct mbinterval east_asian_fw[] = {\n";
+
+while (my $line = <ARGV>)
+{
+	chomp $line;
+	$line =~ s/\s*(?:#.*)?$//;    # drop trailing comment and/or whitespace
+	next if $line eq '';
+	my ($codepoint, $width) = split /\s*;\s*/, $line;    # "AAAA;W" or "AAAA ; W"
+
+	if ($codepoint =~ /\.\./)
+	{
+		($first, $last) = split /\.\./, $codepoint;
+	}
+	else
+	{
+		$first = $last = $codepoint;
+	}
+
+	($first, $last) = map(hex, ($first, $last));
+
+	if ($width eq 'F' || $width eq 'W')
+	{
+		# fullwidth/wide characters
+		if (!defined($range_start))
+		{
+			# no interval is open yet, so this line starts one
+			$range_start = $first;
+		}
+		elsif ($first != $prev_last + 1)
+		{
+			# gap since the previous line; emit the open interval, start anew
+			printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
+			$range_start = $first;
+		}
+	}
+	else
+	{
+		# not wide characters, so close and print the open interval, if any
+		if (defined($range_start))
+		{
+			printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
+			$range_start = undef;
+		}
+	}
+}
+continue
+{
+	$prev_last = $last;    # also reached via "next"; $last is unchanged then
+}
+
+# don't forget any ranges at the very end of the database (though there are none
+# as of Unicode 13.0)
+if (defined($range_start))
+{
+	printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
+}
+
+print "};\n";
diff --git a/src/common/wchar.c b/src/common/wchar.c
index 0636b8765b..43f1078ae6 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -583,8 +583,8 @@ pg_utf_mblen(const unsigned char *s)
 
 struct mbinterval
 {
-	unsigned short first;
-	unsigned short last;
+	unsigned int first;		/* first codepoint of the interval (inclusive) */
+	unsigned int last;		/* last codepoint of the interval (inclusive) */
 };
 
 /* auxiliary function for binary search in interval table */
@@ -645,6 +645,7 @@ static int
 ucs_wcwidth(pg_wchar ucs)
 {
 #include "common/unicode_combining_table.h"
+#include "common/unicode_east_asian_fw_table.h"
 
 	/* test for 8-bit control characters */
 	if (ucs == 0)
@@ -663,17 +664,8 @@ ucs_wcwidth(pg_wchar ucs)
 	 */
 
 	return 1 +
-		(ucs >= 0x1100 &&
-		 (ucs <= 0x115f ||		/* Hangul Jamo init. consonants */
-		  (ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a &&
-		   ucs != 0x303f) ||	/* CJK ... Yi */
-		  (ucs >= 0xac00 && ucs <= 0xd7a3) ||	/* Hangul Syllables */
-		  (ucs >= 0xf900 && ucs <= 0xfaff) ||	/* CJK Compatibility
-												 * Ideographs */
-		  (ucs >= 0xfe30 && ucs <= 0xfe6f) ||	/* CJK Compatibility Forms */
-		  (ucs >= 0xff00 && ucs <= 0xff5f) ||	/* Fullwidth Forms */
-		  (ucs >= 0xffe0 && ucs <= 0xffe6) ||
-		  (ucs >= 0x20000 && ucs <= 0x2ffff)));
+		mbbisearch(ucs, east_asian_fw,
+				   sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1);
 }
 
 /*
diff --git a/src/include/common/unicode_east_asian_fw_table.h b/src/include/common/unicode_east_asian_fw_table.h
new file mode 100644
index 0000000000..b27f95b5dc
--- /dev/null
+++ b/src/include/common/unicode_east_asian_fw_table.h
@@ -0,0 +1,120 @@
+/* generated by src/common/unicode/generate-unicode_east_asian_fw_table.pl, do not edit */
+
+static const struct mbinterval east_asian_fw[] = {
+	{0x1100, 0x115F},
+	{0x231A, 0x231B},
+	{0x2329, 0x232A},
+	{0x23E9, 0x23EC},
+	{0x23F0, 0x23F0},
+	{0x23F3, 0x23F3},
+	{0x25FD, 0x25FE},
+	{0x2614, 0x2615},
+	{0x2648, 0x2653},
+	{0x267F, 0x267F},
+	{0x2693, 0x2693},
+	{0x26A1, 0x26A1},
+	{0x26AA, 0x26AB},
+	{0x26BD, 0x26BE},
+	{0x26C4, 0x26C5},
+	{0x26CE, 0x26CE},
+	{0x26D4, 0x26D4},
+	{0x26EA, 0x26EA},
+	{0x26F2, 0x26F3},
+	{0x26F5, 0x26F5},
+	{0x26FA, 0x26FA},
+	{0x26FD, 0x26FD},
+	{0x2705, 0x2705},
+	{0x270A, 0x270B},
+	{0x2728, 0x2728},
+	{0x274C, 0x274C},
+	{0x274E, 0x274E},
+	{0x2753, 0x2755},
+	{0x2757, 0x2757},
+	{0x2795, 0x2797},
+	{0x27B0, 0x27B0},
+	{0x27BF, 0x27BF},
+	{0x2B1B, 0x2B1C},
+	{0x2B50, 0x2B50},
+	{0x2B55, 0x2B55},
+	{0x2E80, 0x2E99},
+	{0x2E9B, 0x2EF3},
+	{0x2F00, 0x2FD5},
+	{0x2FF0, 0x2FFB},
+	{0x3000, 0x303E},
+	{0x3041, 0x3096},
+	{0x3099, 0x30FF},
+	{0x3105, 0x312F},
+	{0x3131, 0x318E},
+	{0x3190, 0x31E3},
+	{0x31F0, 0x321E},
+	{0x3220, 0x3247},
+	{0x3250, 0x4DBF},
+	{0x4E00, 0xA48C},
+	{0xA490, 0xA4C6},
+	{0xA960, 0xA97C},
+	{0xAC00, 0xD7A3},
+	{0xF900, 0xFAFF},
+	{0xFE10, 0xFE19},
+	{0xFE30, 0xFE52},
+	{0xFE54, 0xFE66},
+	{0xFE68, 0xFE6B},
+	{0xFF01, 0xFF60},
+	{0xFFE0, 0xFFE6},
+	{0x16FE0, 0x16FE4},
+	{0x16FF0, 0x16FF1},
+	{0x17000, 0x187F7},
+	{0x18800, 0x18CD5},
+	{0x18D00, 0x18D08},
+	{0x1B000, 0x1B11E},
+	{0x1B150, 0x1B152},
+	{0x1B164, 0x1B167},
+	{0x1B170, 0x1B2FB},
+	{0x1F004, 0x1F004},
+	{0x1F0CF, 0x1F0CF},
+	{0x1F18E, 0x1F18E},
+	{0x1F191, 0x1F19A},
+	{0x1F200, 0x1F202},
+	{0x1F210, 0x1F23B},
+	{0x1F240, 0x1F248},
+	{0x1F250, 0x1F251},
+	{0x1F260, 0x1F265},
+	{0x1F300, 0x1F320},
+	{0x1F32D, 0x1F335},
+	{0x1F337, 0x1F37C},
+	{0x1F37E, 0x1F393},
+	{0x1F3A0, 0x1F3CA},
+	{0x1F3CF, 0x1F3D3},
+	{0x1F3E0, 0x1F3F0},
+	{0x1F3F4, 0x1F3F4},
+	{0x1F3F8, 0x1F43E},
+	{0x1F440, 0x1F440},
+	{0x1F442, 0x1F4FC},
+	{0x1F4FF, 0x1F53D},
+	{0x1F54B, 0x1F54E},
+	{0x1F550, 0x1F567},
+	{0x1F57A, 0x1F57A},
+	{0x1F595, 0x1F596},
+	{0x1F5A4, 0x1F5A4},
+	{0x1F5FB, 0x1F64F},
+	{0x1F680, 0x1F6C5},
+	{0x1F6CC, 0x1F6CC},
+	{0x1F6D0, 0x1F6D2},
+	{0x1F6D5, 0x1F6D7},
+	{0x1F6EB, 0x1F6EC},
+	{0x1F6F4, 0x1F6FC},
+	{0x1F7E0, 0x1F7EB},
+	{0x1F90C, 0x1F93A},
+	{0x1F93C, 0x1F945},
+	{0x1F947, 0x1F978},
+	{0x1F97A, 0x1F9CB},
+	{0x1F9CD, 0x1F9FF},
+	{0x1FA70, 0x1FA74},
+	{0x1FA78, 0x1FA7A},
+	{0x1FA80, 0x1FA86},
+	{0x1FA90, 0x1FAA8},
+	{0x1FAB0, 0x1FAB6},
+	{0x1FAC0, 0x1FAC2},
+	{0x1FAD0, 0x1FAD6},
+	{0x20000, 0x2FFFD},
+	{0x30000, 0x3FFFD},
+};
-- 
2.25.1

