From 1544d47d847607cf2b9a449d586ebf53f1bc241a Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horikyoga.ntt@gmail.com>
Date: Tue, 14 Jul 2020 12:02:57 +0900
Subject: [PATCH v1] Fix conversion-table generator scripts

convutils.pm relied on the implicit conversion of undefined values into
integer zero. Some of the conversion scripts are susceptible to regexp
greediness. Fix all of them.  Also follow the changes in the ICU site's
configuration.

This change yields one significant difference in the resulting map files
for UHC: they no longer contain mappings for characters in the ranges
c9xx and fexx.
---
 src/backend/utils/mb/Unicode/Makefile         |  2 +-
 .../utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl   |  4 +-
 src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl |  3 +-
 .../utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl |  4 +-
 src/backend/utils/mb/Unicode/convutils.pm     | 56 ++++++++++---------
 5 files changed, 37 insertions(+), 32 deletions(-)

diff --git a/src/backend/utils/mb/Unicode/Makefile b/src/backend/utils/mb/Unicode/Makefile
index 9084f03009..4645441b64 100644
--- a/src/backend/utils/mb/Unicode/Makefile
+++ b/src/backend/utils/mb/Unicode/Makefile
@@ -122,7 +122,7 @@ euc-jis-2004-std.txt sjis-0213-2004-std.txt:
 	$(DOWNLOAD) http://x0213.org/codetable/$(@F)
 
 gb-18030-2000.xml windows-949-2000.xml:
-	$(DOWNLOAD) https://ssl.icu-project.org/repos/icu/data/trunk/charset/data/xml/$(@F)
+	echo "The source for $(@F) no longer exists"
 
 GB2312.TXT:
 	$(DOWNLOAD) 'http://trac.greenstone.org/browser/trunk/gsdl/unicode/MAPPINGS/EASTASIA/GB/GB2312.TXT?rev=1842&format=txt'
diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl
index 092a5b44f5..62500efc6d 100755
--- a/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl
@@ -24,7 +24,7 @@ my @all;
 
 while (my $line = <$in>)
 {
-	if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/)
+	if ($line =~ /^0x(\w+)[ \t]*U\+(\w+)\+(\w+)[ \t]*#(.*)$/)
 	{
 
 		# combined characters
@@ -45,7 +45,7 @@ while (my $line = <$in>)
 			l          => $.
 		  };
 	}
-	elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
+	elsif ($line =~ /^0x(\w+)[ \t]*U\+(\w+)[ \t]*#(.*)$/)
 	{
 
 		# non-combined characters
diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl
index 1d88c0296e..d8bed27e1b 100755
--- a/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl
@@ -80,7 +80,8 @@ foreach my $i (@$ct932)
 	}
 }
 
-foreach my $i (@mapping)
+# extract only SJIS characters
+foreach my $i (grep defined $_->{sjis}, @mapping)
 {
 	my $sjis = $i->{sjis};
 
diff --git a/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl b/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl
index b516e91306..025b0d2798 100755
--- a/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl
@@ -24,7 +24,7 @@ my @mapping;
 
 while (my $line = <$in>)
 {
-	if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/)
+	if ($line =~ /^0x(\w+)[ \t]*U\+(\w+)\+(\w+)[ \t]*#(.*)$/)
 	{
 
 		# combined characters
@@ -45,7 +45,7 @@ while (my $line = <$in>)
 			l          => $.
 		  };
 	}
-	elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
+	elsif ($line =~ /^0x(\w+)[ \t]*U\+(\w+)[ \t]*#(.*)$/)
 	{
 
 		# non-combined characters
diff --git a/src/backend/utils/mb/Unicode/convutils.pm b/src/backend/utils/mb/Unicode/convutils.pm
index 2f64a12ea1..9d97061c6f 100644
--- a/src/backend/utils/mb/Unicode/convutils.pm
+++ b/src/backend/utils/mb/Unicode/convutils.pm
@@ -380,7 +380,8 @@ sub print_radix_table
 	  {
 		header  => "Dummy map, for invalid values",
 		min_idx => 0,
-		max_idx => $widest_range
+		max_idx => $widest_range,
+		label => "dummy map"
 	  };
 
 	###
@@ -471,35 +472,37 @@ sub print_radix_table
 	}
 
 	# Also look up the positions of the roots in the table.
-	my $b1root = $segmap{"1-byte"};
-	my $b2root = $segmap{"2-byte"};
-	my $b3root = $segmap{"3-byte"};
-	my $b4root = $segmap{"4-byte"};
+	# Missing map represents dummy mapping.
+	my $b1root = $segmap{"1-byte"} || 0;
+	my $b2root = $segmap{"2-byte"} || 0;
+	my $b3root = $segmap{"3-byte"} || 0;
+	my $b4root = $segmap{"4-byte"} || 0;
 
 	# And the lower-upper values of each level in each radix tree.
-	my $b1_lower = $min_idx{1}{1};
-	my $b1_upper = $max_idx{1}{1};
+	# Missing values represent zero.
+	my $b1_lower = $min_idx{1}{1} || 0;
+	my $b1_upper = $max_idx{1}{1} || 0;
 
-	my $b2_1_lower = $min_idx{2}{1};
-	my $b2_1_upper = $max_idx{2}{1};
-	my $b2_2_lower = $min_idx{2}{2};
-	my $b2_2_upper = $max_idx{2}{2};
+	my $b2_1_lower = $min_idx{2}{1} || 0;
+	my $b2_1_upper = $max_idx{2}{1} || 0;
+	my $b2_2_lower = $min_idx{2}{2} || 0;
+	my $b2_2_upper = $max_idx{2}{2} || 0;
 
-	my $b3_1_lower = $min_idx{3}{1};
-	my $b3_1_upper = $max_idx{3}{1};
-	my $b3_2_lower = $min_idx{3}{2};
-	my $b3_2_upper = $max_idx{3}{2};
-	my $b3_3_lower = $min_idx{3}{3};
-	my $b3_3_upper = $max_idx{3}{3};
+	my $b3_1_lower = $min_idx{3}{1} || 0;
+	my $b3_1_upper = $max_idx{3}{1} || 0;
+	my $b3_2_lower = $min_idx{3}{2} || 0;
+	my $b3_2_upper = $max_idx{3}{2} || 0;
+	my $b3_3_lower = $min_idx{3}{3} || 0;
+	my $b3_3_upper = $max_idx{3}{3} || 0;
 
-	my $b4_1_lower = $min_idx{4}{1};
-	my $b4_1_upper = $max_idx{4}{1};
-	my $b4_2_lower = $min_idx{4}{2};
-	my $b4_2_upper = $max_idx{4}{2};
-	my $b4_3_lower = $min_idx{4}{3};
-	my $b4_3_upper = $max_idx{4}{3};
-	my $b4_4_lower = $min_idx{4}{4};
-	my $b4_4_upper = $max_idx{4}{4};
+	my $b4_1_lower = $min_idx{4}{1} || 0;
+	my $b4_1_upper = $max_idx{4}{1} || 0;
+	my $b4_2_lower = $min_idx{4}{2} || 0;
+	my $b4_2_upper = $max_idx{4}{2} || 0;
+	my $b4_3_lower = $min_idx{4}{3} || 0;
+	my $b4_3_upper = $max_idx{4}{3} || 0;
+	my $b4_4_lower = $min_idx{4}{4} || 0;
+	my $b4_4_upper = $max_idx{4}{4} || 0;
 
 	###
 	### Find the maximum value in the whole table, to determine if we can
@@ -607,7 +610,8 @@ sub print_radix_table
 			for (my $j = 0;
 				$j < $vals_per_line && $i <= $seg->{max_idx}; $j++)
 			{
-				my $val = $seg->{values}->{$i};
+				# missing values represent zero.
+				my $val = $seg->{values}->{$i} || 0;
 
 				printf $out " 0x%0*x", $colwidth, $val;
 				$off++;
-- 
2.18.4

