From b37d4a85df2799351a870b4811945a2b63dde032 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horiguchi.kyotaro@lab.ntt.co.jp>
Date: Thu, 6 Oct 2016 19:21:38 +0900
Subject: [PATCH 1/3] Radix tree infrastructure for character encoding

Current character conversion mechanism is based on binary search over
flat tables. Especially for some languages like Japanese, which has so
many as several thousand characters requires over 10 times comparison
for each character. Radix-tree grately alleviates this pain. This
patch adds a generic conversion function, datat types, and a perl
module for converting *.map files into radix tree.
---
 src/backend/utils/mb/Unicode/RADIXCONV.pm | 726 ++++++++++++++++++++++++++++++
 src/backend/utils/mb/conv.c               | 107 ++++-
 src/include/mb/pg_wchar.h                 |  21 +
 3 files changed, 848 insertions(+), 6 deletions(-)
 create mode 100644 src/backend/utils/mb/Unicode/RADIXCONV.pm

diff --git a/src/backend/utils/mb/Unicode/RADIXCONV.pm b/src/backend/utils/mb/Unicode/RADIXCONV.pm
new file mode 100644
index 0000000..e7621a9
--- /dev/null
+++ b/src/backend/utils/mb/Unicode/RADIXCONV.pm
@@ -0,0 +1,726 @@
+#! /usr/bin/perl
+#
+# Copyright (c) 2016, PostgreSQL Global Development Group
+#
+# src/backend/utils/mb/Unicode/RADIXCONV.pl
+#
+# Helper routinges for generating radix tree file from code mapping files
+
+use strict;
+
+my $radix_type = "pg_mb_radix_tree";
+my $radix_node_type = "pg_mb_radix_node";
+
+#########################################
+# load_chartable(<map file name>)
+#
+# extract data from map files and returns a character table.
+# returns a reference to a hash <in code> => <out code>
+sub load_chartable
+{
+	my($fname) = @_;
+	my %c;
+
+	open(my $in, $fname) || die("cannot open $fname");
+
+	while(<$in>)
+	{
+		if (/^[ \t]*{0x([0-9a-f]+), *0x([0-9a-f]+)},?/)
+		{
+			$c{hex($1)} = hex($2);
+		}
+	}
+
+	return \%c;
+}
+
+#########################################
+# generate_index(<charmap hash ref>)
+#
+# generate a radix tree data from a character table
+# returns a hashref to an index data.
+# {
+#   csegs => <character segment index>
+#   b2idx => [<tree index of 1st byte of 2-byte code>]
+#   b3idx => [<idx for 1st byte for 3-byte code>, <2nd>]
+#   b4idx => [<idx for 1st byte for 4-byte code>, <2nd>, <3rd>]
+# }
+#
+# Tables are in two forms, flat and segmented. a segmented table is
+# logically a two-dimentional table but physically a sequence of
+# segments, fixed length block of items. This structure allows us to
+# shrink table size by overlapping a shared sequence of zeros between
+# successive two segments. overlap_segments does that step.
+#
+# A flat table is simple set of key and value pairs. The value is a
+# segment id of the next segmented table. The next table is referenced
+# using the segment id and the next byte of a code.
+#
+# flat table (b2idx, b3idx1, b4idx1)
+# {
+#    attr => {
+#       segmented => true(1) if this index is segmented>
+#       min  => <minimum value of index key>
+#       max  => <maximum value of index key>
+#		nextidx => <hash reference to the next level table>
+#    }
+#    i    => { # index data
+#       <byte> => <pointer value> # pointer to the next index
+#       ...
+#    }
+#
+# Each segments in segmented table is equivalent to a flat table
+# above.
+#
+# segmented table (csegs, b3idx2, b4idx2, b4idx3)
+# {
+#    attr => {
+#       segmented => true(1) if this index is segmented>
+#       min => <minimum value of index key>
+#       max => <maximum value of index key>
+#		is32bit => true if values are 32bit width, false means 16bit.
+#		next => <hash reference to the next level table, if any>
+#    }
+#    i    => { # segment data
+#       <segid> => { # key for this segment
+#          lower   => <minimum value>
+#          upper   => <maximum value>
+#          offset  => <position of this segment in the whole table>
+#          label => <label string of this segment>
+#          d => { # segment data
+#            <byte> => { # pointer to the next index
+#               label     => <label string for this item>
+#               segid     => <target segid of next level>
+#               segoffset => <offset of the target segid>
+#             }
+#            ...
+#          }
+#        }
+#    }
+# }
+
+sub generate_index
+{
+	my ($c) = @_;
+	my (%csegs, %b2idx, %b3idx1, %b3idx2, %b4idx1, %b4idx2, %b4idx3);
+	my @all_tables =
+		(\%csegs, \%b2idx, \%b3idx1, \%b3idx2, \%b4idx1, \%b4idx2, \%b4idx3);
+	my $si;
+
+	# initialize attributes of index tables
+	$csegs{attr} = {name=> "csegs", chartbl => 1, segmented => 1, is32bit => 0};
+	$b2idx{attr} = {name => "b2idx", segmented => 0, nextidx => \%csegs};
+	$b3idx1{attr} = {name => "b3idx1", segmented => 0, nextidx => \%b3idx2};
+	$b3idx2{attr} = {name => "b3idx2", segmented => 1, nextidx => \%csegs};
+	$b4idx1{attr} = {name => "b4idx1", segmented => 0, nextidx => \%b4idx2};
+	$b4idx2{attr} = {name => "b4idx2", segmented => 1, nextidx => \%b4idx3};
+	$b4idx3{attr} = {name => "b4idx3", segmented => 1, nextidx => \%csegs};
+
+	foreach my $in (keys %$c)
+	{
+		if ($in < 0x100)
+		{
+			my $b1 = $in;
+			# 1 byte code doesn't have index. the first segment #0 of
+			# character table stores them
+			$si = {segid => 0, off => $in, label => "1b-", char => $$c{$in}};
+		}
+		elsif ($in < 0x10000)
+		{
+			# 2-byte code index consists of just one flat table
+			my $b1 = $in >> 8;
+			my $b2 = $in & 0xff;
+			my $csegid = $in >> 8;
+
+			if (! defined $b2idx{i}{$b1})
+			{
+				&set_min_max($b2idx{attr}, $b1);
+				$b2idx{i}{$b1}{segid} = $csegid;
+			}
+			$si = {
+				segid => $csegid,
+				off => $b2,
+				label => sprintf("%02x", $b1),
+				char => $$c{$in}
+			};
+		}
+		elsif ($in < 0x1000000)
+		{
+			# 3-byte code index consists of one flat table and one
+			# segmented table
+			my $b1 = $in >> 16;
+			my $b2 = ($in >> 8) & 0xff;
+			my $b3 = $in & 0xff;
+			my $l1id = $in >> 16;
+			my $csegid = $in >> 8;
+
+			if (! defined $b3idx1{i}{$b1})
+			{
+				&set_min_max($b3idx1{attr}, $b1);
+				$b3idx1{i}{$b1}{segid} = $l1id;
+			}
+			if (! defined $b3idx2{i}{$l1id}{d}{$b2})
+			{
+				&set_min_max($b3idx2{attr}, $b2);
+				$b3idx2{i}{$l1id}{label} = sprintf("%02x", $b1);
+				$b3idx2{i}{$l1id}{d}{$b2} = {
+					segid => $csegid,
+					label => sprintf("%02x%02x", $b1, $b2)
+				}
+			}
+
+			$si = {
+				segid => $csegid,
+				off   => $b3,
+				label => sprintf("%02x%02x", $b1, $b2),
+				char  => $$c{$in}
+			};
+		}
+		elsif ($in < 0x100000000)
+		{
+			# 4-byte code index consists of one flat table, and two
+			# segmented tables
+			my $b1 = $in >> 24;
+			my $b2 = ($in >> 16) & 0xff;
+			my $b3 = ($in >> 8) & 0xff;
+			my $b4 = $in & 0xff;
+			my $l1id = $in >> 24;
+			my $l2id = $in >> 16;
+			my $csegid = $in >> 8;
+
+			if (! defined $b4idx1{i}{$b1})
+			{
+				&set_min_max($b4idx1{attr}, $b1);
+				$b4idx1{i}{$b1}{segid} = $l1id;
+			}
+
+			if (! defined $b4idx2{i}{$l1id}{d}{$b2})
+			{
+				&set_min_max($b4idx2{attr}, $b2);
+				$b4idx2{i}{$l1id}{d}{$b2} = {
+					segid => $l2id,
+					label => sprintf("%02x", $b1)
+				}
+			}
+			if (! defined $b4idx3{i}{$l2id}{d}{$b3})
+			{
+				&set_min_max($b4idx3{attr}, $b3);
+				$b4idx3{i}{$l2id}{d}{$b3} = {
+					segid => $csegid,
+					label => sprintf("%02x%02x", $b1, $b2)
+				}
+			}
+
+			$si = {
+				segid => $csegid,
+				off   => $b4,
+				label => sprintf("%02x%02x%02x", $b1, $b2, $b3),
+				char  => $$c{$in}
+			};
+		}
+		else
+		{
+			die sprintf("up to 4 byte code is supported: %x", $in);
+		}
+
+		&set_min_max($csegs{attr}, $$si{off});
+		$csegs{i}{$$si{segid}}{d}{$$si{off}} = $$si{char};
+		$csegs{i}{$$si{segid}}{label} = $$si{label};
+		$csegs{attr}{is32bit} = 1 if ($$si{char} >= 0x10000);
+		if ($$si{char} >= 0x100000000)
+		{
+			die "character width is over 32bit. abort.";
+		}
+	}
+
+	# calcualte segment attributes
+	foreach my $t (@all_tables)
+	{
+		next if (! defined $$t{i} || ! $$t{attr}{segmented});
+
+		# segments are to be aligned in the numerical order of segment id
+		my @keylist = sort {$a <=> $b} keys $$t{i};
+		next if ($#keylist < 0);
+		my $offset = 1;
+		my $segsize = $$t{attr}{max} - $$t{attr}{min} + 1;
+
+		for my $k (@keylist)
+		{
+			my $seg = $$t{i}{$k};
+			$$seg{lower} = $$t{attr}{min};
+			$$seg{upper} = $$t{attr}{max};
+			$$seg{offset} = $offset;
+			$offset += $segsize;
+		}
+		# overlapping successive zeros between segments
+		&overlap_segments($t);
+	}
+
+	# make link among tables
+	foreach my $t (@all_tables)
+	{
+		&make_index_link($t, $$t{attr}{nextidx});
+	}
+
+	return { name_prefix => "",
+			 csegs => \%csegs,
+			 b2idx => [\%b2idx],
+			 b3idx => [\%b3idx1, \%b3idx2],
+			 b4idx => [\%b4idx1, \%b4idx2, \%b4idx3],
+			 all => \@all_tables};
+}
+
+
+#########################################
+# set_min_max - internal routine to maintain min and max value of a table
+sub set_min_max
+{
+	my ($a, $v) = @_;
+
+	$$a{min} = $v if (! defined $$a{min} || $v < $$a{min});
+	$$a{max} = $v if (! defined $$a{max} || $v > $$a{max});
+}
+
+#########################################
+# overlap_segments
+#
+# removes duplicate regeion between two successive segments.
+
+sub overlap_segments
+{
+	my ($h) = @_;
+
+	# don't touch if undefined
+	return if (! defined $$h{i} || !$$h{attr}{segmented});
+	my $index = $$h{i};
+	my ($min, $max) = ($$h{attr}{min}, $$h{attr}{max});
+	my ($prev, $first);
+	my @segids = sort {$a <=> $b} keys $index;
+	return if ($#segids < 1);
+
+	$first = 1;
+	undef $prev;
+
+	for my $segid (@segids)
+	{
+		my $seg = $$index{$segid};
+
+		# smin and smax is range excluded preceeding and trailing zeros
+		my @keys = sort {$a <=> $b} keys $$seg{d};
+		my $smin = $keys[0];
+		my $smax = $keys[-1];
+
+		if ($first)
+		{
+			# first segment doesn't have a preceding segment
+			$$seg{offset} = 1;
+			$$seg{lower} = $min;
+			$$seg{upper} = $smax;
+		}
+		else
+		{
+			# calculate overlap and shift segment location
+			my $prefix		= $smin - $min;
+			my $postfix		= $max  - $smax;
+			my $prevpostfix	= $max - $$prev{upper};
+			my $overlap = $prevpostfix < $prefix ? $prevpostfix : $prefix;
+
+			$$seg{lower}  = $min + $overlap;
+			$$seg{upper} = $smax;
+			$$seg{offset} =	$$prev{offset} + ($max - $min + 1) - $overlap;
+			$$prev{upper} = $max;
+		}
+		$prev = $seg;
+		$first = 0;
+	}
+
+	return $h;
+}
+
+######################################################
+# make_index_link(from_table, to_table)
+#
+# Fills out target pointers in non-leaf index tables.
+#
+# from_table : table to set links
+# to_table   : target table of from_table
+
+sub make_index_link
+{
+	my ($s, $t) = @_;
+	return if (! defined $$s{i} || ! defined $$t{i});
+
+	my @tkeys = sort {$a <=> $b} keys $$t{i};
+
+	if ($$s{attr}{segmented})
+	{
+		foreach my $k1 (keys $$s{i})
+		{
+			foreach my $k2 (keys $$s{i}{$k1}{d})
+			{
+				my $tsegid = $$s{i}{$k1}{d}{$k2}{segid};
+				if (! defined $tsegid)
+				{
+					die sprintf("segid is not set in %s{i}{%x}{d}{%x}{segid}",
+								$$s{attr}{name}, $k1, $k2);
+				}
+				$$s{i}{$k1}{d}{$k2}{segoffset} = $$t{i}{$tsegid}{offset};
+			}
+		}
+	}
+	else
+	{
+		foreach my $k (keys $$s{i})
+		{
+			my $tsegid = $$s{i}{$k}{segid};
+			if (! defined $tsegid)
+			{
+				die sprintf("segid is not set in %s{i}{%x}{segid}",
+							$$s{attr}{name}, $k);
+			}
+			$$s{i}{$k}{segoffset} = $$t{i}{$tsegid}{offset};
+		}
+	}
+}
+
+#########################################
+# set_name_prefix - set name prefix string of C struct
+sub set_name_prefix
+{
+	my ($trie, $p) = @_;
+
+	$$trie{name_prefix} = $p;
+}
+
+
+
+
+###############################################
+# write_table - output index table as C-struct
+#
+# write_table(hd, table, tblname, width)
+# returns 1 if the table is written
+#
+# hd      : file handle to write
+# table   : ref to an index table
+# tblname : C symbol name for the table
+# width   : width in characters of this table
+
+sub write_table
+{
+	my($hd, $table, $tblname, $width) = @_;
+
+	return 0 if (! defined $$table{i});
+
+	if ($$table{attr}{chartbl})
+	{
+		&write_chars_table($hd, $table, $tblname, $width);
+	}
+	elsif ($$table{attr}{segmented})
+	{
+		&write_segmented_table($hd, $table, $tblname, $width);
+	}
+	else
+	{
+		&write_flat_table($hd, $table, $tblname, $width);
+	}
+	return 1;
+}
+
+#########################################
+# write_chars_table
+#
+# write_chars_table(hd, table, tblname, width)
+# this is usually called via writ_table
+#
+# hd      : file handle to write
+# table   : ref to an index table
+# tblname : C symbol name for the table
+# tblwidth: width in characters of this table
+
+sub write_chars_table
+{
+	my($hd, $table, $tblname, $width) = @_;
+	my($st, $ed) = ($$table{attr}{min}, $$table{attr}{max});
+	my($type) = $$table{attr}{is32bit} ? "uint32" : "uint16";
+
+	printf(OUT "static const %s %s[] =\n{", $type, $tblname);
+	printf(OUT " /* chars content - index range = [%02x, %02x] */", $st, $ed);
+
+	# values in character table are written in fixedwidth
+	# hexadecimals.  calculate the number of columns in a line. 13 is
+	# the length of line header.
+
+	my $colwidth = $$table{attr}{is32bit} ? 8 : 4;
+	my $colseplen = 4; # the length of  ", 0x"
+	my $headerlength = 13;
+	my $colnum = int(($width - $headerlength)  / ($colwidth + $colseplen));
+
+	# round down to multiples of 4. don't bother by too small table width
+	my $colnum = int($colnum / 4) * 4;
+	my $line = "";
+	my $first0 = 1;
+	# output all segments in segment id order
+	foreach my $k (sort {$a <=> $b} keys $$table{i})
+	{
+		my $s = $$table{i}{$k};
+		if (!$first0)
+		{
+			$line =~ s/\s+$//;		# remove trailing space
+			print $hd $line, ",\n";
+			$line = "";
+		}
+		$first0 = 0;
+
+		# write segment header
+		printf($hd "\n  /*** %4sxx - offset 0x%05x ***/",
+			   $$s{label}, $$s{offset});
+
+		# write segment content
+		my $first1 = 1;
+		my ($segstart, $segend) = ($$s{lower}, $$s{upper});
+		my($xpos, $nocomma) = (0, 0);
+
+		foreach my $j (($segstart - ($segstart % $colnum)) .. $segend)
+		{
+			$line .= "," if (!$first1 && !$nocomma);
+
+			# write the previous line and put a line header for the
+			# new line if this is the first time or this line is full
+			if ($xpos >= $colnum || $first1)
+			{
+				$line =~ s/\s+$//;	# remove trailing space
+				print $hd $line, "\n";
+				$line = sprintf("  /* %02x */ ", $j);
+				$xpos = 0;
+			}
+			else
+			{
+				$line .= " ";
+			}
+			$first1 = 0;
+
+			# write each column
+			if ($j >= $segstart)
+			{
+				$line .= sprintf("0x%0*x", $colwidth, $$s{d}{$j});
+				$nocomma = 0;
+			}
+			else
+			{
+				# adjust column position
+				$line .= " " x ($colwidth + 3);
+				$nocomma = 1;
+			}
+			$xpos++;
+		}
+
+	}
+
+	$line =~ s/\s+$//;
+	print $hd $line, "\n};\n";
+}
+
+######################################################
+# write_flat_table - output nonsegmented index table
+#
+# write_flat_table(hd, table, tblname, width)
+# this is usually called via writ_table
+#
+# hd      : file handle to write
+# table   : ref to an index table
+# tblname : C symbol name for the table
+# width   : width in characters of this table
+
+sub write_flat_table
+{
+	my($hd, $table, $tblname, $width) = @_;
+	my($st, $ed) = ($$table{attr}{min}, $$table{attr}{max});
+
+	print $hd "static const $radix_node_type $tblname =\n{";
+	printf($hd "\n  0x%x, 0x%x, /* table range */\n", $st, $ed);
+	print $hd "  {";
+
+	my $first = 1;
+	my $line = "";
+
+	foreach my $i ($st .. $ed)
+	{
+		$line .= "," if (!$first);
+		my $newitem = sprintf("%d",
+							  defined $$table{i}{$i} ?
+							  $$table{i}{$i}{segoffset} : 0);
+
+		# flush current line and feed a line if the current line
+		# exceeds a limit
+		if ($first || length($line.$newitem) > $width)
+		{
+			$line =~ s/\s+$//;		# remove trailing space
+			print $hd "$line\n";
+			$line = "    ";
+		}
+		else
+		{
+			$line .= " ";
+		}
+		$line .= $newitem;
+		$first = 0;
+	}
+	print $hd $line;
+	print $hd "\n  }\n};\n";
+}
+
+######################################################
+# write_segmented_table - output segmented index table
+#
+# write_segmented_table(hd, table, tblname, width)
+# this is usually called via writ_table
+#
+# hd      : file handle to write
+# table   : ref to an index table
+# tblname : C symbol name for the table
+# width   : width in characters of this table
+
+sub write_segmented_table
+{
+	my($hd, $table, $tblname, $width) = @_;
+	my ($st, $ed) = ($$table{attr}{min}, $$table{attr}{max});
+
+	# write the variable definition
+	print $hd "static const $radix_node_type $tblname =\n{";
+	printf($hd "\n  0x%02x, 0x%02x,		/*index range */\n  {",  $st, $ed);
+
+	my $first0 = 1;
+	foreach my $k (sort {$a <=> $b} keys $$table{i})
+	{
+		print $hd ",\n" if (!$first0);
+		$first0 = 0;
+		printf($hd "\n  /*** %sxxxx - offset 0x%05x ****/",
+			   $$table{i}{$k}{label}, $$table{i}{$k}{offset});
+
+		my $segstart = $$table{i}{$k}{lower};
+		my $segend	 = $$table{i}{$k}{upper};
+
+		my $line = "";
+		my $first1 = 1;
+		my $newitem = "";
+
+		foreach my $j ($segstart .. $segend)
+		{
+			$line .= "," if (!$first1);
+			$newitem = sprintf("%d", $$table{i}{$k}{d}{$j} ?
+							   $$table{i}{$k}{d}{$j}{segoffset} : 0);
+
+			if ($first1 || length($line.$newitem) > $width)
+			{
+				$line =~ s/\s+$//;
+				print OUT "$line\n";
+				$line = sprintf("  /* %2s%02x */ ", $$table{i}{$k}{label}, $j);
+			}
+			else
+			{
+				$line .= " ";
+			}
+			$line .= $newitem;
+			$first1 = 0;
+		}
+		print $hd $line;
+	}
+	print $hd "\n  }\n};\n";
+}
+
+#########################################
+# make_table_refname(table, prefix)
+#
+# internal routine to make C reference notation for tables
+
+sub make_table_refname
+{
+	my ($table, $prefix) = @_;
+
+	return "NULL" if (! defined $$table{i});
+	return "&" . $prefix . $$table{attr}{name};
+}
+
+#########################################
+# write_main_table(hd, tblname, trie, nameprefix)
+#
+# write main radix tree table
+#
+# hd         : file handle to write this table
+# tblname    : variable name of this struct
+# trie       : ref to a radix tree
+# nameprefix : prefix for subtables.
+
+sub write_main_table
+{
+	my ($hd, $tblname, $trie) = @_;
+	my $ctblname = $$trie{name_prefix}.$$trie{csegs}{attr}{name};
+	my ($ctbl16name, $ctbl32name);
+	if ($$trie{csegs}{attr}{is32bit})
+	{
+		$ctbl16name = "NULL";  $ctbl32name = $ctblname;
+	}
+	else
+	{
+		$ctbl16name = $ctblname;  $ctbl32name = "NULL";
+	}
+
+	my $b2iname  = make_table_refname($$trie{b2idx}[0], $$trie{name_prefix});
+	my $b3i1name = make_table_refname($$trie{b3idx}[0], $$trie{name_prefix});
+	my $b3i2name = make_table_refname($$trie{b3idx}[1], $$trie{name_prefix});
+	my $b4i1name = make_table_refname($$trie{b4idx}[0], $$trie{name_prefix});
+	my $b4i2name = make_table_refname($$trie{b4idx}[1], $$trie{name_prefix});
+	my $b4i3name = make_table_refname($$trie{b4idx}[2], $$trie{name_prefix});
+
+	print  $hd "static const $radix_type $tblname =\n{\n";
+	print  $hd "	/* final character table offset and body */\n";
+	printf($hd "	0x%x, 0x%x, %s, %s,\n",
+		   $$trie{csegs}{attr}{min}, $$trie{csegs}{attr}{max},
+		   $ctbl16name, $ctbl32name);
+
+	print  $hd "	/* 2-byte code table */\n";
+	print  $hd "	$b2iname,\n";
+	print  $hd "	/* 3-byte code tables */\n";
+	print  $hd "	{$b3i1name, $b3i2name},\n";
+	print  $hd "	/* 4-byte code table */\n";
+	print  $hd "	{$b4i1name, $b4i2name, $b4i3name},\n";
+	print  $hd "};\n";
+}
+
+#########################################
+# write_map_content - write the whole content of C source of tadix tree
+#
+# write_map_content(this_script, fname, trie, tblname, tblwidth)
+#
+# this_script : the name of the *caller script* of this feature
+# fname       : output file name
+# trie        : ref to a radix tree
+# tblname     : the name of main table of C-data
+# tblwidth    : width in characters of output source file
+
+sub write_map_content
+{
+	my ($this_script, $fname, $trie, $tblname, $tblwidth) = @_;
+
+	open(OUT, "> $fname") || die("cannot open $fname");
+
+	print OUT "/* This file is generated by $this_script */\n\n";
+
+	foreach my $t (@{$$trie{all}})
+	{
+		my $table_name = $$trie{name_prefix}.$$t{attr}{name};
+
+		if (&write_table(*OUT, $t, $table_name, $tblwidth))
+		{
+			print OUT "\n";
+		}
+	}
+
+	&write_main_table(*OUT, $tblname, $trie);
+	close(OUT);
+}
+
+
+
+1;
diff --git a/src/backend/utils/mb/conv.c b/src/backend/utils/mb/conv.c
index d50336b..8658578 100644
--- a/src/backend/utils/mb/conv.c
+++ b/src/backend/utils/mb/conv.c
@@ -364,6 +364,98 @@ store_coded_char(unsigned char *dest, uint32 code)
 }
 
 /*
+ * radix tree conversion function
+ */
+const uint32 pg_mb_radix_conv(const pg_mb_radix_tree *rt, const uint32 c)
+{
+	uint32 off = 0;
+	uint32 b1 = c >> 24;
+	uint32 b2 = (c >> 16) & 0xff;
+	uint32 b3 = (c >> 8) & 0xff;
+	uint32 b4 = c & 0xff;
+
+	if (b1 > 0)
+	{
+		/* 4-byte code */
+		uint32 idx;
+
+		/* check code validity - fist byte */
+		if (rt->b4idx[0] == NULL ||
+			b1 < rt->b4idx[0]->lower || b1 > rt->b4idx[0]->upper)
+			return 0;
+
+		idx = b1 - rt->b4idx[0]->lower;
+		off = rt->b4idx[0]->idx[idx];
+		if (off == 0)
+			return 0;
+
+		/* check code validity - second byte */
+		if (b2 < rt->b4idx[1]->lower || b2 > rt->b4idx[1]->upper)
+			return 0;
+
+		idx = b2 - rt->b4idx[1]->lower;
+		off = (rt->b4idx[1]->idx + off - 1)[idx];
+		if (off == 0)
+			return 0;
+
+		/* check code validity - third byte */
+		if (b3 < rt->b4idx[2]->lower || b3 > rt->b4idx[2]->upper)
+			return 0;
+
+		idx = b3 - rt->b4idx[2]->lower;
+		off = (rt->b4idx[2]->idx + off - 1)[idx];
+	}
+	else if (b2 > 0)
+	{
+		/* 3-byte code */
+
+		uint32 idx;
+
+		/* check code validity - first byte */
+		if (rt->b3idx[0] == NULL ||
+			b2 < rt->b3idx[0]->lower || b2 > rt->b3idx[0]->upper)
+			return 0;
+
+		idx = b2 - rt->b3idx[0]->lower;
+		off = rt->b3idx[0]->idx[idx];
+		if (off == 0)
+			return 0;
+
+		/* check code validity - second byte */
+		if (b3 < rt->b3idx[1]->lower || b3 > rt->b3idx[1]->upper)
+			return 0;
+
+		idx = b3 - rt->b3idx[1]->lower;
+		off = (rt->b3idx[1]->idx + off - 1)[idx];
+	}
+	else if (b3 > 0)
+	{
+		/* 2-byte code */
+		uint32 idx;
+
+		/* check code validity - first byte */
+		if (rt->b2idx == NULL ||
+			b3 < rt->b2idx->lower || b3 > rt->b2idx->upper)
+			return 0;
+
+		idx = b3 - rt->b2idx->lower;
+		off = rt->b2idx->idx[idx];
+	}
+
+	if (off == 0)
+		return 0;
+
+	/* check code validity - last byte */
+	if (b4 < rt->chars_lower || b4 > rt->chars_upper)
+		return 0;
+
+	if (rt->chars32)
+		return (rt->chars32 + off - 1)[b4 - rt->chars_lower];
+	else
+		return (rt->chars16 + off - 1)[b4 - rt->chars_lower];
+}
+
+/*
  * UTF8 ---> local code
  *
  * utf: input string in UTF8 encoding (need not be null-terminated)
@@ -516,13 +608,16 @@ UtfToLocal(const unsigned char *utf, int len,
 		}
 
 		/* Now check ordinary map */
-		p = bsearch(&iutf, map, mapsize,
-					sizeof(pg_utf_to_local), compare1);
-
-		if (p)
+		if (map)
 		{
-			iso = store_coded_char(iso, p->code);
-			continue;
+			p = bsearch(&iutf, map, mapsize,
+						sizeof(pg_utf_to_local), compare1);
+
+			if (p)
+			{
+				iso = store_coded_char(iso, p->code);
+				continue;
+			}
 		}
 
 		/* if there's a conversion function, try that */
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 24e8d0d..5b2094b 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -384,6 +384,26 @@ typedef struct
 } pg_utf_to_local;
 
 /*
+ * radix tree structer for faster conversion
+ */
+typedef struct pg_mb_radix_index
+{
+	uint8	lower, upper;				/* index range of b2idx */
+	uint32	idx[FLEXIBLE_ARRAY_MEMBER];	/* index body */
+} pg_mb_radix_node;
+
+typedef struct
+{
+	const uint8	chars_lower, chars_upper;	/* index range of chars* */
+	const uint16 *chars16;					/* 16 bit character table */
+	const uint32 *chars32;					/* 32 bit character table */
+
+	const pg_mb_radix_node *b2idx;
+	const pg_mb_radix_node *b3idx[2];
+	const pg_mb_radix_node *b4idx[3];
+} pg_mb_radix_tree;
+
+/*
  * local code to UTF-8 conversion map
  */
 typedef struct
@@ -551,6 +571,7 @@ extern void latin2mic_with_table(const unsigned char *l, unsigned char *p,
 extern void mic2latin_with_table(const unsigned char *mic, unsigned char *p,
 					 int len, int lc, int encoding,
 					 const unsigned char *tab);
+extern const uint32 pg_mb_radix_conv(const pg_mb_radix_tree *rt, const uint32 c);
 
 extern bool pg_utf8_islegal(const unsigned char *source, int length);
 
-- 
2.9.2

