From 9c61aa3604af862a8c8217eee8d268b80ae06a2d Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Mon, 14 Dec 2020 18:28:45 +0200
Subject: [PATCH 1/5] Add new mbverifystr() function for each encoding.

This potentially makes the pg_verify_mbstr() function faster by allowing
more efficient encoding-specific implementations. All of the
implementations in this patch are pretty naive, though.
---
 src/backend/commands/extension.c              |   2 +-
 src/backend/utils/mb/conv.c                   |   2 +-
 .../euc2004_sjis2004/euc2004_sjis2004.c       |   4 +-
 .../euc_jp_and_sjis/euc_jp_and_sjis.c         |  10 +-
 .../euc_kr_and_mic/euc_kr_and_mic.c           |   4 +-
 .../euc_tw_and_big5/euc_tw_and_big5.c         |   8 +-
 src/backend/utils/mb/mbutils.c                |  31 +-
 src/common/wchar.c                            | 514 +++++++++++++++---
 src/include/mb/pg_wchar.h                     |  10 +-
 9 files changed, 491 insertions(+), 94 deletions(-)

diff --git a/src/backend/commands/extension.c b/src/backend/commands/extension.c
index b5630b4c8d9..82f1248dbf1 100644
--- a/src/backend/commands/extension.c
+++ b/src/backend/commands/extension.c
@@ -682,7 +682,7 @@ read_extension_script_file(const ExtensionControlFile *control,
 		src_encoding = control->encoding;
 
 	/* make sure that source string is valid in the expected encoding */
-	pg_verify_mbstr_len(src_encoding, src_str, len, false);
+	(void) pg_verify_mbstr(src_encoding, src_str, len, false);
 
 	/*
 	 * Convert the encoding to the database encoding. read_whole_file
diff --git a/src/backend/utils/mb/conv.c b/src/backend/utils/mb/conv.c
index 54dcf71fb75..192948caad2 100644
--- a/src/backend/utils/mb/conv.c
+++ b/src/backend/utils/mb/conv.c
@@ -653,7 +653,7 @@ LocalToUtf(const unsigned char *iso, int len,
 			continue;
 		}
 
-		l = pg_encoding_verifymb(encoding, (const char *) iso, len);
+		l = pg_encoding_verifymbchar(encoding, (const char *) iso, len);
 		if (l < 0)
 			break;
 
diff --git a/src/backend/utils/mb/conversion_procs/euc2004_sjis2004/euc2004_sjis2004.c b/src/backend/utils/mb/conversion_procs/euc2004_sjis2004/euc2004_sjis2004.c
index 9ba6bd30405..3628e690aa1 100644
--- a/src/backend/utils/mb/conversion_procs/euc2004_sjis2004/euc2004_sjis2004.c
+++ b/src/backend/utils/mb/conversion_procs/euc2004_sjis2004/euc2004_sjis2004.c
@@ -87,7 +87,7 @@ euc_jis_20042shift_jis_2004(const unsigned char *euc, unsigned char *p, int len)
 			continue;
 		}
 
-		l = pg_encoding_verifymb(PG_EUC_JIS_2004, (const char *) euc, len);
+		l = pg_encoding_verifymbchar(PG_EUC_JIS_2004, (const char *) euc, len);
 
 		if (l < 0)
 			report_invalid_encoding(PG_EUC_JIS_2004,
@@ -238,7 +238,7 @@ shift_jis_20042euc_jis_2004(const unsigned char *sjis, unsigned char *p, int len
 			continue;
 		}
 
-		l = pg_encoding_verifymb(PG_SHIFT_JIS_2004, (const char *) sjis, len);
+		l = pg_encoding_verifymbchar(PG_SHIFT_JIS_2004, (const char *) sjis, len);
 
 		if (l < 0 || l > len)
 			report_invalid_encoding(PG_SHIFT_JIS_2004,
diff --git a/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c b/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c
index 4ca8e2126e4..ea05436596d 100644
--- a/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c
+++ b/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c
@@ -291,7 +291,7 @@ mic2sjis(const unsigned char *mic, unsigned char *p, int len)
 			len--;
 			continue;
 		}
-		l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
+		l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len);
 		if (l < 0)
 			report_invalid_encoding(PG_MULE_INTERNAL,
 									(const char *) mic, len);
@@ -381,7 +381,7 @@ euc_jp2mic(const unsigned char *euc, unsigned char *p, int len)
 			len--;
 			continue;
 		}
-		l = pg_encoding_verifymb(PG_EUC_JP, (const char *) euc, len);
+		l = pg_encoding_verifymbchar(PG_EUC_JP, (const char *) euc, len);
 		if (l < 0)
 			report_invalid_encoding(PG_EUC_JP,
 									(const char *) euc, len);
@@ -431,7 +431,7 @@ mic2euc_jp(const unsigned char *mic, unsigned char *p, int len)
 			len--;
 			continue;
 		}
-		l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
+		l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len);
 		if (l < 0)
 			report_invalid_encoding(PG_MULE_INTERNAL,
 									(const char *) mic, len);
@@ -485,7 +485,7 @@ euc_jp2sjis(const unsigned char *euc, unsigned char *p, int len)
 			len--;
 			continue;
 		}
-		l = pg_encoding_verifymb(PG_EUC_JP, (const char *) euc, len);
+		l = pg_encoding_verifymbchar(PG_EUC_JP, (const char *) euc, len);
 		if (l < 0)
 			report_invalid_encoding(PG_EUC_JP,
 									(const char *) euc, len);
@@ -580,7 +580,7 @@ sjis2euc_jp(const unsigned char *sjis, unsigned char *p, int len)
 			len--;
 			continue;
 		}
-		l = pg_encoding_verifymb(PG_SJIS, (const char *) sjis, len);
+		l = pg_encoding_verifymbchar(PG_SJIS, (const char *) sjis, len);
 		if (l < 0)
 			report_invalid_encoding(PG_SJIS,
 									(const char *) sjis, len);
diff --git a/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c b/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c
index 4d7876a666e..600c5cbc5cd 100644
--- a/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c
+++ b/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c
@@ -76,7 +76,7 @@ euc_kr2mic(const unsigned char *euc, unsigned char *p, int len)
 		c1 = *euc;
 		if (IS_HIGHBIT_SET(c1))
 		{
-			l = pg_encoding_verifymb(PG_EUC_KR, (const char *) euc, len);
+			l = pg_encoding_verifymbchar(PG_EUC_KR, (const char *) euc, len);
 			if (l != 2)
 				report_invalid_encoding(PG_EUC_KR,
 										(const char *) euc, len);
@@ -122,7 +122,7 @@ mic2euc_kr(const unsigned char *mic, unsigned char *p, int len)
 			len--;
 			continue;
 		}
-		l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
+		l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len);
 		if (l < 0)
 			report_invalid_encoding(PG_MULE_INTERNAL,
 									(const char *) mic, len);
diff --git a/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c b/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c
index 82a22b9bebf..7e4c2697b07 100644
--- a/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c
+++ b/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c
@@ -148,7 +148,7 @@ euc_tw2mic(const unsigned char *euc, unsigned char *p, int len)
 		c1 = *euc;
 		if (IS_HIGHBIT_SET(c1))
 		{
-			l = pg_encoding_verifymb(PG_EUC_TW, (const char *) euc, len);
+			l = pg_encoding_verifymbchar(PG_EUC_TW, (const char *) euc, len);
 			if (l < 0)
 				report_invalid_encoding(PG_EUC_TW,
 										(const char *) euc, len);
@@ -213,7 +213,7 @@ mic2euc_tw(const unsigned char *mic, unsigned char *p, int len)
 			len--;
 			continue;
 		}
-		l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
+		l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len);
 		if (l < 0)
 			report_invalid_encoding(PG_MULE_INTERNAL,
 									(const char *) mic, len);
@@ -272,7 +272,7 @@ big52mic(const unsigned char *big5, unsigned char *p, int len)
 			len--;
 			continue;
 		}
-		l = pg_encoding_verifymb(PG_BIG5, (const char *) big5, len);
+		l = pg_encoding_verifymbchar(PG_BIG5, (const char *) big5, len);
 		if (l < 0)
 			report_invalid_encoding(PG_BIG5,
 									(const char *) big5, len);
@@ -321,7 +321,7 @@ mic2big5(const unsigned char *mic, unsigned char *p, int len)
 			len--;
 			continue;
 		}
-		l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
+		l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len);
 		if (l < 0)
 			report_invalid_encoding(PG_MULE_INTERNAL,
 									(const char *) mic, len);
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c
index a8e13cacfde..67d1c4fc19f 100644
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -519,7 +519,7 @@ pg_convert(PG_FUNCTION_ARGS)
 	/* make sure that source string is valid */
 	len = VARSIZE_ANY_EXHDR(string);
 	src_str = VARDATA_ANY(string);
-	pg_verify_mbstr_len(src_encoding, src_str, len, false);
+	(void) pg_verify_mbstr(src_encoding, src_str, len, false);
 
 	/* perform conversion */
 	dest_str = (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, src_str),
@@ -1215,10 +1215,10 @@ static bool
 pg_generic_charinc(unsigned char *charptr, int len)
 {
 	unsigned char *lastbyte = charptr + len - 1;
-	mbverifier	mbverify;
+	mbchar_verifier mbverify;
 
 	/* We can just invoke the character verifier directly. */
-	mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
+	mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverifychar;
 
 	while (*lastbyte < (unsigned char) 255)
 	{
@@ -1445,8 +1445,7 @@ pg_database_encoding_max_length(void)
 bool
 pg_verifymbstr(const char *mbstr, int len, bool noError)
 {
-	return
-		pg_verify_mbstr_len(GetDatabaseEncoding(), mbstr, len, noError) >= 0;
+	return pg_verify_mbstr(GetDatabaseEncoding(), mbstr, len, noError);
 }
 
 /*
@@ -1456,7 +1455,18 @@ pg_verifymbstr(const char *mbstr, int len, bool noError)
 bool
 pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
 {
-	return pg_verify_mbstr_len(encoding, mbstr, len, noError) >= 0;
+	int			oklen;
+
+	Assert(PG_VALID_ENCODING(encoding));
+
+	oklen = pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len);
+	if (oklen != len)
+	{
+		if (noError)
+			return false;
+		report_invalid_encoding(encoding, mbstr + oklen, len - oklen);
+	}
+	return true;
 }
 
 /*
@@ -1469,11 +1479,14 @@ pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
  * If OK, return length of string in the encoding.
  * If a problem is found, return -1 when noError is
  * true; when noError is false, ereport() a descriptive message.
+ *
+ * Note: We cannot use the faster encoding-specific mbverifystr() function
+ * here, because we need to count the number of characters in the string.
  */
 int
 pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
 {
-	mbverifier	mbverify;
+	mbchar_verifier	mbverifychar;
 	int			mb_len;
 
 	Assert(PG_VALID_ENCODING(encoding));
@@ -1493,7 +1506,7 @@ pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
 	}
 
 	/* fetch function pointer just once */
-	mbverify = pg_wchar_table[encoding].mbverify;
+	mbverifychar = pg_wchar_table[encoding].mbverifychar;
 
 	mb_len = 0;
 
@@ -1516,7 +1529,7 @@ pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
 			report_invalid_encoding(encoding, mbstr, len);
 		}
 
-		l = (*mbverify) ((const unsigned char *) mbstr, len);
+		l = (*mbverifychar) ((const unsigned char *) mbstr, len);
 
 		if (l < 0)
 		{
diff --git a/src/common/wchar.c b/src/common/wchar.c
index efaf1c155bb..5ab29bcbc39 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -19,7 +19,7 @@
  * Operations on multi-byte encodings are driven by a table of helper
  * functions.
  *
- * To add an encoding support, define mblen(), dsplen() and verifier() for
+ * To add an encoding support, define mblen(), dsplen(), verifychar() and verifystr() for
  * the encoding.  For server-encodings, also define mb2wchar() and wchar2mb()
  * conversion functions.
  *
@@ -1087,29 +1087,47 @@ pg_gb18030_dsplen(const unsigned char *s)
  *-------------------------------------------------------------------
  * multibyte sequence validators
  *
- * These functions accept "s", a pointer to the first byte of a string,
+ * The verifychar functions accept "s", a pointer to the first byte of a string,
  * and "len", the remaining length of the string.  If there is a validly
  * encoded character beginning at *s, return its length in bytes; else
  * return -1.
  *
- * The functions can assume that len > 0 and that *s != '\0', but they must
+ * The verifychar functions can assume that len > 0 and that *s != '\0', but they must
  * test for and reject zeroes in any additional bytes of a multibyte character.
- *
  * Note that this definition allows the function for a single-byte
  * encoding to be just "return 1".
+ *
+ * The verifystr functions also accept "s", a pointer to a string, and "len",
+ * the length of the string. They try to verify the whole string, and return
+ * the number of input bytes (<= len) that are valid. If there is an encoding
+ * error, the return value is < len, and it is the offset of the first invalid
+ * byte.
+ *
+ * The verifystr functions must test for and reject zeroes in the input.
  *-------------------------------------------------------------------
  */
-
 static int
-pg_ascii_verifier(const unsigned char *s, int len)
+pg_ascii_verifychar(const unsigned char *s, int len)
 {
 	return 1;
 }
 
+static int
+pg_ascii_verifystr(const unsigned char *s, int len)
+{
+	const unsigned char *nullpos = memchr(s, 0, len);
+
+	/* Only a zero byte is invalid: return len, or the first zero's offset. */
+	if (nullpos == NULL)
+		return len;
+	else
+		return nullpos - s;
+}
+
 #define IS_EUC_RANGE_VALID(c)	((c) >= 0xa1 && (c) <= 0xfe)
 
 static int
-pg_eucjp_verifier(const unsigned char *s, int len)
+pg_eucjp_verifychar(const unsigned char *s, int len)
 {
 	int			l;
 	unsigned char c1,
@@ -1164,7 +1182,36 @@ pg_eucjp_verifier(const unsigned char *s, int len)
 }
 
 static int
-pg_euckr_verifier(const unsigned char *s, int len)
+pg_eucjp_verifystr(const unsigned char *s, int len)
+{
+	const unsigned char *start = s;
+
+	while (len > 0)
+	{
+		int			l;
+
+		/* fast path for ASCII-subset characters */
+		if (!IS_HIGHBIT_SET(*s))
+		{
+			if (*s == '\0')
+				break;
+			l = 1;
+		}
+		else
+		{
+			l = pg_eucjp_verifychar(s, len);
+			if (l == -1)
+				break;
+		}
+		s += l;
+		len -= l;
+	}
+
+	return s - start;
+}
+
+static int
+pg_euckr_verifychar(const unsigned char *s, int len)
 {
 	int			l;
 	unsigned char c1,
@@ -1192,11 +1239,41 @@ pg_euckr_verifier(const unsigned char *s, int len)
 	return l;
 }
 
+static int
+pg_euckr_verifystr(const unsigned char *s, int len)
+{
+	const unsigned char *start = s;
+
+	while (len > 0)
+	{
+		int			l;
+
+		/* fast path for ASCII-subset characters */
+		if (!IS_HIGHBIT_SET(*s))
+		{
+			if (*s == '\0')
+				break;
+			l = 1;
+		}
+		else
+		{
+			l = pg_euckr_verifychar(s, len);
+			if (l == -1)
+				break;
+		}
+		s += l;
+		len -= l;
+	}
+
+	return s - start;
+}
+
 /* EUC-CN byte sequences are exactly same as EUC-KR */
-#define pg_euccn_verifier	pg_euckr_verifier
+#define pg_euccn_verifychar	pg_euckr_verifychar
+#define pg_euccn_verifystr	pg_euckr_verifystr
 
 static int
-pg_euctw_verifier(const unsigned char *s, int len)
+pg_euctw_verifychar(const unsigned char *s, int len)
 {
 	int			l;
 	unsigned char c1,
@@ -1246,7 +1323,36 @@ pg_euctw_verifier(const unsigned char *s, int len)
 }
 
 static int
-pg_johab_verifier(const unsigned char *s, int len)
+pg_euctw_verifystr(const unsigned char *s, int len)
+{
+	const unsigned char *start = s;
+
+	while (len > 0)
+	{
+		int			l;
+
+		/* fast path for ASCII-subset characters */
+		if (!IS_HIGHBIT_SET(*s))
+		{
+			if (*s == '\0')
+				break;
+			l = 1;
+		}
+		else
+		{
+			l = pg_euctw_verifychar(s, len);
+			if (l == -1)
+				break;
+		}
+		s += l;
+		len -= l;
+	}
+
+	return s - start;
+}
+
+static int
+pg_johab_verifychar(const unsigned char *s, int len)
 {
 	int			l,
 				mbl;
@@ -1270,7 +1376,36 @@ pg_johab_verifier(const unsigned char *s, int len)
 }
 
 static int
-pg_mule_verifier(const unsigned char *s, int len)
+pg_johab_verifystr(const unsigned char *s, int len)
+{
+	const unsigned char *start = s;
+
+	while (len > 0)
+	{
+		int			l;
+
+		/* fast path for ASCII-subset characters */
+		if (!IS_HIGHBIT_SET(*s))
+		{
+			if (*s == '\0')
+				break;
+			l = 1;
+		}
+		else
+		{
+			l = pg_johab_verifychar(s, len);
+			if (l == -1)
+				break;
+		}
+		s += l;
+		len -= l;
+	}
+
+	return s - start;
+}
+
+static int
+pg_mule_verifychar(const unsigned char *s, int len)
 {
 	int			l,
 				mbl;
@@ -1291,13 +1426,54 @@ pg_mule_verifier(const unsigned char *s, int len)
 }
 
 static int
-pg_latin1_verifier(const unsigned char *s, int len)
+pg_mule_verifystr(const unsigned char *s, int len)
+{
+	const unsigned char *start = s;
+
+	while (len > 0)
+	{
+		int			l;
+
+		/* fast path for ASCII-subset characters */
+		if (!IS_HIGHBIT_SET(*s))
+		{
+			if (*s == '\0')
+				break;
+			l = 1;
+		}
+		else
+		{
+			l = pg_mule_verifychar(s, len);
+			if (l == -1)
+				break;
+		}
+		s += l;
+		len -= l;
+	}
+
+	return s - start;
+}
+
+static int
+pg_latin1_verifychar(const unsigned char *s, int len)
 {
 	return 1;
 }
 
 static int
-pg_sjis_verifier(const unsigned char *s, int len)
+pg_latin1_verifystr(const unsigned char *s, int len)
+{
+	const unsigned char *nullpos = memchr(s, 0, len);
+
+	/* Only a zero byte is invalid: return len, or the first zero's offset. */
+	if (nullpos == NULL)
+		return len;
+	else
+		return nullpos - s;
+}
+
+static int
+pg_sjis_verifychar(const unsigned char *s, int len)
 {
 	int			l,
 				mbl;
@@ -1320,7 +1496,36 @@ pg_sjis_verifier(const unsigned char *s, int len)
 }
 
 static int
-pg_big5_verifier(const unsigned char *s, int len)
+pg_sjis_verifystr(const unsigned char *s, int len)
+{
+	const unsigned char *start = s;
+
+	while (len > 0)
+	{
+		int			l;
+
+		/* fast path for ASCII-subset characters */
+		if (!IS_HIGHBIT_SET(*s))
+		{
+			if (*s == '\0')
+				break;
+			l = 1;
+		}
+		else
+		{
+			l = pg_sjis_verifychar(s, len);
+			if (l == -1)
+				break;
+		}
+		s += l;
+		len -= l;
+	}
+
+	return s - start;
+}
+
+static int
+pg_big5_verifychar(const unsigned char *s, int len)
 {
 	int			l,
 				mbl;
@@ -1340,7 +1545,36 @@ pg_big5_verifier(const unsigned char *s, int len)
 }
 
 static int
-pg_gbk_verifier(const unsigned char *s, int len)
+pg_big5_verifystr(const unsigned char *s, int len)
+{
+	const unsigned char *start = s;
+
+	while (len > 0)
+	{
+		int			l;
+
+		/* fast path for ASCII-subset characters */
+		if (!IS_HIGHBIT_SET(*s))
+		{
+			if (*s == '\0')
+				break;
+			l = 1;
+		}
+		else
+		{
+			l = pg_big5_verifychar(s, len);
+			if (l == -1)
+				break;
+		}
+		s += l;
+		len -= l;
+	}
+
+	return s - start;
+}
+
+static int
+pg_gbk_verifychar(const unsigned char *s, int len)
 {
 	int			l,
 				mbl;
@@ -1360,7 +1594,36 @@ pg_gbk_verifier(const unsigned char *s, int len)
 }
 
 static int
-pg_uhc_verifier(const unsigned char *s, int len)
+pg_gbk_verifystr(const unsigned char *s, int len)
+{
+	const unsigned char *start = s;
+
+	while (len > 0)
+	{
+		int			l;
+
+		/* fast path for ASCII-subset characters */
+		if (!IS_HIGHBIT_SET(*s))
+		{
+			if (*s == '\0')
+				break;
+			l = 1;
+		}
+		else
+		{
+			l = pg_gbk_verifychar(s, len);
+			if (l == -1)
+				break;
+		}
+		s += l;
+		len -= l;
+	}
+
+	return s - start;
+}
+
+static int
+pg_uhc_verifychar(const unsigned char *s, int len)
 {
 	int			l,
 				mbl;
@@ -1380,7 +1643,36 @@ pg_uhc_verifier(const unsigned char *s, int len)
 }
 
 static int
-pg_gb18030_verifier(const unsigned char *s, int len)
+pg_uhc_verifystr(const unsigned char *s, int len)
+{
+	const unsigned char *start = s;
+
+	while (len > 0)
+	{
+		int			l;
+
+		/* fast path for ASCII-subset characters */
+		if (!IS_HIGHBIT_SET(*s))
+		{
+			if (*s == '\0')
+				break;
+			l = 1;
+		}
+		else
+		{
+			l = pg_uhc_verifychar(s, len);
+			if (l == -1)
+				break;
+		}
+		s += l;
+		len -= l;
+	}
+
+	return s - start;
+}
+
+static int
+pg_gb18030_verifychar(const unsigned char *s, int len)
 {
 	int			l;
 
@@ -1411,11 +1703,55 @@ pg_gb18030_verifier(const unsigned char *s, int len)
 }
 
 static int
-pg_utf8_verifier(const unsigned char *s, int len)
+pg_gb18030_verifystr(const unsigned char *s, int len)
 {
-	int			l = pg_utf_mblen(s);
+	const unsigned char *start = s;
 
-	if (len < l)
+	while (len > 0)
+	{
+		int			l;
+
+		/* fast path for ASCII-subset characters */
+		if (!IS_HIGHBIT_SET(*s))
+		{
+			if (*s == '\0')
+				break;
+			l = 1;
+		}
+		else
+		{
+			l = pg_gb18030_verifychar(s, len);
+			if (l == -1)
+				break;
+		}
+		s += l;
+		len -= l;
+	}
+
+	return s - start;
+}
+
+static int
+pg_utf8_verifychar(const unsigned char *s, int len)
+{
+	int			l;
+
+	if ((*s & 0x80) == 0)
+	{
+		if (*s == '\0')
+			return -1;
+		return 1;
+	}
+	else if ((*s & 0xe0) == 0xc0)
+		l = 2;
+	else if ((*s & 0xf0) == 0xe0)
+		l = 3;
+	else if ((*s & 0xf8) == 0xf0)
+		l = 4;
+	else
+		l = 1;
+
+	if (l > len)
 		return -1;
 
 	if (!pg_utf8_islegal(s, l))
@@ -1424,6 +1760,35 @@ pg_utf8_verifier(const unsigned char *s, int len)
 	return l;
 }
 
+static int
+pg_utf8_verifystr(const unsigned char *s, int len)
+{
+	const unsigned char *start = s;
+
+	while (len > 0)
+	{
+		int			l;
+
+		/* fast path for ASCII-subset characters */
+		if (!IS_HIGHBIT_SET(*s))
+		{
+			if (*s == '\0')
+				break;
+			l = 1;
+		}
+		else
+		{
+			l = pg_utf8_verifychar(s, len);
+			if (l == -1)
+				break;
+		}
+		s += l;
+		len -= l;
+	}
+
+	return s - start;
+}
+
 /*
  * Check for validity of a single UTF-8 encoded character
  *
@@ -1503,48 +1868,48 @@ pg_utf8_islegal(const unsigned char *source, int length)
  *-------------------------------------------------------------------
  */
 const pg_wchar_tbl pg_wchar_table[] = {
-	{pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* PG_SQL_ASCII */
-	{pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3},	/* PG_EUC_JP */
-	{pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2},	/* PG_EUC_CN */
-	{pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3},	/* PG_EUC_KR */
-	{pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 4},	/* PG_EUC_TW */
-	{pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3},	/* PG_EUC_JIS_2004 */
-	{pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4},	/* PG_UTF8 */
-	{pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 4},	/* PG_MULE_INTERNAL */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN1 */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN2 */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN3 */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN4 */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN5 */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN6 */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN7 */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN8 */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN9 */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN10 */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1256 */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1258 */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN866 */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN874 */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8R */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1251 */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1252 */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-5 */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-6 */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-7 */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-8 */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1250 */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1253 */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1254 */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1255 */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1257 */
-	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8U */
-	{0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* PG_SJIS */
-	{0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* PG_BIG5 */
-	{0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2},	/* PG_GBK */
-	{0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2},	/* PG_UHC */
-	{0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4},	/* PG_GB18030 */
-	{0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3},	/* PG_JOHAB */
-	{0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}	/* PG_SHIFT_JIS_2004 */
+	{pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1}, /* PG_SQL_ASCII */
+	{pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},	/* PG_EUC_JP */
+	{pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2},	/* PG_EUC_CN */
+	{pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},	/* PG_EUC_KR */
+	{pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},	/* PG_EUC_TW */
+	{pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},	/* PG_EUC_JIS_2004 */
+	{pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},	/* PG_UTF8 */
+	{pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4},	/* PG_MULE_INTERNAL */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN1 */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN2 */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN3 */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN4 */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN5 */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN6 */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN7 */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN8 */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN9 */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN10 */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1256 */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1258 */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN866 */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN874 */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_KOI8R */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1251 */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1252 */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* ISO-8859-5 */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* ISO-8859-6 */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* ISO-8859-7 */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* ISO-8859-8 */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1250 */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1253 */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1254 */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1255 */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1257 */
+	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_KOI8U */
+	{0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2}, /* PG_SJIS */
+	{0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2}, /* PG_BIG5 */
+	{0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},	/* PG_GBK */
+	{0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},	/* PG_UHC */
+	{0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},	/* PG_GB18030 */
+	{0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},	/* PG_JOHAB */
+	{0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2}	/* PG_SHIFT_JIS_2004 */
 };
 
 /*
@@ -1572,14 +1937,29 @@ pg_encoding_dsplen(int encoding, const char *mbstr)
 /*
  * Verify the first multibyte character of the given string.
  * Return its byte length if good, -1 if bad.  (See comments above for
- * full details of the mbverify API.)
+ * full details of the mbverifychar API.)
+ */
+int
+pg_encoding_verifymbchar(int encoding, const char *mbchar, int len)
+{
+	return (PG_VALID_ENCODING(encoding) ?
+			pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbchar, len) :
+			pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbchar, len));
+}
+
+/*
+ * Verify that a string is valid for the given encoding.
+ *
+ * Returns the number of input bytes (<= len) that form a valid string. If
+ * it equals 'len', the whole input is valid. Otherwise it is the index of
+ * the first invalid input byte.
  */
 int
-pg_encoding_verifymb(int encoding, const char *mbstr, int len)
+pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
 {
 	return (PG_VALID_ENCODING(encoding) ?
-			pg_wchar_table[encoding].mbverify((const unsigned char *) mbstr, len) :
-			pg_wchar_table[PG_SQL_ASCII].mbverify((const unsigned char *) mbstr, len));
+			pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
+			pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
 }
 
 /*
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 494aefc7fab..549f2dd045d 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -371,7 +371,9 @@ typedef int (*mbdisplaylen_converter) (const unsigned char *mbstr);
 
 typedef bool (*mbcharacter_incrementer) (unsigned char *mbstr, int len);
 
-typedef int (*mbverifier) (const unsigned char *mbstr, int len);
+typedef int (*mbchar_verifier) (const unsigned char *mbstr, int len);
+
+typedef int (*mbstr_verifier) (const unsigned char *mbstr, int len);
 
 typedef struct
 {
@@ -381,7 +383,8 @@ typedef struct
 													 * to a multibyte */
 	mblen_converter mblen;		/* get byte length of a char */
 	mbdisplaylen_converter dsplen;	/* get display width of a char */
-	mbverifier	mbverify;		/* verify multibyte sequence */
+	mbchar_verifier	mbverifychar;	/* verify multibyte character */
+	mbstr_verifier	mbverifystr;	/* verify multibyte string */
 	int			maxmblen;		/* max bytes for a char in this encoding */
 } pg_wchar_tbl;
 
@@ -554,7 +557,8 @@ extern int	pg_valid_server_encoding_id(int encoding);
  */
 extern int	pg_encoding_mblen(int encoding, const char *mbstr);
 extern int	pg_encoding_dsplen(int encoding, const char *mbstr);
-extern int	pg_encoding_verifymb(int encoding, const char *mbstr, int len);
+extern int	pg_encoding_verifymbchar(int encoding, const char *mbchar, int len);
+extern int	pg_encoding_verifymbstr(int encoding, const char *mbstr, int len);
 extern int	pg_encoding_max_length(int encoding);
 extern int	pg_valid_client_encoding(const char *name);
 extern int	pg_valid_server_encoding(const char *name);
-- 
2.20.1

