[POC] verifying UTF-8 using SIMD instructions

Started by John Naylor, about 5 years ago, 80 messages, pgsql-hackers
#1 John Naylor
john.naylor@enterprisedb.com

Hi,

As of b80e10638e3, there is a new API for validating the encoding of
strings, and one of the side effects is that we have a wider choice of
algorithms. For UTF-8, it has been demonstrated that SIMD is much faster at
decoding [1] and validation [2] than the standard approach we use.

It makes sense to start with the ascii subset of UTF-8 for a couple
reasons. First, ascii is very widespread in database content, particularly
in bulk loads. Second, ascii can be validated using the simple SSE2
intrinsics that come with (I believe) any x86-64 chip, and I'm guessing we
can detect that at compile time and not mess with runtime checks. The
examples above using SSE for the general case are much more complicated and
involve SSE 4.2 or AVX.
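The SSE2-only ASCII check described above can be sketched in a few lines (a hypothetical helper, not the actual patch): the high bit of every byte is zero exactly when the chunk is pure ASCII, and _mm_movemask_epi8 gathers those high bits in a single instruction.

```c
#include <emmintrin.h>          /* SSE2 intrinsics */
#include <stdbool.h>

/* Hypothetical sketch, not from the patch: true if all 16 bytes
 * starting at s are ASCII.  _mm_movemask_epi8 collects the high bit
 * of each byte; any set bit indicates a non-ASCII byte. */
static bool
chunk_is_ascii_sse2(const unsigned char *s)
{
    __m128i chunk = _mm_loadu_si128((const __m128i *) s);

    return _mm_movemask_epi8(chunk) == 0;
}
```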

Here are some numbers on my laptop (MacOS/clang 10 -- if the concept is
okay, I'll do Linux/gcc and add more inputs). The test is the same as
Heikki shared in [3], but I added a case with >95% Chinese characters just
to show how that compares to the mixed ascii/multibyte case.

master:

 chinese | mixed | ascii
---------+-------+-------
    1081 |   761 |   366

patch:

 chinese | mixed | ascii
---------+-------+-------
    1103 |   498 |    51

The speedup in the pure ascii case is nice.

In the attached POC, I just have a pro forma portability stub, and left
full portability detection for later. The fast path is inlined inside
pg_utf8_verifystr(). I imagine the ascii fast path could be abstracted into
a separate function to which is passed a function pointer for full encoding
validation. That would allow other encodings with strict ascii subsets to
use this as well, but coding that abstraction might be a little messy, and
b80e10638e3 already gives a performance boost over PG13.

I also gave a shot at doing full UTF-8 recognition using a DFA, but so far
that has made performance worse. If I ever have more success with that,
I'll add that in the mix.

[1]: https://woboq.com/blog/utf-8-processing-using-simd.html
[2]: https://lemire.me/blog/2020/10/20/ridiculously-fast-unicode-utf-8-validation/
[3]: /messages/by-id/06d45421-61b8-86dd-e765-f1ce527a5a2f@iki.fi

--
John Naylor
EDB: http://www.enterprisedb.com

Attachments:

v1-verify-utf8-sse-ascii.patch (application/x-patch, +81/-2)
#2 Heikki Linnakangas
heikki.linnakangas@enterprisedb.com
In reply to: John Naylor (#1)
Re: [POC] verifying UTF-8 using SIMD instructions

On 01/02/2021 19:32, John Naylor wrote:

It makes sense to start with the ascii subset of UTF-8 for a couple
reasons. First, ascii is very widespread in database content,
particularly in bulk loads. Second, ascii can be validated using the
simple SSE2 intrinsics that come with (I believe) any x86-64 chip, and
I'm guessing we can detect that at compile time and not mess with
runtime checks. The examples above using SSE for the general case are
much more complicated and involve SSE 4.2 or AVX.

I wonder how using SSE compares with dealing with 64 or 32-bit words at
a time, using regular instructions? That would be more portable.

Here are some numbers on my laptop (MacOS/clang 10 -- if the concept is
okay, I'll do Linux/gcc and add more inputs). The test is the same as
Heikki shared in [3], but I added a case with >95% Chinese characters
just to show how that compares to the mixed ascii/multibyte case.

master:

 chinese | mixed | ascii
---------+-------+-------
    1081 |   761 |   366

patch:

 chinese | mixed | ascii
---------+-------+-------
    1103 |   498 |    51

The speedup in the pure ascii case is nice.

Yep.

In the attached POC, I just have a pro forma portability stub, and left
full portability detection for later. The fast path is inlined inside
pg_utf8_verifystr(). I imagine the ascii fast path could be abstracted
into a separate function to which is passed a function pointer for full
encoding validation. That would allow other encodings with strict ascii
subsets to use this as well, but coding that abstraction might be a
little messy, and b80e10638e3 already gives a performance boost over PG13.

All supported encodings are ASCII subsets. Might be best to put the
ASCII-check into a static inline function and use it in all the verify
functions. I presume it's only a few instructions, and these functions
can be pretty performance sensitive.

I also gave a shot at doing full UTF-8 recognition using a DFA, but so
far that has made performance worse. If I ever have more success with
that, I'll add that in the mix.

That's disappointing. Perhaps the SIMD algorithms have higher startup
costs, so that you need longer inputs to benefit? In that case, it might
make sense to check the length of the input and only use the SIMD
algorithm if the input is long enough.

- Heikki

#3 John Naylor
john.naylor@enterprisedb.com
In reply to: Heikki Linnakangas (#2)
Re: [POC] verifying UTF-8 using SIMD instructions

On Mon, Feb 1, 2021 at 2:01 PM Heikki Linnakangas <hlinnaka@iki.fi> wrote:

On 01/02/2021 19:32, John Naylor wrote:

It makes sense to start with the ascii subset of UTF-8 for a couple
reasons. First, ascii is very widespread in database content,
particularly in bulk loads. Second, ascii can be validated using the
simple SSE2 intrinsics that come with (I believe) any x86-64 chip, and
I'm guessing we can detect that at compile time and not mess with
runtime checks. The examples above using SSE for the general case are
much more complicated and involve SSE 4.2 or AVX.

I wonder how using SSE compares with dealing with 64 or 32-bit words at
a time, using regular instructions? That would be more portable.

I gave that a shot, and it's actually pretty good. According to this paper
[1], the same technique also works with ordinary 64-bit registers, so I
tried both 16 and 8 bytes.

All supported encodings are ASCII subsets. Might be best to put the
ASCII-check into a static inline function and use it in all the verify
functions. I presume it's only a few instructions, and these functions
can be pretty performance sensitive.

I tried both the static inline function and also putting the whole
optimized utf-8 loop in a separate function to which the caller passes a
pointer to the appropriate pg_*_verifychar().

In the table below, "inline" refers to coding directly inside
pg_utf8_verifystr(). Both C and SSE are in the same patch, with an #ifdef.
I didn't bother splitting them out because for other encodings, we want one
of the other approaches above. For those, "C retail" refers to a static
inline function to code the contents of the inner loop, if I understood
your suggestion correctly. This needs more boilerplate in each function, so
I don't prefer this. "C func pointer" refers to the pointer approach I just
mentioned. That is the cleanest looking way to generalize it, so I only
tested that version with different strides -- 8 and 16 bytes.

This is the same test I used earlier, which is the test in [2] but adding
an almost-pure multibyte Chinese text of about the same size.

x86-64 Linux gcc 8.4.0:

 build            | chinese | mixed | ascii
------------------+---------+-------+-------
 master           |    1480 |   848 |   428
 inline SSE       |    1617 |   634 |    63
 inline C         |    1481 |   843 |    50
 C retail         |    1493 |   838 |    49
 C func pointer   |    1467 |   851 |    49
 C func pointer 8 |    1518 |   757 |    56

x86-64 MacOS clang 10.0.0:

 build            | chinese | mixed | ascii
------------------+---------+-------+-------
 master           |    1086 |   760 |   374
 inline SSE       |    1081 |   529 |    70
 inline C         |    1093 |   649 |    49
 C retail         |    1132 |   695 |   152
 C func pointer   |    1085 |   609 |    59
 C func pointer 8 |    1099 |   571 |    71

PowerPC-LE Linux gcc 4.8.5:

 build            | chinese | mixed | ascii
------------------+---------+-------+-------
 master           |    2961 |  1525 |   871
 inline SSE       |   (n/a) | (n/a) | (n/a)
 inline C         |    2911 |  1329 |    80
 C retail         |    2838 |  1311 |   102
 C func pointer   |    2828 |  1314 |    80
 C func pointer 8 |    3143 |  1249 |   133

Looking at the results, the main advantage of SSE here is it's more robust
for mixed inputs. If a 16-byte chunk is not ascii-only but contains a block
of ascii at the front, we can skip those with a single CPU instruction, but
in C, we have to verify the whole chunk using the slow path.
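To illustrate the point about skipping the ASCII prefix in one instruction, here is a sketch (hypothetical names, assuming SSE2 plus the GCC/Clang __builtin_ctz builtin; the patch itself may do this differently): the movemask tells us exactly how many leading ASCII bytes a chunk has.

```c
#include <emmintrin.h>          /* SSE2 intrinsics */

/* Sketch, not the patch: length of the pure-ASCII prefix of a
 * 16-byte chunk.  A zero mask means the whole chunk is ASCII;
 * otherwise the lowest set bit marks the first non-ASCII byte. */
static int
ascii_prefix_len(const unsigned char *s)
{
    __m128i chunk = _mm_loadu_si128((const __m128i *) s);
    int     mask = _mm_movemask_epi8(chunk);

    if (mask == 0)
        return 16;              /* whole chunk is ASCII */
    return __builtin_ctz(mask); /* index of first high-bit byte */
}
```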

The "C func pointer approach" seems to win out over the "C retail" approach
(static inline function).

Using an 8-byte stride is slightly better for mixed inputs on all platforms
tested, but regresses on pure ascii and also seems to regress on pure
multibyte. The difference in the multibyte case is small enough that it
could be random, but it happens on two platforms, so I'd say it's real. On
the other hand, pure multibyte is not as common as mixed text.

Overall, I think the function pointer approach with an 8-byte stride is the
best balance. If that's agreeable, next I plan to test with short inputs,
because I think we'll want a guard if-statement to only loop through the
fast path if the string is long enough to justify that.

I also gave a shot at doing full UTF-8 recognition using a DFA, but so
far that has made performance worse. If I ever have more success with
that, I'll add that in the mix.

That's disappointing. Perhaps the SIMD algorithms have higher startup
costs, so that you need longer inputs to benefit? In that case, it might
make sense to check the length of the input and only use the SIMD
algorithm if the input is long enough.

I changed topics a bit quickly, but here I'm talking about using a
table-driven state machine to verify the multibyte case. It's possible I
did something wrong, since my model implementation decodes, and having to
keep track of how many bytes got verified might be the culprit. I'd like to
try again to speed up multibyte, but that might be a PG15 project.

[1]: https://arxiv.org/abs/2010.03090
[2]: /messages/by-id/06d45421-61b8-86dd-e765-f1ce527a5a2f@iki.fi

--
John Naylor
EDB: http://www.enterprisedb.com

#4 John Naylor
john.naylor@enterprisedb.com
In reply to: John Naylor (#3)
Re: [POC] verifying UTF-8 using SIMD instructions

Here is a more polished version of the function pointer approach, now
adapted to all multibyte encodings. Using the not-yet-committed tests from
[1], I found a thinko bug that resulted in the test for nul bytes to not
only be wrong, but probably also elided by the compiler. Doing it correctly
is noticeably slower on pure ascii, but still several times faster than
before, so the conclusions haven't changed any. I'll run full measurements
later this week, but I'll share the patch now for review.

[1]: /messages/by-id/11d39e63-b80a-5f8d-8043-fff04201fadc@iki.fi

--
John Naylor
EDB: http://www.enterprisedb.com

Attachments:

v1-0001-Add-an-ASCII-fast-path-to-multibyte-encoding-veri.patch (application/octet-stream, +159/-23)
#5 Heikki Linnakangas
heikki.linnakangas@enterprisedb.com
In reply to: John Naylor (#4)
Re: [POC] verifying UTF-8 using SIMD instructions

On 07/02/2021 22:24, John Naylor wrote:

Here is a more polished version of the function pointer approach, now
adapted to all multibyte encodings. Using the not-yet-committed tests
from [1], I found a thinko bug that resulted in the test for nul bytes
to not only be wrong, but probably also elided by the compiler. Doing it
correctly is noticeably slower on pure ascii, but still several times
faster than before, so the conclusions haven't changed any. I'll run
full measurements later this week, but I'll share the patch now for review.

As a quick test, I hacked up pg_utf8_verifystr() to use Lemire's
algorithm from the simdjson library [1], see attached patch. I
microbenchmarked it using the same test I used before [2].

These results are with "gcc -O2" using "gcc (Debian 10.2.1-6) 10.2.1
20210110"

unpatched master:

postgres=# \i mbverifystr-speed.sql
CREATE FUNCTION
 mixed | ascii
-------+-------
   728 |   393
(1 row)

v1-0001-Add-an-ASCII-fast-path-to-multibyte-encoding-veri.patch:

 mixed | ascii
-------+-------
   759 |    98
(1 row)

simdjson-utf8-hack.patch:

 mixed | ascii
-------+-------
    53 |    31
(1 row)

So clearly that algorithm is fast. Not sure if it has a high startup
cost, or large code size, or other tradeoffs that we don't want. At
least it depends on SIMD instructions, so it requires more code for the
architecture-specific implementations and autoconf logic and all that.
Nevertheless I think it deserves a closer look; I'm a bit reluctant to
put in half-way measures when there's a clearly superior algorithm out
there.

I also tested the fallback implementation from the simdjson library
(included in the patch, if you uncomment it in simdjson-glue.c):

 mixed | ascii
-------+-------
   447 |    46
(1 row)

I think we should at least try to adopt that. At a high level, it looks
pretty similar to your patch: you load the data 8 bytes at a time, check if
they are all ASCII. If there are any non-ASCII chars, you check the
bytes one by one, otherwise you load the next 8 bytes. Your patch should
be able to achieve the same performance, if done right. I don't think
the simdjson code forbids \0 bytes, so that will add a few cycles, but
still.
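The 8-bytes-at-a-time check described here can be sketched portably like this (illustrative names, not the actual fallback; memcpy keeps the load safe on strict-alignment machines):

```c
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* Sketch of the portable idea: fetch 8 bytes with memcpy (safe on
 * strict-alignment platforms) and test all eight high bits at once.
 * A nonzero result under the mask means some byte is non-ASCII. */
static bool
chunk_is_ascii_word(const unsigned char *s)
{
    uint64_t word;

    memcpy(&word, s, sizeof(word));
    return (word & UINT64_C(0x8080808080808080)) == 0;
}
```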

[1]: https://github.com/simdjson/simdjson
[2]: /messages/by-id/06d45421-61b8-86dd-e765-f1ce527a5a2f@iki.fi

- Heikki

PS. Your patch as it stands isn't safe on systems with strict alignment,
the string passed to the verify function isn't guaranteed to be 8 bytes
aligned. Use memcpy to fetch the next 8-byte chunk to fix.

Attachments:

simdjson-utf8-hack.patch (text/x-patch, charset=UTF-8, +118/-2)
#6 John Naylor
john.naylor@enterprisedb.com
In reply to: Heikki Linnakangas (#5)
Re: [POC] verifying UTF-8 using SIMD instructions

On Mon, Feb 8, 2021 at 6:17 AM Heikki Linnakangas <hlinnaka@iki.fi> wrote:

As a quick test, I hacked up pg_utf8_verifystr() to use Lemire's
algorithm from the simdjson library [1], see attached patch. I
microbenchmarked it using the same test I used before [2].

I've been looking at various iterations of Lemire's utf8 code, and trying
it out was next on my list, so thanks for doing that!

These results are with "gcc -O2" using "gcc (Debian 10.2.1-6) 10.2.1
20210110"

unpatched master:

postgres=# \i mbverifystr-speed.sql
CREATE FUNCTION
 mixed | ascii
-------+-------
   728 |   393
(1 row)

v1-0001-Add-an-ASCII-fast-path-to-multibyte-encoding-veri.patch:

 mixed | ascii
-------+-------
   759 |    98
(1 row)

Hmm, the mixed case got worse -- I haven't seen that in any of my tests.

simdjson-utf8-hack.patch:

 mixed | ascii
-------+-------
    53 |    31
(1 row)

So clearly that algorithm is fast. Not sure if it has a high startup
cost, or large code size, or other tradeoffs that we don't want.

The simdjson lib uses everything up through AVX512 depending on what
hardware is available. I seem to remember reading that high start-up cost
is more relevant to floating point than to integer ops, but I could be
wrong. Just the utf8 portion is surely tiny also.

At
least it depends on SIMD instructions, so it requires more code for the
architecture-specific implementations and autoconf logic and all that.

One of his earlier demos [1] (in simdutf8check.h) had a version that used
mostly SSE2 with just three intrinsics from SSSE3. That's widely available
by now. He measured that at 0.7 cycles per byte, which is still good
compared to AVX2's 0.45 cycles per byte [2].

Testing for three SSSE3 intrinsics in autoconf is pretty easy. I would
assume that if that check (and the corresponding runtime check) passes, we
can assume SSE2. That code has three licenses to choose from -- Apache 2,
Boost, and MIT. Something like that might be straightforward to start
from. I think the only obstacles to worry about are license and getting it
to fit into our codebase. Adding more than zero high-level comments with a
good description of how it works in detail is also a bit of a challenge.

I also tested the fallback implementation from the simdjson library
(included in the patch, if you uncomment it in simdjson-glue.c):

 mixed | ascii
-------+-------
   447 |    46
(1 row)

I think we should at least try to adopt that. At a high level, it looks
pretty similar to your patch: you load the data 8 bytes at a time, check if
they are all ASCII. If there are any non-ASCII chars, you check the
bytes one by one, otherwise you load the next 8 bytes. Your patch should
be able to achieve the same performance, if done right. I don't think
the simdjson code forbids \0 bytes, so that will add a few cycles, but
still.

Okay, I'll look into that.

PS. Your patch as it stands isn't safe on systems with strict alignment,
the string passed to the verify function isn't guaranteed to be 8 bytes
aligned. Use memcpy to fetch the next 8-byte chunk to fix.

Will do.

[1]: https://github.com/lemire/fastvalidate-utf-8/tree/master/include
[2]: https://lemire.me/blog/2018/10/19/validating-utf-8-bytes-using-only-0-45-cycles-per-byte-avx-edition/

--
John Naylor
EDB: http://www.enterprisedb.com

#7 John Naylor
john.naylor@enterprisedb.com
In reply to: Heikki Linnakangas (#5)
Re: [POC] verifying UTF-8 using SIMD instructions

On Mon, Feb 8, 2021 at 6:17 AM Heikki Linnakangas <hlinnaka@iki.fi> wrote:

I also tested the fallback implementation from the simdjson library
(included in the patch, if you uncomment it in simdjson-glue.c):

 mixed | ascii
-------+-------
   447 |    46
(1 row)

I think we should at least try to adopt that. At a high level, it looks
pretty similar to your patch: you load the data 8 bytes at a time, check if
they are all ASCII. If there are any non-ASCII chars, you check the
bytes one by one, otherwise you load the next 8 bytes. Your patch should
be able to achieve the same performance, if done right. I don't think
the simdjson code forbids \0 bytes, so that will add a few cycles, but
still.

That fallback is very similar to my "inline C" case upthread, and they both
actually check 16 bytes at a time (the comment is wrong in the patch you
shared). I can work back and show how the performance changes with each
difference (just MacOS, clang 10 here):

master:

 mixed | ascii
-------+-------
   757 |   366

v1, but using memcpy():

 mixed | ascii
-------+-------
   601 |   129

remove zero-byte check:

 mixed | ascii
-------+-------
   588 |    93

inline ascii fastpath into pg_utf8_verifystr():

 mixed | ascii
-------+-------
   595 |    71

use 16-byte stride:

 mixed | ascii
-------+-------
   652 |    49

With this cpu/compiler, v1 is fastest on the mixed input all else being
equal.

Maybe there's a smarter way to check for zeros in C. Or maybe be more
careful about cache -- running memchr() on the whole input first might not
be the best thing to do.

--
John Naylor
EDB: http://www.enterprisedb.com

#8 Heikki Linnakangas
heikki.linnakangas@enterprisedb.com
In reply to: John Naylor (#7)
Re: [POC] verifying UTF-8 using SIMD instructions

On 09/02/2021 22:08, John Naylor wrote:

Maybe there's a smarter way to check for zeros in C. Or maybe be more
careful about cache -- running memchr() on the whole input first might
not be the best thing to do.

The usual trick is the haszero() macro here:
https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord. That's
how memchr() is typically implemented, too.
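For reference, the bithacks trick widened to 64 bits looks like this (a sketch; the macro on that page is written for 32-bit words, and the helper name here is made up):

```c
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* The bithacks haszero() trick, 64-bit version: a byte becomes 0x80
 * in the result iff it was 0x00 in the input, so the whole expression
 * is nonzero iff the word contains a zero byte. */
#define haszero64(v) \
    (((v) - UINT64_C(0x0101010101010101)) & ~(v) & UINT64_C(0x8080808080808080))

/* Sketch: check an 8-byte chunk for an embedded NUL without a
 * per-byte loop; memcpy keeps the load alignment-safe. */
static bool
chunk_has_nul(const unsigned char *s)
{
    uint64_t word;

    memcpy(&word, s, sizeof(word));
    return haszero64(word) != 0;
}
```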

- Heikki

#9 John Naylor
john.naylor@enterprisedb.com
In reply to: John Naylor (#6)
Re: [POC] verifying UTF-8 using SIMD instructions

I wrote:

One of his earlier demos [1] (in simdutf8check.h) had a version that used
mostly SSE2 with just three intrinsics from SSSE3. That's widely available
by now. He measured that at 0.7 cycles per byte, which is still good
compared to AVX2's 0.45 cycles per byte [2].

Testing for three SSSE3 intrinsics in autoconf is pretty easy. I would
assume that if that check (and the corresponding runtime check) passes, we
can assume SSE2. That code has three licenses to choose from -- Apache 2,
Boost, and MIT. Something like that might be straightforward to start from.
I think the only obstacles to worry about are license and getting it to fit
into our codebase. Adding more than zero high-level comments with a good
description of how it works in detail is also a bit of a challenge.

I double checked, and it's actually two SSSE3 intrinsics and one SSE4.1,
but the 4.1 one can be emulated with a few SSE2 intrinsics. But we could
probably fold all three into the SSE4.2 CRC check and have a single symbol
to save on boilerplate.
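The thread doesn't name the SSE4.1 intrinsic, but assuming it is the usual suspect, _mm_testz_si128 (which tests whether a & b is all zeros), the SSE2 emulation could look like this sketch:

```c
#include <emmintrin.h>          /* SSE2 intrinsics */

/* Sketch: emulate _mm_testz_si128(a, b) -- "is (a & b) all zeros?" --
 * using only SSE2, at the cost of a few extra instructions.
 * (Assumption: the SSE4.1 intrinsic in question is _mm_testz_si128;
 * the message above doesn't say which one it is.) */
static int
testz_sse2(__m128i a, __m128i b)
{
    __m128i anded = _mm_and_si128(a, b);
    __m128i zero = _mm_setzero_si128();

    /* (a & b) is all zeros <=> every byte compares equal to zero,
     * i.e. every lane of the comparison mask is set */
    return _mm_movemask_epi8(_mm_cmpeq_epi8(anded, zero)) == 0xFFFF;
}
```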

I hacked that demo [1] into wchar.c (very ugly patch attached), and got the
following:

master:

 mixed | ascii
-------+-------
   757 |   366

Lemire demo:

 mixed | ascii
-------+-------
   172 |   168

This one lacks an ascii fast path, but the AVX2 version in the same file
has one that could probably be easily adapted. With that, I think this
would be worth adapting to our codebase and license. Thoughts?

The advantage of this demo is that it's not buried in a mountain of modern
C++.

Simdjson can use AVX -- do you happen to know which target it got compiled
to? AVX vectors are 256 bits wide and that requires OS support. The OSes we
care most about were updated 8-12 years ago, but that would still be
something to check, in addition to more configure checks.

[1]: https://github.com/lemire/fastvalidate-utf-8/tree/master/include

--
John Naylor
EDB: http://www.enterprisedb.com

Attachments:

utf-sse42-demo.patch (application/octet-stream, +174/-0)
#10 John Naylor
john.naylor@enterprisedb.com
In reply to: Heikki Linnakangas (#8)
Re: [POC] verifying UTF-8 using SIMD instructions

On Tue, Feb 9, 2021 at 4:22 PM Heikki Linnakangas <hlinnaka@iki.fi> wrote:

On 09/02/2021 22:08, John Naylor wrote:

Maybe there's a smarter way to check for zeros in C. Or maybe be more
careful about cache -- running memchr() on the whole input first might
not be the best thing to do.

The usual trick is the haszero() macro here:
https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord. That's
how memchr() is typically implemented, too.

Thanks for that. Checking with that macro each loop iteration gives a small
boost:

v1, but using memcpy():

 mixed | ascii
-------+-------
   601 |   129

with haszero():

 mixed | ascii
-------+-------
   583 |   105

remove zero-byte check:

 mixed | ascii
-------+-------
   588 |    93

--
John Naylor
EDB: http://www.enterprisedb.com

#11 John Naylor
john.naylor@enterprisedb.com
In reply to: Heikki Linnakangas (#5)
Re: [POC] verifying UTF-8 using SIMD instructions

On Mon, Feb 8, 2021 at 6:17 AM Heikki Linnakangas <hlinnaka@iki.fi> wrote:

I also tested the fallback implementation from the simdjson library
(included in the patch, if you uncomment it in simdjson-glue.c):

 mixed | ascii
-------+-------
   447 |    46
(1 row)

I think we should at least try to adopt that. At a high level, it looks
pretty similar to your patch: you load the data 8 bytes at a time, check if
they are all ASCII. If there are any non-ASCII chars, you check the
bytes one by one, otherwise you load the next 8 bytes. Your patch should
be able to achieve the same performance, if done right. I don't think
the simdjson code forbids \0 bytes, so that will add a few cycles, but
still.

Attached is a patch that does roughly what the simdjson fallback did, except
I use straight tests on the bytes and only calculate code points in assertion
use straight tests on the bytes and only calculate code points in assertion
builds. In the course of doing this, I found that my earlier concerns about
putting the ascii check in a static inline function were due to my
suboptimal loop implementation. I had assumed that if the chunked ascii
check failed, it had to check all those bytes one at a time. As it turns
out, that's a waste of the branch predictor. In the v2 patch, we do the
chunked ascii check every time we loop. With that, I can also confirm the
claim in the Lemire paper that it's better to do the check on 16-byte
chunks:

(MacOS, Clang 10)

master:

 chinese | mixed | ascii
---------+-------+-------
    1081 |   761 |   366

v2 patch, with 16-byte stride:

 chinese | mixed | ascii
---------+-------+-------
     806 |   474 |    83

patch but with 8-byte stride:

 chinese | mixed | ascii
---------+-------+-------
     792 |   490 |   105
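The loop restructuring described above (re-try the chunked ASCII check on every iteration instead of falling into a byte-at-a-time recovery loop) can be sketched as a toy validator. All names here are hypothetical, and the per-character check is deliberately simplified: it tests UTF-8 sequence structure only, not overlong forms or surrogates.

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define CHUNK 8

/* Returns CHUNK if the next CHUNK bytes are all ASCII, else 0. */
static int
ascii_chunk_len(const unsigned char *s, size_t len)
{
    uint64_t word;

    if (len < CHUNK)
        return 0;
    memcpy(&word, s, CHUNK);
    return (word & UINT64_C(0x8080808080808080)) == 0 ? CHUNK : 0;
}

/* Verify one UTF-8 character; returns its length, or 0 if invalid.
 * Simplified: structural checks only. */
static int
verify_one_char(const unsigned char *s, size_t len)
{
    int l, i;

    if (s[0] < 0x80)
        l = 1;
    else if ((s[0] & 0xe0) == 0xc0)
        l = 2;
    else if ((s[0] & 0xf0) == 0xe0)
        l = 3;
    else if ((s[0] & 0xf8) == 0xf0)
        l = 4;
    else
        return 0;
    if ((size_t) l > len)
        return 0;
    for (i = 1; i < l; i++)
        if ((s[i] & 0xc0) != 0x80)
            return 0;
    return l;
}

/* The loop shape: chunked ASCII check every iteration, verifying at
 * most one multibyte character between chunk attempts. */
static bool
verify_str(const unsigned char *s, size_t len)
{
    while (len > 0)
    {
        int l = ascii_chunk_len(s, len);

        if (l == 0)
            l = verify_one_char(s, len);
        if (l == 0)
            return false;
        s += l;
        len -= l;
    }
    return true;
}
```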

I also included the fast path in all other multibyte encodings, and that is
also pretty good performance-wise. It regresses from master on pure
multibyte input, but that case is still faster than PG13, which I simulated
by reverting 6c5576075b0f9 and b80e10638e3:

~PG13:

 chinese | mixed | ascii
---------+-------+-------
    1565 |   848 |   365

ascii fast-path plus pg_*_verifychar():

 chinese | mixed | ascii
---------+-------+-------
    1279 |   656 |    94

v2 has a rough start to having multiple implementations in
src/backend/port. Next steps are:

1. Add more tests for utf-8 coverage (in addition to the ones to be added
by the noError argument patch)
2. Add SSE4 validator -- it turns out the demo I referred to earlier
doesn't match the algorithm in the paper. I plan to only copy the lookup
tables from simdjson verbatim, but the code will basically be written from
scratch, using simdjson as a hint.
3. Adjust configure.ac

--
John Naylor
EDB: http://www.enterprisedb.com

Attachments:

v2-add-portability-stub-and-new-fallback.patch (application/octet-stream, +425/-25)
#12 Heikki Linnakangas
heikki.linnakangas@enterprisedb.com
In reply to: John Naylor (#11)
Re: [POC] verifying UTF-8 using SIMD instructions

On 13/02/2021 03:31, John Naylor wrote:

On Mon, Feb 8, 2021 at 6:17 AM Heikki Linnakangas <hlinnaka@iki.fi> wrote:

I also tested the fallback implementation from the simdjson library
(included in the patch, if you uncomment it in simdjson-glue.c):

 mixed | ascii
-------+-------
   447 |    46
(1 row)

I think we should at least try to adopt that. At a high level, it looks
pretty similar to your patch: you load the data 8 bytes at a time, check if
they are all ASCII. If there are any non-ASCII chars, you check the
bytes one by one, otherwise you load the next 8 bytes. Your patch should
be able to achieve the same performance, if done right. I don't think
the simdjson code forbids \0 bytes, so that will add a few cycles, but
still.

Attached is a patch that does roughly what the simdjson fallback did, except
I use straight tests on the bytes and only calculate code points in
assertion builds. In the course of doing this, I found that my earlier
concerns about putting the ascii check in a static inline function were
due to my suboptimal loop implementation. I had assumed that if the
chunked ascii check failed, it had to check all those bytes one at a
time. As it turns out, that's a waste of the branch predictor. In the v2
patch, we do the chunked ascii check every time we loop. With that, I
can also confirm the claim in the Lemire paper that it's better to do
the check on 16-byte chunks:

(MacOS, Clang 10)

master:

 chinese | mixed | ascii
---------+-------+-------
    1081 |   761 |   366

v2 patch, with 16-byte stride:

 chinese | mixed | ascii
---------+-------+-------
     806 |   474 |    83

patch but with 8-byte stride:

 chinese | mixed | ascii
---------+-------+-------
     792 |   490 |   105

I also included the fast path in all other multibyte encodings, and that
is also pretty good performance-wise.

Cool.

It regresses from master on pure
multibyte input, but that case is still faster than PG13, which I
simulated by reverting 6c5576075b0f9 and b80e10638e3:

I thought the "chinese" numbers above are pure multibyte input, and it
seems to do well on that. Where does it regress? In multibyte encodings
other than UTF-8? How bad is the regression?

I tested this on my first generation Raspberry Pi (chipmunk). I had to
tweak it a bit to make it compile, since the SSE autodetection code was
not finished yet. And I used generate_series(1, 1000) instead of
generate_series(1, 10000) in the test script (mbverifystr-speed.sql)
because this system is so slow.

master:

 mixed | ascii
-------+-------
  1310 |  1041
(1 row)

v2-add-portability-stub-and-new-fallback.patch:

 mixed | ascii
-------+-------
  2979 |   910
(1 row)

I'm guessing that's because the unaligned access in check_ascii() is
expensive on this platform.

- Heikki

#13 John Naylor
john.naylor@enterprisedb.com
In reply to: Heikki Linnakangas (#12)
Re: [POC] verifying UTF-8 using SIMD instructions

On Mon, Feb 15, 2021 at 9:18 AM Heikki Linnakangas <hlinnaka@iki.fi> wrote:

Attached is the first attempt at using SSE4 to do the validation, but first
I'll answer your questions about the fallback.

I should mention that v2 had a correctness bug for 4-byte characters that I
found when I was writing regression tests. It shouldn't materially affect
performance, however.

I thought the "chinese" numbers above are pure multibyte input, and it
seems to do well on that. Where does it regress? In multibyte encodings
other than UTF-8?

Yes, the second set of measurements was intended to represent multibyte
encodings other than UTF-8. But instead of using one of those encodings, I
simulated non-UTF-8 by copying the pattern used for those: in the loop,
check for ascii then either advance or verify one character. It was a quick
way to use the same test.

How bad is the regression?

I'll copy the measurements here together with master so it's easier to
compare:

~= PG13 (revert 6c5576075b0f9 and b80e10638e3):

 chinese | mixed | ascii
---------+-------+-------
    1565 |   848 |   365

master:

 chinese | mixed | ascii
---------+-------+-------
    1081 |   761 |   366

ascii fast-path plus pg_*_verifychar():

 chinese | mixed | ascii
---------+-------+-------
    1279 |   656 |    94

As I mentioned upthread, pure multibyte is still faster than PG13. Reducing
the ascii check to 8 bytes at a time might alleviate the regression.

I tested this on my first generation Raspberry Pi (chipmunk). I had to
tweak it a bit to make it compile, since the SSE autodetection code was
not finished yet. And I used generate_series(1, 1000) instead of
generate_series(1, 10000) in the test script (mbverifystr-speed.sql)
because this system is so slow.

master:

 mixed | ascii
-------+-------
  1310 |  1041
(1 row)

v2-add-portability-stub-and-new-fallback.patch:

 mixed | ascii
-------+-------
  2979 |   910
(1 row)

I'm guessing that's because the unaligned access in check_ascii() is
expensive on this platform.

Hmm, I used memcpy() as suggested. Is that still slow on that platform?
That's 32-bit, right? Some possible remedies:

1) For the COPY FROM case, we should align the allocation on a cacheline --
we already have examples of that idiom elsewhere. I was actually going to
suggest doing this anyway, since unaligned SIMD loads are often slower, too.

2) As the simdjson fallback was based on Fuchsia (the Lemire paper implies
it was tested carefully on Arm and I have no reason to doubt that), I could
try to follow that example more faithfully by computing the actual
codepoints. It's more computation and just as many branches as far as I can
tell, but it's not a lot of work. I can add that alternative fallback to
the patch set. I have no Arm machines, but I can test on a POWER8 machine.

3) #ifdef out the ascii check for 32-bit platforms.

4) Same as the non-UTF8 case -- only check for ascii 8 bytes at a time.
I'll probably try this first.

Now, I'm pleased to report that I got SSE4 working. It still needs some
stress testing to flush out corner-case bugs, but it shouldn't be too early
to share some numbers on Clang 10 / macOS:

master:

chinese | mixed | ascii
---------+-------+-------
1082 | 751 | 364

v3 with SSE4.1:

chinese | mixed | ascii
---------+-------+-------
127 | 128 | 126

Some caveats and notes:

- It takes almost no recognizable code from simdjson, but it does take the
magic-constant lookup tables almost verbatim. The main body of the code
has no intrinsics at all (I think); they're all hidden inside static inline
helper functions. I reused some cryptic variable names from simdjson. It's
a bit messy, but not terrible.

- It diffs against the noError conversion patch and adds additional tests.

- It's not smart enough to stop at the last valid character boundary --
it's either all-valid or it must start over with the fallback. That will
have to change in order to work with the proposed noError conversions. It
shouldn't be very hard, but needs thought as to the clearest and safest way
to code it.

- There is no ascii fast-path yet. With this algorithm we have to be a bit
more careful since a valid ascii chunk could be preceded by an incomplete
sequence at the end of the previous chunk. Not too hard, just a bit more
work.

- This is my first time hacking autoconf, and it still seems slightly
broken, yet functional on my machine at least.

- It only needs SSE4.1, but I didn't want to create a whole new set of
CFLAGS, so it just reuses SSE4.2 for the runtime check and the macro names.
Also, it doesn't test for SSE2; it just insists on 64-bit for the runtime
check. I imagine it would refuse to build on 32-bit machines if you passed
it -msse42.

- There is a placeholder for Windows support, but it's not developed.

- I had to add a large number of casts to get rid of warnings in the magic
constants macros. That needs some polish.

I also attached a C file that visually demonstrates every step of the
algorithm following the example found in Table 9 in the paper. That
contains the skeleton coding I started with and got abandoned early, so it
might differ from the actual patch.

--
John Naylor
EDB: http://www.enterprisedb.com

Attachments:

v3-SSE4-with-autoconf-support.patch (+1131 -26)
test-utf8.c
#14 John Naylor
john.naylor@enterprisedb.com
In reply to: John Naylor (#13)
Re: [POC] verifying UTF-8 using SIMD instructions

I wrote:

[v3]
- It's not smart enough to stop at the last valid character boundary --

it's either all-valid or it must start over with the fallback. That will
have to change in order to work with the proposed noError conversions. It
shouldn't be very hard, but needs thought as to the clearest and safest way
to code it.

In v4, it should be able to return an accurate count of valid bytes even
when the end crosses a character boundary.

- This is my first time hacking autoconf, and it still seems slightly

broken, yet functional on my machine at least.

It was actually completely broken if you tried to pass the special flags to
configure. I redesigned this part and it seems to work now.

--
John Naylor
EDB: http://www.enterprisedb.com

Attachments:

v4-SSE4-with-autoconf-support.patch (+1077 -66)
#15 John Naylor
john.naylor@enterprisedb.com
In reply to: John Naylor (#13)
Re: [POC] verifying UTF-8 using SIMD instructions

On Mon, Feb 15, 2021 at 9:32 PM John Naylor <john.naylor@enterprisedb.com>
wrote:

On Mon, Feb 15, 2021 at 9:18 AM Heikki Linnakangas <hlinnaka@iki.fi>

wrote:

I'm guessing that's because the unaligned access in check_ascii() is
expensive on this platform.

Some possible remedies:

3) #ifdef out the ascii check for 32-bit platforms.

4) Same as the non-UTF8 case -- only check for ascii 8 bytes at a time.

I'll probably try this first.

I've attached a couple of patches to try on top of v4; maybe they'll help
the Arm32 regression. 01 reduces the stride to 8 bytes, and 02 applies on
top of 01 to disable the fallback fast path entirely on 32-bit platforms. A
bit of a heavy hammer, but it'll confirm (or not) your theory about
unaligned loads.

Also, I've included patches to explain more fully how I modeled non-UTF-8
performance while still using the UTF-8 tests. I think it was a useful
thing to do, and I have a theory that might predict how a non-UTF8 encoding
will perform with the fast path.

03A and 03B conflict with each other, so apply only one; each applies on
top of v4 (02 is not needed). Both replace the v4 fallback with the ascii
fast path + pg_utf8_verifychar() in the loop, similar to UTF-8 on master.
03A has a local static copy of pg_utf8_islegal(), and 03B uses the existing
global function. (On x86, you can disable SSE4 by passing
USE_FALLBACK_UTF8=1 to configure.)

While Clang 10 regressed for me on pure multibyte in a similar test
upthread, on Linux gcc 8.4 there isn't a regression at all. IIRC, gcc
wasn't as good as Clang when the API changed a few weeks ago, so even its
regression from v4 is still faster than master. Clang only regressed with
my changes because it somehow handled master much better to begin with.

x86-64 Linux gcc 8.4

master

chinese | mixed | ascii
---------+-------+-------
1453 | 857 | 428

v4 (fallback verifier written as a single function)

chinese | mixed | ascii
---------+-------+-------
815 | 514 | 82

v4 plus addendum 03A -- emulate non-utf-8 using a copy of
pg_utf8_islegal() as a static function

chinese | mixed | ascii
---------+-------+-------
1115 | 547 | 87

v4 plus addendum 03B -- emulate non-utf-8 using pg_utf8_islegal() as a
global function

chinese | mixed | ascii
---------+-------+-------
1279 | 604 | 82

(I also tried the same on ppc64le Linux, gcc 4.8.5 and while not great, it
never got worse than master either on pure multibyte.)

This is supposed to model the performance of a non-UTF-8 encoding, where we
don't have a bespoke function written from scratch. Here's my theory: if an
encoding's pg_*_verifychar() calls a global function such as pg_*_mblen(),
it won't benefit as much from an ascii fast path as one whose
pg_*_verifychar() has no function calls. I'm not sure whether a compiler
can inline a global function's body into call sites in the unit where it's
defined. (I haven't looked at the assembly.) But recall that you didn't
commit 0002 from the earlier encoding change because it didn't perform
well. I looked at that patch again, and while it inlined the
pg_utf8_verifychar() call, it still called the global function
pg_utf8_islegal().

If the above is anything to go by, on gcc at least, I don't think we need
to worry about a regression when adding an ascii fast path to non-utf-8
multibyte encodings.

Regarding SSE, I've added an ascii fast path in my local branch, but it's
not going to be as big a difference because 1) the check is more expensive
in terms of branches than the C case, and 2) because the general case is so
fast already, it's hard to improve upon. I just need to do some testing and
cleanup on the whole thing, and that'll be ready to share.

--
John Naylor
EDB: http://www.enterprisedb.com

Attachments:

addendum-01-8-byte-stride.patch (+6 -7)
addendum-02-remove-ascii-fast-path-32-bit.patch (+4 -1)
addendum-03A-emulate-non-utf8-multibyte-STATIC.patch (+87 -74)
addendum-03B-emulate-non-utf8-multibyte-GLOBAL.patch (+31 -74)
#16 John Naylor
john.naylor@enterprisedb.com
In reply to: John Naylor (#13)
Re: [POC] verifying UTF-8 using SIMD instructions

I made some substantial improvements in v5, and I've taken care of all my
TODOs below. I separated out the non-UTF-8 ascii fast path into a separate
patch, since it's kind of off-topic, and it's not yet clear it's always the
best thing to do.

- It takes almost no recognizable code from simdjson, but it does take

the magic constants lookup tables almost verbatim. The main body of the
code has no intrinsics at all (I think). They're all hidden inside static
inline helper functions. I reused some cryptic variable names from
simdjson. It's a bit messy but not terrible.

In v5, the lookup tables and their comments are cleaned up and modified to
play nice with pgindent.

- It diffs against the noError conversion patch and adds additional tests.

I wanted to get some cfbot testing, so I went ahead and prepended v4 of
Heikki's noError patch so it would apply against master.

- There is no ascii fast-path yet. With this algorithm we have to be a

bit more careful since a valid ascii chunk could be preceded by an
incomplete sequence at the end of the previous chunk. Not too hard, just a
bit more work.

v5 adds an ascii fast path.

- I had to add a large number of casts to get rid of warnings in the

magic constants macros. That needs some polish.

This is much nicer now, only one cast really necessary.

I'm pretty pleased with how it is now, but it could use some thorough
testing for correctness. I'll work on that a bit later.

On my laptop, Clang 10:

master:

chinese | mixed | ascii
---------+-------+-------
1081 | 761 | 366

v5:

chinese | mixed | ascii
---------+-------+-------
136 | 93 | 54

--
John Naylor
EDB: http://www.enterprisedb.com

Attachments:

v4-0001-Add-noError-argument-to-encoding-conversion-funct.patch (+2322 -629)
v5-0002-Use-SSE-4-for-verifying-UTF-8-text.patch (+1084 -68)
v5-0003-Add-an-ASCII-fast-path-to-non-UTF-8-encoding-veri.patch (+90 -1)
#17 John Naylor
john.naylor@enterprisedb.com
In reply to: John Naylor (#16)
Re: [POC] verifying UTF-8 using SIMD instructions

The cfbot reported a build failure on Windows because of the use of binary
literals. I've turned those into hex for v6, so let's see how far it gets
now.

I also decided to leave out the patch that adds an ascii fast path to
non-UTF-8 encodings. That would really require more testing than I have
time for.

As before, 0001 is v4 of Heikki's noError conversion patch, whose
regression tests I build upon.

0002 has no ascii fast path in the fallback implementation. 0003 and 0004
add it back in using 8- and 16-byte strides, respectively. That will make
it easier to test on non-Intel platforms, so we can decide which way to go
here. Also did a round of editing the comments in the SSE4.2 file.

I ran the multibyte conversion regression test found in the message below,
and it passed. That doesn't test UTF-8 explicitly, but all conversions
round-trip through UTF-8, so it does get some coverage.

/messages/by-id/b9e3167f-f84b-7aa4-5738-be578a4db924@iki.fi
--
John Naylor
EDB: http://www.enterprisedb.com

Attachments:

v6-0001-Add-noError-argument-to-encoding-conversion-funct.patch (+2322 -629)
v6-0002-Use-SSE-4-for-verifying-UTF-8-text.patch (+1049 -68)
v6-0003-Add-an-ASCII-fast-path-to-the-fallback-UTF-8-vali.patch (+43 -2)
v6-0004-Widen-the-ASCII-fast-path-stride-in-the-fallback-.patch (+8 -7)
#18 John Naylor
john.naylor@enterprisedb.com
In reply to: John Naylor (#17)
Re: [POC] verifying UTF-8 using SIMD instructions

v7 fixes an obvious mistake in Solution.pm

--
John Naylor
EDB: http://www.enterprisedb.com

Attachments:

v7-0001-Add-noError-argument-to-encoding-conversion-funct.patch (+2322 -629)
v7-0002-Use-SSE-4-for-verifying-UTF-8-text.patch (+1049 -68)
v7-0003-Add-an-ASCII-fast-path-to-the-fallback-UTF-8-vali.patch (+43 -2)
v7-0004-Widen-the-ASCII-fast-path-stride-in-the-fallback-.patch (+8 -7)
#19 Amit Khandekar
amitdkhan.pg@gmail.com
In reply to: John Naylor (#18)
Re: [POC] verifying UTF-8 using SIMD instructions

Hi,

Just a quick question before I move on to review the patch ... The
improvement looks like it is only meant for x86 platforms. Can this be
done in a portable way by arranging for auto-vectorization ? Something
like commit 88709176236caf. This way it would benefit other platforms
as well.

I tried to compile the following code using -O3, and the assembly does
have vectorized instructions.

#include <stdio.h>

int main()
{
    int i;
    char s1[200] = "abcdewhruerhetr";
    char s2[200] = "oweurietiureuhtrethre";
    char s3[200] = {0};

    for (i = 0; i < sizeof(s1); i++)
    {
        s3[i] = s1[i] ^ s2[i];
    }

    printf("%s\n", s3);
    return 0;
}

#20 John Naylor
john.naylor@enterprisedb.com
In reply to: Amit Khandekar (#19)
Re: [POC] verifying UTF-8 using SIMD instructions

On Tue, Mar 9, 2021 at 5:00 AM Amit Khandekar <amitdkhan.pg@gmail.com>
wrote:

Hi,

Just a quick question before I move on to review the patch ... The
improvement looks like it is only meant for x86 platforms.

Actually it's meant to be faster for all platforms, since the C fallback is
quite a bit different from HEAD. I've found it to be faster on ppc64le. An
earlier version of the patch was a loser on 32-bit Arm because of alignment
issues, but if you could run the test script attached to [1] on 64-bit
Arm, I'd be curious to see how it does on 0002, and whether 0003 and 0004
make things better or worse. If there is trouble building on non-x86
platforms, I'd want to fix that also.

(Note: 0001 is not my patch, and I just include it for the tests)

Can this be
done in a portable way by arranging for auto-vectorization ? Something
like commit 88709176236caf. This way it would benefit other platforms
as well.

I'm fairly certain that the author of a compiler capable of doing that in
this case would be eligible for some kind of AI prize. :-)

[1] /messages/by-id/06d45421-61b8-86dd-e765-f1ce527a5a2f@iki.fi
--
John Naylor
EDB: http://www.enterprisedb.com

#21 Amit Khandekar (in reply to #20)
#22 John Naylor (in reply to #21)
#23 John Naylor (in reply to #22)
#24 John Naylor (in reply to #23)
#25 John Naylor (in reply to #24)
#26 Heikki Linnakangas (in reply to #25)
#27 Bruce Momjian (in reply to #26)
#28 Bruce Momjian (in reply to #27)
#29 John Naylor (in reply to #28)
#30 John Naylor (in reply to #29)
#31 John Naylor (in reply to #26)
#32 Heikki Linnakangas (in reply to #27)
#33 John Naylor (in reply to #32)
#34 Heikki Linnakangas (in reply to #33)
#35 Heikki Linnakangas (in reply to #34)
#36 John Naylor (in reply to #35)
#37 Heikki Linnakangas (in reply to #31)
#38 John Naylor (in reply to #37)
#39 Heikki Linnakangas (in reply to #38)
#40 John Naylor (in reply to #39)
#41 John Naylor (in reply to #40)
#42 John Naylor (in reply to #41)
#43 Heikki Linnakangas (in reply to #42)
#44 John Naylor (in reply to #43)
#45 John Naylor (in reply to #44)
#46 Amit Khandekar (in reply to #45)
#47 John Naylor (in reply to #46)
#48 John Naylor (in reply to #47)
#49 Vladimir Sitnikov (in reply to #48)
#50 John Naylor (in reply to #49)
#51 John Naylor (in reply to #50)
#52 John Naylor (in reply to #51)
#53 John Naylor (in reply to #50)
#54 Amit Khandekar (in reply to #51)
#55 Vladimir Sitnikov (in reply to #53)
#56 John Naylor (in reply to #55)
#57 John Naylor (in reply to #56)
#58 Thomas Munro (in reply to #22)
#59 Vladimir Sitnikov (in reply to #57)
#60 John Naylor (in reply to #59)
#61 John Naylor (in reply to #58)
#62 Thomas Munro (in reply to #61)
#63 John Naylor (in reply to #62)
#64 John Naylor (in reply to #60)
#65 Vladimir Sitnikov (in reply to #64)
#66 John Naylor (in reply to #65)
#67 John Naylor (in reply to #65)
#68 John Naylor (in reply to #66)
#69 John Naylor (in reply to #66)
#70 John Naylor (in reply to #69)
#71 John Naylor (in reply to #70)
#72 John Naylor (in reply to #71)
#73 Vladimir Sitnikov (in reply to #72)
#74 John Naylor (in reply to #73)
#75 John Naylor (in reply to #74)
#76 Heikki Linnakangas (in reply to #74)
#77 Godfrin, Philippe E (in reply to #76)
#78 John Naylor (in reply to #76)
#79 John Naylor (in reply to #78)
#80 John Naylor (in reply to #79)