From ccacdfe30614f10a79038df36fab228428335fe1 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Tue, 15 Dec 2020 11:12:45 +0200
Subject: [PATCH 2/5] Replace pg_utf8_verifystr() with a faster implementation.

This inlines the pg_utf8_verifychar() function into the loop. We could do
a lot more - there are much faster SIMD and lookup-table-based algorithms
out there - but I'll leave that for another patch.

In passing, remove the remnants of support for 5- and 6-byte UTF-8
characters. They were considered in very early Unicode versions, but the
current Unicode standard limits the code space to 17 planes, all of which
are representable in at most 4 bytes in UTF-8, and there are no plans to
ever go beyond that.
---
 src/common/wchar.c | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/src/common/wchar.c b/src/common/wchar.c
index 5ab29bcbc39..403974629f7 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -558,12 +558,6 @@ pg_utf_mblen(const unsigned char *s)
 		len = 3;
 	else if ((*s & 0xf8) == 0xf0)
 		len = 4;
-#ifdef NOT_USED
-	else if ((*s & 0xfc) == 0xf8)
-		len = 5;
-	else if ((*s & 0xfe) == 0xfc)
-		len = 6;
-#endif
 	else
 		len = 1;
 	return len;
@@ -1764,28 +1758,37 @@ static int
 pg_utf8_verifystr(const unsigned char *s, int len)
 {
 	const unsigned char *start = s;
+	const unsigned char *end = s + len;
 
-	while (len > 0)
+	while (s < end)
 	{
-		int			l;
+		int                     l;
 
-		/* fast path for ASCII-subset characters */
-		if (!IS_HIGHBIT_SET(*s))
+		if ((*s & 0x80) == 0)
 		{
 			if (*s == '\0')
 				break;
-			l = 1;
+
+			s++;
+			continue;
 		}
+		else if ((*s & 0xe0) == 0xc0)
+			l = 2;
+		else if ((*s & 0xf0) == 0xe0)
+			l = 3;
+		else if ((*s & 0xf8) == 0xf0)
+			l = 4;
 		else
-		{
-			l = pg_utf8_verifychar(s, len);
-			if (l == -1)
-				break;
-		}
+			l = 1;
+
+		if (s + l > end)
+			break;
+
+		if (!pg_utf8_islegal(s, l))
+			break;
+
 		s += l;
-		len -= l;
 	}
-
 	return s - start;
 }
 
@@ -1810,9 +1813,6 @@ pg_utf8_islegal(const unsigned char *source, int length)
 
 	switch (length)
 	{
-		default:
-			/* reject lengths 5 and 6 for now */
-			return false;
 		case 4:
 			a = source[3];
 			if (a < 0x80 || a > 0xBF)
-- 
2.20.1

