From 9e58fe0265d3ba5a52b1193a7ef10d734449a4b1 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Tue, 11 Mar 2025 10:09:23 -0700
Subject: [PATCH v3j 1/2] Refactor convert_case to prepare for optimizations.

---
 src/common/unicode_case.c | 145 ++++++++++++++++++++++++--------------
 1 file changed, 93 insertions(+), 52 deletions(-)

diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index 7afff1b172b..92323be9cd3 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -20,12 +20,20 @@
 #include "common/unicode_category.h"
 #include "mb/pg_wchar.h"
 
+enum CaseMapResult
+{
+	CASEMAP_SELF,				/* no mapping; copy the character unchanged */
+	CASEMAP_SIMPLE,				/* replace with a single character */
+	CASEMAP_SPECIAL,			/* replace with up to MAX_CASE_EXPANSION chars */
+};
+
 static const pg_case_map *find_case_map(pg_wchar ucs);
 static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 						   CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
 						   void *wbstate);
-static bool check_special_conditions(int conditions, const char *str,
-									 size_t len, size_t offset);
+static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full,
+								  const char *src, size_t srclen, size_t srcoff,
+								  pg_wchar *simple, const pg_wchar **special);
 
 pg_wchar
 unicode_lowercase_simple(pg_wchar code)
@@ -214,8 +222,9 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 	{
 		pg_wchar	u1 = utf8_to_unicode((unsigned char *) src + srcoff);
 		int			u1len = unicode_utf8len(u1);
-		const pg_case_map *casemap = find_case_map(u1);
-		const pg_special_case *special = NULL;
+		pg_wchar	simple = 0;
+		const pg_wchar *special = NULL;
+		enum CaseMapResult casemap_result;
 
 		if (str_casekind == CaseTitle)
 		{
@@ -228,56 +237,47 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 				chr_casekind = CaseLower;
 		}
 
-		/*
-		 * Find special case that matches the conditions, if any.
-		 *
-		 * Note: only a single special mapping per codepoint is currently
-		 * supported, though Unicode allows for multiple special mappings for
-		 * a single codepoint.
-		 */
-		if (full && casemap && casemap->special_case)
-		{
-			int16		conditions = casemap->special_case->conditions;
-
-			Assert(casemap->special_case->codepoint == u1);
-			if (check_special_conditions(conditions, src, srclen, srcoff))
-				special = casemap->special_case;
-		}
+		casemap_result = casemap(u1, chr_casekind, full, src, srclen, srcoff,
+								 &simple, &special);
 
-		/* perform mapping, update result_len, and write to dst */
-		if (special)
+		switch (casemap_result)
 		{
-			for (int i = 0; i < MAX_CASE_EXPANSION; i++)
-			{
-				pg_wchar	u2 = special->map[chr_casekind][i];
-				size_t		u2len = unicode_utf8len(u2);
-
-				if (u2 == '\0')
-					break;
-
-				if (result_len + u2len <= dstsize)
-					unicode_to_utf8(u2, (unsigned char *) dst + result_len);
-
-				result_len += u2len;
-			}
-		}
-		else if (casemap)
-		{
-			pg_wchar	u2 = casemap->simplemap[chr_casekind];
-			pg_wchar	u2len = unicode_utf8len(u2);
-
-			if (result_len + u2len <= dstsize)
-				unicode_to_utf8(u2, (unsigned char *) dst + result_len);
-
-			result_len += u2len;
-		}
-		else
-		{
-			/* no mapping; copy bytes from src */
-			if (result_len + u1len <= dstsize)
-				memcpy(dst + result_len, src + srcoff, u1len);
-
-			result_len += u1len;
+			case CASEMAP_SELF:
+				/* no mapping; copy bytes from src */
+				Assert(simple == 0);
+				Assert(special == NULL);
+				if (result_len + u1len <= dstsize)
+					memcpy(dst + result_len, src + srcoff, u1len);
+
+				result_len += u1len;
+				break;
+			case CASEMAP_SIMPLE:
+				{
+					/* replace with single character */
+					pg_wchar	u2 = simple;
+					pg_wchar	u2len = unicode_utf8len(u2);
+
+					Assert(special == NULL);
+					if (result_len + u2len <= dstsize)
+						unicode_to_utf8(u2, (unsigned char *) dst + result_len);
+
+					result_len += u2len;
+				}
+				break;
+			case CASEMAP_SPECIAL:
+				/* replace with up to MAX_CASE_EXPANSION characters */
+				Assert(simple == 0);
+				for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
+				{
+					pg_wchar	u2 = special[i];
+					size_t		u2len = unicode_utf8len(u2);
+
+					if (result_len + u2len <= dstsize)
+						unicode_to_utf8(u2, (unsigned char *) dst + result_len);
+
+					result_len += u2len;
+				}
+				break;
 		}
 
 		srcoff += u1len;
@@ -351,6 +351,10 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 	return true;
 }
 
+/*
+ * Unicode allows for special casing to be applied only under certain
+ * circumstances. The only currently-supported condition is Final_Sigma.
+ */
 static bool
 check_special_conditions(int conditions, const char *str, size_t len,
 						 size_t offset)
@@ -365,6 +369,43 @@ check_special_conditions(int conditions, const char *str, size_t len,
 	return false;
 }
 
+/*
+ * Map codepoint u1 to the requested case.
+ *
+ * If full is true and u1 has a special mapping whose conditions are met,
+ * point *special at the result (an array of up to MAX_CASE_EXPANSION
+ * characters) and return CASEMAP_SPECIAL. Otherwise, if u1 has a case map
+ * entry, store the simple mapping in *simple and return CASEMAP_SIMPLE.
+ * If no mapping is found, return CASEMAP_SELF; the caller should copy the
+ * character without modification.
+ */
+enum CaseMapResult
+casemap(pg_wchar u1, CaseKind casekind, bool full,
+		const char *src, size_t srclen, size_t srcoff,
+		pg_wchar *simple, const pg_wchar **special)
+{
+	const pg_case_map *map = find_case_map(u1);
+
+	if (map == NULL)
+		return CASEMAP_SELF;
+
+	if (full && map->special_case != NULL)
+	{
+		/* the special-case entry must describe this same codepoint */
+		Assert(map->special_case->codepoint == u1);
+		if (check_special_conditions(map->special_case->conditions,
+									 src, srclen, srcoff))
+		{
+			*special = map->special_case->map[casekind];
+			return CASEMAP_SPECIAL;
+		}
+	}
+
+	*simple = map->simplemap[casekind];
+
+	return CASEMAP_SIMPLE;
+}
+
 /* find entry in simple case map, if any */
 static const pg_case_map *
 find_case_map(pg_wchar ucs)
-- 
2.34.1

