From 5a355ef083cc7de92ae1e5dcc0198866a07919eb Mon Sep 17 00:00:00 2001
From: Andreas Karlsson <andreas@proxel.se>
Date: Tue, 17 Dec 2024 22:47:00 +0100
Subject: [PATCH v1 1/2] Use optimized versions of ICU case conversion for
 UTF-8

Instead of converting to and from UChar when doing case conversions, we
use ICU's UTF-8 versions of the functions (ucasemap_utf8To*) when the
database encoding is UTF-8. This can give a significant speedup, 15-20%,
on short to medium length strings.
---
 src/backend/utils/adt/pg_locale_icu.c | 161 ++++++++++++++++++--------
 1 file changed, 114 insertions(+), 47 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c
index f0a77a767e7..eea6f48f6c3 100644
--- a/src/backend/utils/adt/pg_locale_icu.c
+++ b/src/backend/utils/adt/pg_locale_icu.c
@@ -12,6 +12,7 @@
 #include "postgres.h"
 
 #ifdef USE_ICU
+#include "unicode/ucasemap.h"
 #include <unicode/ucnv.h>
 #include <unicode/ustring.h>
 
@@ -100,9 +101,9 @@ static size_t icu_from_uchar(char *dest, size_t destsize,
 							 const UChar *buff_uchar, int32_t len_uchar);
 static void icu_set_collation_attributes(UCollator *collator, const char *loc,
 										 UErrorCode *status);
-static int32_t icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
-								UChar **buff_dest, UChar *buff_source,
-								int32_t len_source);
+static int32_t icu_convert_case_uchar(ICU_Convert_Func func, pg_locale_t mylocale,
+									  UChar **buff_dest, UChar *buff_source,
+									  int32_t len_source);
 static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
 									   const UChar *src, int32_t srcLength,
 									   const char *locale,
@@ -350,60 +351,126 @@ size_t
 strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
 			 pg_locale_t locale)
 {
-	int32_t		len_uchar;
-	int32_t		len_conv;
-	UChar	   *buff_uchar;
-	UChar	   *buff_conv;
-	size_t		result_len;
-
-	len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
-	len_conv = icu_convert_case(u_strToLower, locale,
-								&buff_conv, buff_uchar, len_uchar);
-	result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
-	pfree(buff_uchar);
-	pfree(buff_conv);
-
-	return result_len;
+	if (GetDatabaseEncoding() == PG_UTF8)
+	{
+		UErrorCode	status = U_ZERO_ERROR;
+		UCaseMap   *casemap;
+		int32_t		needed;
+
+		casemap = ucasemap_open(locale->info.icu.locale, U_FOLD_CASE_DEFAULT, &status);
+		if (U_FAILURE(status))
+			ereport(ERROR,
+					(errmsg("casemap lookup failed: %s", u_errorName(status))));
+
+		status = U_ZERO_ERROR;
+		needed = ucasemap_utf8ToLower(casemap, dest, destsize, src, srclen, &status);
+		ucasemap_close(casemap);
+		if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status))
+			ereport(ERROR,
+					(errmsg("case conversion failed: %s", u_errorName(status))));
+		return needed;
+	}
+	else
+	{
+		int32_t		len_uchar;
+		int32_t		len_conv;
+		UChar	   *buff_uchar;
+		UChar	   *buff_conv;
+		size_t		result_len;
+
+		len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
+		len_conv = icu_convert_case_uchar(u_strToLower, locale, &buff_conv,
+										  buff_uchar, len_uchar);
+		result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
+		pfree(buff_uchar);
+		pfree(buff_conv);
+
+		return result_len;
+	}
 }
 
 size_t
 strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
 			 pg_locale_t locale)
 {
-	int32_t		len_uchar;
-	int32_t		len_conv;
-	UChar	   *buff_uchar;
-	UChar	   *buff_conv;
-	size_t		result_len;
-
-	len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
-	len_conv = icu_convert_case(u_strToTitle_default_BI, locale,
-								&buff_conv, buff_uchar, len_uchar);
-	result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
-	pfree(buff_uchar);
-	pfree(buff_conv);
-
-	return result_len;
+	if (GetDatabaseEncoding() == PG_UTF8)
+	{
+		UErrorCode	status = U_ZERO_ERROR;
+		UCaseMap   *casemap;
+		int32_t		needed;
+
+		casemap = ucasemap_open(locale->info.icu.locale, U_FOLD_CASE_DEFAULT, &status);
+		if (U_FAILURE(status))
+			ereport(ERROR,
+					(errmsg("casemap lookup failed: %s", u_errorName(status))));
+
+		status = U_ZERO_ERROR;
+		needed = ucasemap_utf8ToTitle(casemap, dest, destsize, src, srclen, &status);
+		ucasemap_close(casemap);
+		if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status))
+			ereport(ERROR,
+					(errmsg("case conversion failed: %s", u_errorName(status))));
+		return needed;
+	}
+	else
+	{
+		int32_t		len_uchar;
+		int32_t		len_conv;
+		UChar	   *buff_uchar;
+		UChar	   *buff_conv;
+		size_t		result_len;
+
+		len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
+		len_conv = icu_convert_case_uchar(u_strToTitle_default_BI, locale, &buff_conv,
+										  buff_uchar, len_uchar);
+		result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
+		pfree(buff_uchar);
+		pfree(buff_conv);
+
+		return result_len;
+	}
 }
 
 size_t
 strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
 			 pg_locale_t locale)
 {
-	int32_t		len_uchar;
-	int32_t		len_conv;
-	UChar	   *buff_uchar;
-	UChar	   *buff_conv;
-	size_t		result_len;
-
-	len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
-	len_conv = icu_convert_case(u_strToUpper, locale,
-								&buff_conv, buff_uchar, len_uchar);
-	result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
-	pfree(buff_uchar);
-	pfree(buff_conv);
-
-	return result_len;
+	if (GetDatabaseEncoding() == PG_UTF8)
+	{
+		UErrorCode	status = U_ZERO_ERROR;
+		UCaseMap   *casemap;
+		int32_t		needed;
+
+		casemap = ucasemap_open(locale->info.icu.locale, U_FOLD_CASE_DEFAULT, &status);
+		if (U_FAILURE(status))
+			ereport(ERROR,
+					(errmsg("casemap lookup failed: %s", u_errorName(status))));
+
+		status = U_ZERO_ERROR;
+		needed = ucasemap_utf8ToUpper(casemap, dest, destsize, src, srclen, &status);
+		ucasemap_close(casemap);
+		if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status))
+			ereport(ERROR,
+					(errmsg("case conversion failed: %s", u_errorName(status))));
+		return needed;
+	}
+	else
+	{
+		int32_t		len_uchar;
+		int32_t		len_conv;
+		UChar	   *buff_uchar;
+		UChar	   *buff_conv;
+		size_t		result_len;
+
+		len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
+		len_conv = icu_convert_case_uchar(u_strToUpper, locale, &buff_conv,
+										  buff_uchar, len_uchar);
+		result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
+		pfree(buff_uchar);
+		pfree(buff_conv);
+
+		return result_len;
+	}
 }
 
 /*
@@ -599,8 +666,8 @@ icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len
 }
 
 static int32_t
-icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
-				 UChar **buff_dest, UChar *buff_source, int32_t len_source)
+icu_convert_case_uchar(ICU_Convert_Func func, pg_locale_t mylocale,
+					   UChar **buff_dest, UChar *buff_source, int32_t len_source)
 {
 	UErrorCode	status;
 	int32_t		len_dest;
-- 
2.45.2

