From 92ef288363460f8e1bdce2409c507550b9adcf86 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Fri, 8 Aug 2025 13:36:48 -0700
Subject: [PATCH v3j 1/4] Performance testing infrastructure for normalization.

---
 src/common/unicode/meson.build |   2 +-
 src/common/unicode/norm_test.c | 211 +++++++++++++++++++++++++++++++++
 src/common/unicode_norm.c      |  16 +--
 3 files changed, 221 insertions(+), 8 deletions(-)

diff --git a/src/common/unicode/meson.build b/src/common/unicode/meson.build
index c6a4715ccc5..783a4ad2581 100644
--- a/src/common/unicode/meson.build
+++ b/src/common/unicode/meson.build
@@ -126,7 +126,7 @@ category_test = executable('category_test',
 
 norm_test = executable('norm_test',
   ['norm_test.c', norm_test_table],
-  dependencies: [frontend_port_code, libintl],
+  dependencies: [frontend_port_code, icu, libintl],
   include_directories: inc,
   link_with: [common_static, pgport_static],
   build_by_default: false,
diff --git a/src/common/unicode/norm_test.c b/src/common/unicode/norm_test.c
index 25bc59463f2..743a76e2bfd 100644
--- a/src/common/unicode/norm_test.c
+++ b/src/common/unicode/norm_test.c
@@ -14,11 +14,15 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/time.h>
+#include <unicode/unorm.h>
+#include <unicode/ustring.h>
 
 #include "common/unicode_norm.h"
 
 #include "norm_test_table.h"
 
+
 static char *
 print_wchar_str(const pg_wchar *s)
 {
@@ -56,6 +60,211 @@ pg_wcscmp(const pg_wchar *s1, const pg_wchar *s2)
 	}
 }
 
+#define BUFSIZE (16*1024*1024)
+#define ITER 100
+
+/*
+ * Determine how many bytes the UTF-8 sequence starting at s occupies,
+ * judging solely by its first byte.
+ *
+ * Sequences longer than 4 bytes are not supported, so the result is never
+ * greater than 4.  A lead byte that is invalid, or that announces a longer
+ * sequence than we handle, is reported as having length 1.
+ *
+ * Changing that limit would also require changes in pg_utf2wchar_with_len(),
+ * utf8_to_unicode(), pg_utf8_islegal(), and possibly elsewhere.
+ */
+static int
+utf_mblen(const unsigned char *s)
+{
+	const unsigned char lead = *s;
+
+	if ((lead & 0x80) == 0)
+		return 1;				/* plain ASCII */
+	if ((lead & 0xe0) == 0xc0)
+		return 2;
+	if ((lead & 0xf0) == 0xe0)
+		return 3;
+	if ((lead & 0xf8) == 0xf0)
+		return 4;
+#ifdef NOT_USED
+	if ((lead & 0xfc) == 0xf8)
+		return 5;
+	if ((lead & 0xfe) == 0xfc)
+		return 6;
+#endif
+
+	/* continuation byte or unsupported lead byte */
+	return 1;
+}
+
+/*
+ * Time unicode_normalize() and, under USE_ICU, unorm_normalize() over the
+ * files named by NORM_PERF_NFD_FILE and NORM_PERF_NFC_FILE in the
+ * environment, ITER runs per target form, printing elapsed seconds.  Without
+ * USE_ICU the "ICU" rows time an empty loop.  File and ICU errors exit(1).
+ */
+static void
+perf(void)
+{
+	for (int composed = 0; composed < 2; composed++)
+	{
+		const char *envname = composed ? "NORM_PERF_NFC_FILE" : "NORM_PERF_NFD_FILE";
+		const char *filename = getenv(envname);
+		char *utf8_input = malloc(BUFSIZE);
+		size_t utf8_input_len;
+		pg_wchar *w_input = malloc(BUFSIZE * sizeof(pg_wchar));
+		int w_input_len;
+		FILE *file;
+		unsigned char *p;
+#ifdef USE_ICU
+		UChar *u_input = malloc(BUFSIZE * sizeof(UChar));
+		int u_input_len;
+		UErrorCode status;
+#endif
+
+		if (!filename)
+		{
+			fprintf(stderr, "Must set environment variable %s\n", envname);
+			exit(1);
+		}
+		file = fopen(filename, "rb");
+		if (!file)
+		{
+			fprintf(stderr, "Error opening file %s: %s\n", filename, strerror(errno));
+			exit(1);
+		}
+		utf8_input_len = fread(utf8_input, 1, BUFSIZE-1, file);
+		if (ferror(file))
+		{
+			fprintf(stderr, "read error: %s\n", strerror(errno));
+			exit(1);
+		}
+		if (!feof(file))
+		{
+			fprintf(stderr, "read %zu bytes from %s without reaching EOF\n", utf8_input_len, filename);
+			exit(1);
+		}
+		fclose(file);
+
+		printf("Read %zu bytes from %s\n", utf8_input_len, filename);
+		utf8_input[utf8_input_len] = '\0';
+
+		p = (unsigned char *) utf8_input;
+		for (w_input_len = 0; *p; p += utf_mblen(p))
+			w_input[w_input_len++] = utf8_to_unicode(p);
+		w_input[w_input_len] = (pg_wchar) '\0';
+		printf("Length of w_input: %d\n", w_input_len);
+
+#ifdef USE_ICU
+		status = U_ZERO_ERROR;
+		u_strFromUTF8(u_input, BUFSIZE, &u_input_len, utf8_input, -1, &status);
+		if (U_FAILURE(status))
+		{
+			fprintf(stderr, "error converting from UTF8 to UChar: %s\n", u_errorName(status));
+			exit(1);
+		}
+		printf("Length of u_input: %d\n", u_input_len);
+#endif
+
+		for (int compat = 0; compat < 2; compat++)
+		{
+			const char *target_form_txt = composed ?
+				(compat ? "NFKC" : "NFC") :
+				(compat ? "NFKD" : "NFD");
+
+			for (int icu = 0; icu < 2; icu++)
+			{
+				struct timeval tv0, tv1;
+				double total;
+
+				gettimeofday(&tv0, NULL);
+				for (int iter = 0; iter < ITER; iter++)
+				{
+#ifdef USE_ICU
+					if (icu)
+					{
+						UNormalizationMode form = composed ?
+							(compat ? UNORM_NFKC : UNORM_NFC) :
+							(compat ? UNORM_NFKD : UNORM_NFD);
+						UChar *u_result = malloc(BUFSIZE * sizeof(UChar));
+						status = U_ZERO_ERROR;
+						unorm_normalize(u_input, -1, form, 0, u_result, BUFSIZE, &status);
+						if (U_FAILURE(status))
+						{
+							fprintf(stderr, "Normalization Failure: %s\n", u_errorName(status));
+							exit(1);
+						}
+#ifdef VERIFY_RESULT
+						if (!compat)
+						{
+							char *utf8_result = malloc(BUFSIZE);
+							int utf8_result_len;
+
+							status = U_ZERO_ERROR;
+							u_strToUTF8(utf8_result, BUFSIZE, &utf8_result_len, u_result, -1, &status);
+							if (U_FAILURE(status))
+							{
+								fprintf(stderr, "strToUTF8 error: %s\n", u_errorName(status));
+								exit(1);
+							}
+							if (strcmp(utf8_result, utf8_input) != 0)
+							{
+								fprintf(stderr, "result doesn't match\n");
+								exit(1);
+							}
+							free(utf8_result);	/* allocated anew each iteration */
+						}
+#endif
+						free(u_result);
+					}
+#endif
+					if (!icu)
+					{
+						UnicodeNormalizationForm form = composed ?
+							(compat ? UNICODE_NFKC : UNICODE_NFC) :
+							(compat ? UNICODE_NFKD : UNICODE_NFD);
+						pg_wchar *w_result = unicode_normalize(form, w_input);
+#ifdef VERIFY_RESULT
+						if (!compat)
+						{
+							char *utf8_result = malloc(BUFSIZE);
+
+							p = (unsigned char *) utf8_result;
+							for (pg_wchar *wp = w_result; *wp; wp++)
+							{
+								unicode_to_utf8(*wp, p);
+								p += utf_mblen(p);
+							}
+							*p = '\0';	/* malloc'd buffer: terminate for strcmp() */
+							if (strcmp(utf8_result, utf8_input) != 0)
+							{
+								fprintf(stderr, "result doesn't match\n");
+								exit(1);
+							}
+							free(utf8_result);	/* allocated anew each iteration */
+						}
+#endif
+						free(w_result);
+					}
+				}
+				gettimeofday(&tv1, NULL);
+				total = (((double)tv1.tv_sec * 1000000.0 + tv1.tv_usec) -
+						 ((double)tv0.tv_sec * 1000000.0 + tv0.tv_usec)) / 1000000.0;
+
+				printf("Normalization from %s to %4s with %3s: %07.3f\n",
+					   composed ? "NFD" : "NFC", target_form_txt,
+					   icu ? "ICU" : "PG", total);
+			}
+		}
+		free(utf8_input);
+		free(w_input);
+#ifdef USE_ICU
+		free(u_input);			/* only declared when built with ICU */
+#endif
+	}
+}
+
 int
 main(int argc, char **argv)
 {
@@ -81,6 +290,8 @@ main(int argc, char **argv)
 		}
 	}
 
+	perf();
+
 	printf("norm_test: All tests successful!\n");
 	exit(0);
 }
diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c
index 6654b4cbc49..a3e51499e49 100644
--- a/src/common/unicode_norm.c
+++ b/src/common/unicode_norm.c
@@ -18,8 +18,10 @@
 #include "postgres_fe.h"
 #endif
 
+#define NORMALIZE_PERFECTHASH
+
 #include "common/unicode_norm.h"
-#ifndef FRONTEND
+#ifdef NORMALIZE_PERFECTHASH
 #include "common/unicode_norm_hashfunc.h"
 #include "common/unicode_normprops_table.h"
 #include "port/pg_bswap.h"
@@ -46,7 +48,7 @@
 #define NCOUNT		VCOUNT * TCOUNT
 #define SCOUNT		LCOUNT * NCOUNT
 
-#ifdef FRONTEND
+#ifndef NORMALIZE_PERFECTHASH
 /* comparison routine for bsearch() of decomposition lookup table. */
 static int
 conv_compare(const void *p1, const void *p2)
@@ -71,7 +73,7 @@ conv_compare(const void *p1, const void *p2)
 static const pg_unicode_decomposition *
 get_code_entry(pg_wchar code)
 {
-#ifndef FRONTEND
+#ifdef NORMALIZE_PERFECTHASH
 	int			h;
 	uint32		hashkey;
 	pg_unicode_decompinfo decompinfo = UnicodeDecompInfo;
@@ -254,7 +256,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result)
 		 * recomposed partially.  This lookup uses a perfect hash function for
 		 * the backend code.
 		 */
-#ifndef FRONTEND
+#ifdef NORMALIZE_PERFECTHASH
 
 		int			h,
 					inv_lookup_index;
@@ -304,7 +306,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result)
 				return true;
 			}
 		}
-#endif							/* !FRONTEND */
+#endif							/* NORMALIZE_PERFECTHASH */
 	}
 
 	return false;
@@ -537,7 +539,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
  */
 
 /* We only need this in the backend. */
-#ifndef FRONTEND
+#ifdef NORMALIZE_PERFECTHASH
 
 static const pg_unicode_normprops *
 qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
@@ -631,4 +633,4 @@ unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *
 	return result;
 }
 
-#endif							/* !FRONTEND */
+#endif							/* NORMALIZE_PERFECTHASH */
-- 
2.43.0