From b7deb09e0717ee76d98fd1a40cff4b749bfe1c36 Mon Sep 17 00:00:00 2001
From: David Geier <geidav.pg@gmail.com>
Date: Fri, 14 Nov 2025 11:37:40 +0100
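Subject: [PATCH v3 6/6] Add ASCII fastpath to generate_trgm_only()

Extract words directly inside generate_trgm_only() and remove
find_word().  ASCII bytes are lower-cased with pg_ascii_tolower() while
they are copied into the padded buffer; str_tolower() is no longer
called for every word, only for words detected as containing non-ASCII
characters.
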
---
 contrib/pg_trgm/trgm_op.c | 124 ++++++++++++++++++++------------------
 1 file changed, 65 insertions(+), 59 deletions(-)

diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c
index 39b586f5b9a..d2087b3a45e 100644
--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@@ -226,32 +226,6 @@ show_limit(PG_FUNCTION_ARGS)
 	PG_RETURN_FLOAT4(similarity_threshold);
 }
 
-/*
- * Finds first word in string, returns pointer to the word,
- * endword points to the character after word
- */
-static char *
-find_word(char *str, int lenstr, char **endword, int *charlen)
-{
-	char	   *beginword = str;
-
-	while (beginword - str < lenstr && !ISWORDCHR(beginword))
-		beginword += pg_mblen(beginword);
-
-	if (beginword - str >= lenstr)
-		return NULL;
-
-	*endword = beginword;
-	*charlen = 0;
-	while (*endword - str < lenstr && ISWORDCHR(*endword))
-	{
-		*endword += pg_mblen(*endword);
-		(*charlen)++;
-	}
-
-	return beginword;
-}
-
 /*
  * Reduce a trigram (three possibly multi-byte characters) to a trgm,
  * which is always exactly three bytes.  If we have three single-byte
@@ -337,58 +311,90 @@ make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
 static int
 generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
 {
-	trgm	   *tptr;
-	char	   *buf;
-	int			charlen,
-				bytelen;
-	char	   *bword,
-			   *eword;
+	trgm *tptr = trg;
+	char *buf;
 
 	if (slen + LPADDING + RPADDING < 3 || slen == 0)
 		return 0;
 
-	tptr = trg;
-
-	/* Allocate a buffer for case-folded, blank-padded words */
-	buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4);
+	/* Scratch buffer reused for every word; left padding is written once */
+	buf = palloc_array(char, slen * pg_database_encoding_max_length() + 4 + 1);
+	memset(buf, ' ', LPADDING);
 
-	if (LPADDING > 0)
+	for (int i = 0; i < slen; )
 	{
-		*buf = ' ';
-		if (LPADDING > 1)
-			*(buf + 1) = ' ';
-	}
+		int num_bytes = LPADDING;
+		int num_chars = LPADDING;
+		char *word;
+#ifdef IGNORECASE
+		bool has_highbit = false; /* Word needs locale-aware case folding */
+#endif
 
-	eword = str;
-	while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
-	{
+		/* Copy next word into buf; stops after skipping one non-word char */
+		while (i < slen)
+		{
+			/* mblen can be 1 for non-ASCII, too (single-byte encodings) */
+			const bool ascii = (str[i] & 0x80) == 0;
+			const int mblen = ascii ? 1 : pg_mblen(str + i);
+
+			if (!ISWORDCHR(str + i))
+			{
+				i += mblen;
+				break;
+			}
+
+			/*
+			 * ASCII fast path.  NOTE(review): assumes pg_ascii_tolower() agrees
+			 * with str_tolower() on ASCII for the database locale -- confirm.
+			 */
+			if (ascii)
+			{
 #ifdef IGNORECASE
-		bword = str_tolower(bword, eword - bword, DEFAULT_COLLATION_OID);
-		bytelen = strlen(bword);
+				buf[num_bytes++] = pg_ascii_tolower(str[i++]);
 #else
-		bytelen = eword - bword;
+				buf[num_bytes++] = str[i++];
 #endif
+			}
+			else
+			{
+				memcpy(buf + num_bytes, str + i, mblen);
+				num_bytes += mblen;
+				i += mblen;
+#ifdef IGNORECASE
+				has_highbit = true;
+#endif
+			}
+
+			num_chars++;
+		}
 
-		memcpy(buf + LPADDING, bword, bytelen);
+		if (num_chars > LPADDING)
+		{
+			memset(buf + num_bytes, ' ', RPADDING);
+			num_bytes += RPADDING;
+			num_chars += RPADDING;
+			word = buf;
 
 #ifdef IGNORECASE
-		pfree(bword);
+			if (has_highbit)
+			{
+				word = str_tolower(buf, num_bytes, DEFAULT_COLLATION_OID);
+				num_bytes = strlen(word); /* Folding may change the byte length */
+			}
 #endif
 
-		buf[LPADDING + bytelen] = ' ';
-		buf[LPADDING + bytelen + 1] = ' ';
+			if (bounds)
+				bounds[tptr - trg] |= TRGM_BOUND_LEFT;
+			tptr = make_trigrams(tptr, word, num_bytes, num_chars);
+			if (bounds)
+				bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
 
-		/* Calculate trigrams marking their bounds if needed */
-		if (bounds)
-			bounds[tptr - trg] |= TRGM_BOUND_LEFT;
-		tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
-							 charlen + LPADDING + RPADDING);
-		if (bounds)
-			bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
+			if (word != buf)
+				pfree(word);
+		}
 	}
 
 	pfree(buf);
-
 	return tptr - trg;
 }
 
-- 
2.51.0

