From 906282444a5f9eba1067d3515de7ab594ef0d900 Mon Sep 17 00:00:00 2001
From: Yugo Nagata <nagata@sraoss.co.jp>
Date: Thu, 19 Dec 2024 23:41:07 +0900
Subject: [PATCH v2 2/3] Allow ILIKE forward matching to use btree index

Previously, ILIKE could not use btree index if the pattern
has case-varying characters. To enable it, the planner
support function converts an ILINK operator expression to OR
clause that contains two index clauses for the upper letter and
the lower letter respectively.

For example, "t ILIKE '123foo%'" can be converted to
"(t <= '123F'AND t > '123G') OR (t <= '123f' AND t < '123g')",
and bitmap index scan plan could be used for this.
---
 src/backend/utils/adt/like_support.c          | 141 ++++++++++++++----
 src/test/regress/expected/btree_index.out     |  15 +-
 .../regress/expected/collate.icu.utf8.out     |  22 ++-
 src/test/regress/sql/collate.icu.utf8.sql     |   4 +-
 4 files changed, 137 insertions(+), 45 deletions(-)

diff --git a/src/backend/utils/adt/like_support.c b/src/backend/utils/adt/like_support.c
index 8fdc677371..2344615038 100644
--- a/src/backend/utils/adt/like_support.c
+++ b/src/backend/utils/adt/like_support.c
@@ -66,7 +66,10 @@ typedef enum
 
 typedef enum
 {
-	Pattern_Prefix_None, Pattern_Prefix_Partial, Pattern_Prefix_Exact,
+	Pattern_Prefix_None,
+	Pattern_Prefix_Partial,
+	Pattern_Prefix_Partial_IC,
+	Pattern_Prefix_Exact,
 } Pattern_Prefix_Status;
 
 static Node *like_regex_support(Node *rawreq, Pattern_Type ptype);
@@ -245,7 +248,7 @@ match_pattern_prefix(Node *leftop,
 					 Oid opfamily,
 					 Oid indexcollation)
 {
-	List	   *result;
+	List	   *result = NIL;
 	Const	   *patt;
 	Const	   *prefix;
 	Pattern_Prefix_Status pstatus;
@@ -259,6 +262,8 @@ match_pattern_prefix(Node *leftop,
 	Expr	   *expr;
 	FmgrInfo	ltproc;
 	Const	   *greaterstr;
+	List	   *prefixes;
+	ListCell   *lc;
 
 	/*
 	 * Can't do anything with a non-constant or NULL pattern argument.
@@ -434,35 +439,112 @@ match_pattern_prefix(Node *leftop,
 		return NIL;
 
 	/*
-	 * We can always say "x >= prefix".
+	 * If the pattern for case-insensiive matching has a case-varying
+	 * character, make two prefixes including the upper letter and the
+	 * lower letter respectively. For example, if the pattern is
+	 * '123foo%', we get '123F' and '123f' as prefixes.
 	 */
-	if (!op_in_opfamily(geopr, opfamily))
-		return NIL;
-	expr = make_opclause(geopr, BOOLOID, false,
-						 (Expr *) leftop, (Expr *) prefix,
-						 InvalidOid, indexcollation);
-	result = list_make1(expr);
-
-	/*-------
-	 * If we can create a string larger than the prefix, we can say
-	 * "x < greaterstr".  NB: we rely on make_greater_string() to generate
-	 * a guaranteed-greater string, not just a probably-greater string.
-	 * In general this is only guaranteed in C locale, so we'd better be
-	 * using a C-locale index collation.
-	 *-------
-	 */
-	if (!op_in_opfamily(ltopr, opfamily))
-		return result;
-	fmgr_info(get_opcode(ltopr), &ltproc);
-	greaterstr = make_greater_string(prefix, &ltproc, indexcollation);
-	if (greaterstr)
+	if (pstatus == Pattern_Prefix_Partial_IC)
+	{
+		int			prefix_len;
+		Datum		first_letter;
+		Datum		upper_letter;
+		Datum		lower_letter;
+		const char *prefix_l;
+		const char *prefix_u;
+
+		Assert(ptype == Pattern_Type_Like_IC);
+
+		/* get the first case-varying characketer in the pattern */
+		prefix_len = DatumGetInt32(DirectFunctionCall1Coll(textlen,
+														   indexcollation,
+														   prefix->constvalue));
+		first_letter = DirectFunctionCall3Coll(text_substr, indexcollation,
+											   patt->constvalue,
+											   Int32GetDatum(prefix_len + 1),
+											   Int32GetDatum(1));
+
+
+		/* prefix followed by the upper letter */
+		upper_letter = DirectFunctionCall1Coll(upper, indexcollation,
+											   first_letter);
+		prefix_u = TextDatumGetCString(DirectFunctionCall2Coll(textcat,
+															   indexcollation,
+															   prefix->constvalue,
+															   upper_letter));
+		prefixes = list_make1(string_to_const(prefix_u, prefix->consttype));
+
+		/* prefix followed by the lower letter */
+		lower_letter = DirectFunctionCall1Coll(lower, indexcollation,
+											   first_letter);
+		prefix_l = TextDatumGetCString(DirectFunctionCall2Coll(textcat,
+															   indexcollation,
+															   prefix->constvalue,
+															   lower_letter));
+		if (!DatumGetBool(DirectFunctionCall2Coll(texteq,
+										   indexcollation,
+										   CStringGetTextDatum(prefix_u),
+										   CStringGetTextDatum(prefix_l))))
+			prefixes = lappend(prefixes, string_to_const(prefix_l, prefix->consttype));
+	}
+	else
+		prefixes = list_make1(prefix);
+
+	foreach (lc, prefixes)
 	{
-		expr = make_opclause(ltopr, BOOLOID, false,
-							 (Expr *) leftop, (Expr *) greaterstr,
+		List  *clauses;
+
+		prefix = (Const *) lfirst(lc);
+
+		/*
+		 * We can always say "x >= prefix".
+		 */
+		if (!op_in_opfamily(geopr, opfamily))
+			return NIL;
+		expr = make_opclause(geopr, BOOLOID, false,
+							 (Expr *) leftop, (Expr *) prefix,
 							 InvalidOid, indexcollation);
-		result = lappend(result, expr);
+		clauses = list_make1(expr);
+
+		/*-------
+		 * If we can create a string larger than the prefix, we can say
+		 * "x < greaterstr".  NB: we rely on make_greater_string() to generate
+		 * a guaranteed-greater string, not just a probably-greater string.
+		 * In general this is only guaranteed in C locale, so we'd better be
+		 * using a C-locale index collation.
+		 *-------
+		 */
+		if (op_in_opfamily(ltopr, opfamily))
+		{
+			fmgr_info(get_opcode(ltopr), &ltproc);
+			greaterstr = make_greater_string(prefix, &ltproc, indexcollation);
+			if (greaterstr)
+			{
+				expr = make_opclause(ltopr, BOOLOID, false,
+									 (Expr *) leftop, (Expr *) greaterstr,
+									 InvalidOid, indexcollation);
+				clauses = lappend(clauses, expr);
+			}
+		}
+
+		/*
+		 * If case-insensitive pattern matching generated two clauses for
+		 * uppder and lower letters, convert them to explicit AND because
+		 * they will be under OR later.
+		 */
+		if (pstatus == Pattern_Prefix_Partial_IC && list_length(prefixes) > 1)
+			result = lappend(result, make_ands_explicit(clauses));
+		else
+			result = clauses;
 	}
 
+	/*
+	 * If case-insensitive pattern matching generated two clauses for upper and
+	 * lower letters, OR them.
+	 */
+	if (pstatus == Pattern_Prefix_Partial_IC && list_length(prefixes) > 1)
+		result = list_make1(make_orclause(result));
+
 	return result;
 }
 
@@ -997,6 +1079,7 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
 				match_pos;
 	bool		is_multibyte = (pg_database_encoding_max_length() > 1);
 	pg_locale_t locale = 0;
+	bool		has_case_varying = false;
 
 	/* the right-hand const is type text or bytea */
 	Assert(typeid == BYTEAOID || typeid == TEXTOID);
@@ -1058,7 +1141,10 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
 		/* Stop if case-varying character (it's sort of a wildcard) */
 		if (case_insensitive &&
 			pattern_char_isalpha(patt[pos], is_multibyte, locale))
+		{
+			has_case_varying = true;
 			break;
+		}
 
 		match[match_pos++] = patt[pos];
 	}
@@ -1077,6 +1163,9 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
 	pfree(patt);
 	pfree(match);
 
+	if (case_insensitive && has_case_varying)
+		return Pattern_Prefix_Partial_IC;
+
 	/* in LIKE, an empty pattern is an exact match! */
 	if (pos == pattlen)
 		return Pattern_Prefix_Exact;	/* reached end of pattern, so exact */
diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out
index 8879554c3f..6ffad39dad 100644
--- a/src/test/regress/expected/btree_index.out
+++ b/src/test/regress/expected/btree_index.out
@@ -459,14 +459,19 @@ select proname from pg_proc where proname ilike '00%foo' order by 1;
 
 explain (costs off)
 select proname from pg_proc where proname ilike 'ri%foo' order by 1;
-                  QUERY PLAN                  
-----------------------------------------------
+                                                                                             QUERY PLAN                                                                                             
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
  Sort
    Sort Key: proname
-   ->  Seq Scan on pg_proc
-         Disabled: true
+   ->  Bitmap Heap Scan on pg_proc
+         Recheck Cond: (((proname >= 'R'::text) AND (proname < 'S'::text) AND (proname ~~* 'ri%foo'::text)) OR ((proname >= 'r'::text) AND (proname < 's'::text) AND (proname ~~* 'ri%foo'::text)))
          Filter: (proname ~~* 'ri%foo'::text)
-(5 rows)
+         ->  BitmapOr
+               ->  Bitmap Index Scan on pg_proc_proname_args_nsp_index
+                     Index Cond: ((proname >= 'R'::text) AND (proname < 'S'::text))
+               ->  Bitmap Index Scan on pg_proc_proname_args_nsp_index
+                     Index Cond: ((proname >= 'r'::text) AND (proname < 's'::text))
+(10 rows)
 
 reset enable_seqscan;
 reset enable_indexscan;
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
index d4f327636f..f13f9e1245 100644
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -988,14 +988,13 @@ SELECT relname, pg_get_indexdef(oid) FROM pg_class WHERE relname LIKE 'collate_t
 set enable_seqscan = off;
 explain (costs off)
 select * from collate_test1 where b ilike 'abc';
-          QUERY PLAN           
--------------------------------
- Seq Scan on collate_test1
-   Disabled: true
+                      QUERY PLAN                      
+------------------------------------------------------
+ Index Scan using collate_test1_idx3 on collate_test1
    Filter: (b ~~* 'abc'::text)
-(3 rows)
+(2 rows)
 
-select * from collate_test1 where b ilike 'abc';
+select * from collate_test1 where b ilike 'abc' order by 1;
  a |  b  
 ---+-----
  1 | abc
@@ -1004,14 +1003,13 @@ select * from collate_test1 where b ilike 'abc';
 
 explain (costs off)
 select * from collate_test1 where b ilike 'ABC';
-          QUERY PLAN           
--------------------------------
- Seq Scan on collate_test1
-   Disabled: true
+                      QUERY PLAN                      
+------------------------------------------------------
+ Index Scan using collate_test1_idx3 on collate_test1
    Filter: (b ~~* 'ABC'::text)
-(3 rows)
+(2 rows)
 
-select * from collate_test1 where b ilike 'ABC';
+select * from collate_test1 where b ilike 'ABC' order by 1;
  a |  b  
 ---+-----
  1 | abc
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
index 5ee2da4e0e..948201c3e1 100644
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -344,10 +344,10 @@ SELECT relname, pg_get_indexdef(oid) FROM pg_class WHERE relname LIKE 'collate_t
 set enable_seqscan = off;
 explain (costs off)
 select * from collate_test1 where b ilike 'abc';
-select * from collate_test1 where b ilike 'abc';
+select * from collate_test1 where b ilike 'abc' order by 1;
 explain (costs off)
 select * from collate_test1 where b ilike 'ABC';
-select * from collate_test1 where b ilike 'ABC';
+select * from collate_test1 where b ilike 'ABC' order by 1;
 reset enable_seqscan;
 
 
-- 
2.34.1