BUG #15285: Query used index over field with ICU collation in some cases wrongly return 0 rows

Started by Jehan-Guillaume de Rorthaisover 5 years ago1 messages

jgdr@dalibo.com

over 5 years ago

1 attachment(s)

Hi,

I'm bumping this thread on pgsql-hacker, hopefully it will drag some more
opinions/discussions.

Should we try to fix this issue or not? This is clearly an upstream bug. It has
been reported, including regression tests, but this doesn't move since 2 years
now.

If we choose not to fix it on our side using eg a workaround (see patch), I
suppose this small bug should be documented somewhere so people are not lost
alone in the wild.

Opinions?

Regards,

Begin forwarded message:

Date: Sat, 13 Jun 2020 00:43:22 +0200
From: Jehan-Guillaume de Rorthais <jgdr@dalibo.com>
To: Thomas Munro <thomas.munro@gmail.com>, Peter Geoghegan <pg@bowt.ie>
Cc: Роман Литовченко <roman.lytovchenko@gmail.com>, PostgreSQL mailing lists
<pgsql-bugs@lists.postgresql.org> Subject: Re: BUG #15285: Query used index
over field with ICU collation in some cases wrongly return 0 rows

On Fri, 12 Jun 2020 18:40:55 +0200
Jehan-Guillaume de Rorthais <jgdr@dalibo.com> wrote:

On Wed, 10 Jun 2020 00:29:33 +0200
Jehan-Guillaume de Rorthais <jgdr@dalibo.com> wrote:
[...]

After playing with ICU regression tests, I found functions ucol_strcollIter
and ucol_nextSortKeyPart are safe. I'll do some performance tests and report
here.

I did some benchmarks. See attachment for the script and its header to
reproduce.

It sorts 935895 french phrases from 0 to 122 chars with an average of 49.
Performance tests were done on current master HEAD (buggy) and using the patch
in attachment, relying on ucol_strcollIter.

My preliminary test with ucol_getSortKey was catastrophic, as we might
expect. 15-17x slower than the current HEAD. So I removed it from actual
tests. I didn't try with ucol_nextSortKeyPart though.

Using ucol_strcollIter performs ~20% slower than HEAD on UTF8 databases, but
this might be acceptable. Here are the numbers:

DB Encoding HEAD strcollIter ratio
UTF8 2.74 3.27 1.19x
LATIN1 5.34 5.40 1.01x

I plan to add a regression test soon.

Please, find in attachment the second version of the patch, with a
regression test.

Regards,

--
Jehan-Guillaume de Rorthais
Dalibo

Attachments:

v2-0001-Replace-buggy-ucol_strcoll-funcs-with-ucol_strcollIt.patchtext/x-patchDownload

From bb428135490caafe23562e3dcd9925d7d7c5a7be Mon Sep 17 00:00:00 2001
From: Jehan-Guillaume de Rorthais <jgdr@dalibo.com>
Date: Thu, 11 Jun 2020 18:08:31 +0200
Subject: [PATCH] Replace buggy ucol_strcoll* funcs with ucol_strcollIter

Functions ucol_strcoll and ucol_strcollUTF8 are breaking some collation rules.
This leads to wrong sort order or wrong result from index using such
collations. See bug report #15285 for details:
http://postgr.es/m/153201618542.1404.3611626898935613264%40wrigleys.postgresql.org

This fix replace ucol_strcoll* functions with ucol_strcollIter() which is not
affected by this bug.

Reported-by: Roman Lytovchenko
Analysed-by: Peter Geoghegan
Author: Jehan-Guillaume de Rorthais
---
 src/backend/utils/adt/varlena.c               | 54 ++++++++++++-------
 src/include/utils/pg_locale.h                 | 14 -----
 .../regress/expected/collate.icu.utf8.out     | 12 +++++
 src/test/regress/sql/collate.icu.utf8.sql     |  8 +++
 4 files changed, 56 insertions(+), 32 deletions(-)

diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 2eaabd6231..f156c00a1a 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -1638,34 +1638,43 @@ varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
 			if (mylocale->provider == COLLPROVIDER_ICU)
 			{
 #ifdef USE_ICU
-#ifdef HAVE_UCOL_STRCOLLUTF8
 				if (GetDatabaseEncoding() == PG_UTF8)
 				{
 					UErrorCode	status;
+					UCharIterator	iter1,
+									iter2;
 
 					status = U_ZERO_ERROR;
-					result = ucol_strcollUTF8(mylocale->info.icu.ucol,
-											  arg1, len1,
-											  arg2, len2,
-											  &status);
+
+					uiter_setUTF8(&iter1, arg1, len1);
+					uiter_setUTF8(&iter2, arg2, len2);
+
+					result = ucol_strcollIter(mylocale->info.icu.ucol,
+											  &iter1, &iter2, &status);
 					if (U_FAILURE(status))
 						ereport(ERROR,
 								(errmsg("collation failed: %s", u_errorName(status))));
 				}
 				else
-#endif
 				{
+					UErrorCode	status;
+					UCharIterator	iter1,
+									iter2;
 					int32_t		ulen1,
 								ulen2;
 					UChar	   *uchar1,
 							   *uchar2;
 
+					status = U_ZERO_ERROR;
+
 					ulen1 = icu_to_uchar(&uchar1, arg1, len1);
 					ulen2 = icu_to_uchar(&uchar2, arg2, len2);
 
-					result = ucol_strcoll(mylocale->info.icu.ucol,
-										  uchar1, ulen1,
-										  uchar2, ulen2);
+					uiter_setString(&iter1, uchar1, ulen1);
+					uiter_setString(&iter2, uchar2, ulen2);
+
+					result = ucol_strcollIter(mylocale->info.icu.ucol,
+											  &iter1, &iter2, &status);
 
 					pfree(uchar1);
 					pfree(uchar2);
@@ -2352,34 +2361,43 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
 		if (sss->locale->provider == COLLPROVIDER_ICU)
 		{
 #ifdef USE_ICU
-#ifdef HAVE_UCOL_STRCOLLUTF8
 			if (GetDatabaseEncoding() == PG_UTF8)
 			{
 				UErrorCode	status;
+				UCharIterator	iter1,
+								iter2;
 
 				status = U_ZERO_ERROR;
-				result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
-										  a1p, len1,
-										  a2p, len2,
-										  &status);
+
+				uiter_setUTF8(&iter1, a1p, len1);
+				uiter_setUTF8(&iter2, a2p, len2);
+
+				result = ucol_strcollIter(sss->locale->info.icu.ucol,
+										  &iter1, &iter2, &status);
 				if (U_FAILURE(status))
 					ereport(ERROR,
 							(errmsg("collation failed: %s", u_errorName(status))));
 			}
 			else
-#endif
 			{
+				UErrorCode	status;
+				UCharIterator	iter1,
+								iter2;
 				int32_t		ulen1,
 							ulen2;
 				UChar	   *uchar1,
 						   *uchar2;
 
+				status = U_ZERO_ERROR;
+
 				ulen1 = icu_to_uchar(&uchar1, a1p, len1);
 				ulen2 = icu_to_uchar(&uchar2, a2p, len2);
 
-				result = ucol_strcoll(sss->locale->info.icu.ucol,
-									  uchar1, ulen1,
-									  uchar2, ulen2);
+				uiter_setString(&iter1, uchar1, ulen1);
+				uiter_setString(&iter2, uchar2, ulen2);
+
+				result = ucol_strcollIter(sss->locale->info.icu.ucol,
+										  &iter1, &iter2, &status);
 
 				pfree(uchar1);
 				pfree(uchar2);
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 9cb7d91ddf..2b3ec2c597 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -21,20 +21,6 @@
 
 #include "utils/guc.h"
 
-#ifdef USE_ICU
-/*
- * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
- * (see
- * <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>)
- */
-#if U_ICU_VERSION_MAJOR_NUM >= 53
-#define HAVE_UCOL_STRCOLLUTF8 1
-#else
-#undef HAVE_UCOL_STRCOLLUTF8
-#endif
-#endif
-
-
 /* GUC settings */
 extern char *locale_messages;
 extern char *locale_monetary;
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
index 2b86ce9028..06cdb948eb 100644
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -1900,6 +1900,18 @@ SELECT (SELECT count(*) FROM test33_0) <> (SELECT count(*) FROM test33_1);
  t
 (1 row)
 
+CREATE COLLATION digitslast (provider = icu, locale = '@colReorder=latn-digit');
+CREATE TABLE test34 (b CHAR(4) NOT NULL COLLATE digitslast);
+INSERT INTO test34 VALUES ('0000'), ('0001'), ('ABCD');
+CREATE INDEX ON test34(b);
+SET enable_seqscan TO off;
+SELECT * FROM test34 WHERE b = '0000' ;
+  b   
+------
+ 0000
+(1 row)
+
+RESET enable_seqscan;
 -- cleanup
 RESET search_path;
 SET client_min_messages TO warning;
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
index 67de7d9794..50d0be70a8 100644
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -722,6 +722,14 @@ INSERT INTO test33 VALUES (2, 'DEF');
 SELECT (SELECT count(*) FROM test33_0) <> (SELECT count(*) FROM test33_1);
 
 
+CREATE COLLATION digitslast (provider = icu, locale = '@colReorder=latn-digit');
+CREATE TABLE test34 (b CHAR(4) NOT NULL COLLATE digitslast);
+INSERT INTO test34 VALUES ('0000'), ('0001'), ('ABCD');
+CREATE INDEX ON test34(b);
+SET enable_seqscan TO off;
+SELECT * FROM test34 WHERE b = '0000' ;
+RESET enable_seqscan;
+
 -- cleanup
 RESET search_path;
 SET client_min_messages TO warning;
-- 
2.20.1