BUG #15285: Query used index over field with ICU collation in some cases wrongly return 0 rows
Hi,
I'm bumping this thread on pgsql-hacker, hopefully it will drag some more
opinions/discussions.
Should we try to fix this issue or not? This is clearly an upstream bug. It has
been reported, including regression tests, but this doesn't move since 2 years
now.
If we choose not to fix it on our side using eg a workaround (see patch), I
suppose this small bug should be documented somewhere so people are not lost
alone in the wild.
Opinions?
Regards,
Begin forwarded message:
Date: Sat, 13 Jun 2020 00:43:22 +0200
From: Jehan-Guillaume de Rorthais <jgdr@dalibo.com>
To: Thomas Munro <thomas.munro@gmail.com>, Peter Geoghegan <pg@bowt.ie>
Cc: Роман Литовченко <roman.lytovchenko@gmail.com>, PostgreSQL mailing lists
<pgsql-bugs@lists.postgresql.org> Subject: Re: BUG #15285: Query used index
over field with ICU collation in some cases wrongly return 0 rows
On Fri, 12 Jun 2020 18:40:55 +0200
Jehan-Guillaume de Rorthais <jgdr@dalibo.com> wrote:
On Wed, 10 Jun 2020 00:29:33 +0200
Jehan-Guillaume de Rorthais <jgdr@dalibo.com> wrote:
[...]After playing with ICU regression tests, I found functions ucol_strcollIter
and ucol_nextSortKeyPart are safe. I'll do some performance tests and report
here.I did some benchmarks. See attachment for the script and its header to
reproduce.It sorts 935895 french phrases from 0 to 122 chars with an average of 49.
Performance tests were done on current master HEAD (buggy) and using the patch
in attachment, relying on ucol_strcollIter.My preliminary test with ucol_getSortKey was catastrophic, as we might
expect. 15-17x slower than the current HEAD. So I removed it from actual
tests. I didn't try with ucol_nextSortKeyPart though.Using ucol_strcollIter performs ~20% slower than HEAD on UTF8 databases, but
this might be acceptable. Here are the numbers:DB Encoding HEAD strcollIter ratio
UTF8 2.74 3.27 1.19x
LATIN1 5.34 5.40 1.01xI plan to add a regression test soon.
Please, find in attachment the second version of the patch, with a
regression test.
Regards,
--
Jehan-Guillaume de Rorthais
Dalibo
Attachments:
v2-0001-Replace-buggy-ucol_strcoll-funcs-with-ucol_strcollIt.patchtext/x-patchDownload
From bb428135490caafe23562e3dcd9925d7d7c5a7be Mon Sep 17 00:00:00 2001
From: Jehan-Guillaume de Rorthais <jgdr@dalibo.com>
Date: Thu, 11 Jun 2020 18:08:31 +0200
Subject: [PATCH] Replace buggy ucol_strcoll* funcs with ucol_strcollIter
Functions ucol_strcoll and ucol_strcollUTF8 are breaking some collation rules.
This leads to wrong sort order or wrong result from index using such
collations. See bug report #15285 for details:
http://postgr.es/m/153201618542.1404.3611626898935613264%40wrigleys.postgresql.org
This fix replace ucol_strcoll* functions with ucol_strcollIter() which is not
affected by this bug.
Reported-by: Roman Lytovchenko
Analysed-by: Peter Geoghegan
Author: Jehan-Guillaume de Rorthais
---
src/backend/utils/adt/varlena.c | 54 ++++++++++++-------
src/include/utils/pg_locale.h | 14 -----
.../regress/expected/collate.icu.utf8.out | 12 +++++
src/test/regress/sql/collate.icu.utf8.sql | 8 +++
4 files changed, 56 insertions(+), 32 deletions(-)
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 2eaabd6231..f156c00a1a 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -1638,34 +1638,43 @@ varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
if (mylocale->provider == COLLPROVIDER_ICU)
{
#ifdef USE_ICU
-#ifdef HAVE_UCOL_STRCOLLUTF8
if (GetDatabaseEncoding() == PG_UTF8)
{
UErrorCode status;
+ UCharIterator iter1,
+ iter2;
status = U_ZERO_ERROR;
- result = ucol_strcollUTF8(mylocale->info.icu.ucol,
- arg1, len1,
- arg2, len2,
- &status);
+
+ uiter_setUTF8(&iter1, arg1, len1);
+ uiter_setUTF8(&iter2, arg2, len2);
+
+ result = ucol_strcollIter(mylocale->info.icu.ucol,
+ &iter1, &iter2, &status);
if (U_FAILURE(status))
ereport(ERROR,
(errmsg("collation failed: %s", u_errorName(status))));
}
else
-#endif
{
+ UErrorCode status;
+ UCharIterator iter1,
+ iter2;
int32_t ulen1,
ulen2;
UChar *uchar1,
*uchar2;
+ status = U_ZERO_ERROR;
+
ulen1 = icu_to_uchar(&uchar1, arg1, len1);
ulen2 = icu_to_uchar(&uchar2, arg2, len2);
- result = ucol_strcoll(mylocale->info.icu.ucol,
- uchar1, ulen1,
- uchar2, ulen2);
+ uiter_setString(&iter1, uchar1, ulen1);
+ uiter_setString(&iter2, uchar2, ulen2);
+
+ result = ucol_strcollIter(mylocale->info.icu.ucol,
+ &iter1, &iter2, &status);
pfree(uchar1);
pfree(uchar2);
@@ -2352,34 +2361,43 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
if (sss->locale->provider == COLLPROVIDER_ICU)
{
#ifdef USE_ICU
-#ifdef HAVE_UCOL_STRCOLLUTF8
if (GetDatabaseEncoding() == PG_UTF8)
{
UErrorCode status;
+ UCharIterator iter1,
+ iter2;
status = U_ZERO_ERROR;
- result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
- a1p, len1,
- a2p, len2,
- &status);
+
+ uiter_setUTF8(&iter1, a1p, len1);
+ uiter_setUTF8(&iter2, a2p, len2);
+
+ result = ucol_strcollIter(sss->locale->info.icu.ucol,
+ &iter1, &iter2, &status);
if (U_FAILURE(status))
ereport(ERROR,
(errmsg("collation failed: %s", u_errorName(status))));
}
else
-#endif
{
+ UErrorCode status;
+ UCharIterator iter1,
+ iter2;
int32_t ulen1,
ulen2;
UChar *uchar1,
*uchar2;
+ status = U_ZERO_ERROR;
+
ulen1 = icu_to_uchar(&uchar1, a1p, len1);
ulen2 = icu_to_uchar(&uchar2, a2p, len2);
- result = ucol_strcoll(sss->locale->info.icu.ucol,
- uchar1, ulen1,
- uchar2, ulen2);
+ uiter_setString(&iter1, uchar1, ulen1);
+ uiter_setString(&iter2, uchar2, ulen2);
+
+ result = ucol_strcollIter(sss->locale->info.icu.ucol,
+ &iter1, &iter2, &status);
pfree(uchar1);
pfree(uchar2);
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 9cb7d91ddf..2b3ec2c597 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -21,20 +21,6 @@
#include "utils/guc.h"
-#ifdef USE_ICU
-/*
- * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
- * (see
- * <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>)
- */
-#if U_ICU_VERSION_MAJOR_NUM >= 53
-#define HAVE_UCOL_STRCOLLUTF8 1
-#else
-#undef HAVE_UCOL_STRCOLLUTF8
-#endif
-#endif
-
-
/* GUC settings */
extern char *locale_messages;
extern char *locale_monetary;
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
index 2b86ce9028..06cdb948eb 100644
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -1900,6 +1900,18 @@ SELECT (SELECT count(*) FROM test33_0) <> (SELECT count(*) FROM test33_1);
t
(1 row)
+CREATE COLLATION digitslast (provider = icu, locale = '@colReorder=latn-digit');
+CREATE TABLE test34 (b CHAR(4) NOT NULL COLLATE digitslast);
+INSERT INTO test34 VALUES ('0000'), ('0001'), ('ABCD');
+CREATE INDEX ON test34(b);
+SET enable_seqscan TO off;
+SELECT * FROM test34 WHERE b = '0000' ;
+ b
+------
+ 0000
+(1 row)
+
+RESET enable_seqscan;
-- cleanup
RESET search_path;
SET client_min_messages TO warning;
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
index 67de7d9794..50d0be70a8 100644
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -722,6 +722,14 @@ INSERT INTO test33 VALUES (2, 'DEF');
SELECT (SELECT count(*) FROM test33_0) <> (SELECT count(*) FROM test33_1);
+CREATE COLLATION digitslast (provider = icu, locale = '@colReorder=latn-digit');
+CREATE TABLE test34 (b CHAR(4) NOT NULL COLLATE digitslast);
+INSERT INTO test34 VALUES ('0000'), ('0001'), ('ABCD');
+CREATE INDEX ON test34(b);
+SET enable_seqscan TO off;
+SELECT * FROM test34 WHERE b = '0000' ;
+RESET enable_seqscan;
+
-- cleanup
RESET search_path;
SET client_min_messages TO warning;
--
2.20.1