update substring pattern matching syntax
At
<https://wiki.postgresql.org/wiki/PostgreSQL_vs_SQL_Standard#Obsolete_syntax_for_substring.28.29>
it is described that the substring pattern matching syntax in PostgreSQL
does not conform to the current standard. PostgreSQL implements
SUBSTRING(text FROM pattern FOR escapechar)
whereas the current standard says
SUBSTRING(text SIMILAR pattern ESCAPE escapechar)
The former was in SQL99, but the latter has been there since SQL:2003.
It's pretty easy to implement the second form also, so here is a patch
that does that.
--
Peter Eisentraut http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Attachments:
0001-Clean-up-grammar-a-bit.patchtext/plain; charset=UTF-8; name=0001-Clean-up-grammar-a-bit.patch; x-mac-creator=0; x-mac-type=0Download
From e6ab38e476d65f592d8542aac68ab6a23d007668 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <peter@eisentraut.org>
Date: Fri, 19 Jun 2020 11:14:10 +0200
Subject: [PATCH 1/2] Clean up grammar a bit
Simplify the grammar specification of substring() and overlay() a bit,
simplify and update some comments.
---
src/backend/parser/gram.y | 73 ++++++++++++---------------------------
1 file changed, 23 insertions(+), 50 deletions(-)
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index e669d75a5a..1a843049f0 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -452,7 +452,6 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
%type <list> extract_list overlay_list position_list
%type <list> substr_list trim_list
%type <list> opt_interval interval_second
-%type <node> overlay_placing substr_from substr_for
%type <str> unicode_normal_form
%type <boolean> opt_instead
@@ -13797,11 +13796,6 @@ func_expr_common_subexpr:
}
| OVERLAY '(' overlay_list ')'
{
- /* overlay(A PLACING B FROM C FOR D) is converted to
- * overlay(A, B, C, D)
- * overlay(A PLACING B FROM C) is converted to
- * overlay(A, B, C)
- */
$$ = (Node *) makeFuncCall(SystemFuncName("overlay"), $3, @1);
}
| POSITION '(' position_list ')'
@@ -14437,63 +14431,45 @@ unicode_normal_form:
| NFKD { $$ = "nfkd"; }
;
-/* OVERLAY() arguments
- * SQL99 defines the OVERLAY() function:
- * o overlay(text placing text from int for int)
- * o overlay(text placing text from int)
- * and similarly for binary strings
- */
+/* OVERLAY() arguments */
overlay_list:
- a_expr overlay_placing substr_from substr_for
+ a_expr PLACING a_expr FROM a_expr FOR a_expr
{
- $$ = list_make4($1, $2, $3, $4);
+ /* overlay(A PLACING B FROM C FOR D) is converted to overlay(A, B, C, D) */
+ $$ = list_make4($1, $3, $5, $7);
}
- | a_expr overlay_placing substr_from
+ | a_expr PLACING a_expr FROM a_expr
{
- $$ = list_make3($1, $2, $3);
+ /* overlay(A PLACING B FROM C) is converted to overlay(A, B, C) */
+ $$ = list_make3($1, $3, $5);
}
;
-overlay_placing:
- PLACING a_expr
- { $$ = $2; }
- ;
-
/* position_list uses b_expr not a_expr to avoid conflict with general IN */
-
position_list:
b_expr IN_P b_expr { $$ = list_make2($3, $1); }
| /*EMPTY*/ { $$ = NIL; }
;
-/* SUBSTRING() arguments
- * SQL9x defines a specific syntax for arguments to SUBSTRING():
- * o substring(text from int for int)
- * o substring(text from int) get entire string from starting point "int"
- * o substring(text for int) get first "int" characters of string
- * o substring(text from pattern) get entire string matching pattern
- * o substring(text from pattern for escape) same with specified escape char
- * We also want to support generic substring functions which accept
- * the usual generic list of arguments. So we will accept both styles
- * here, and convert the SQL9x style to the generic list for further
- * processing. - thomas 2000-11-28
- */
+/* SUBSTRING() arguments */
substr_list:
- a_expr substr_from substr_for
+ a_expr FROM a_expr FOR a_expr
{
- $$ = list_make3($1, $2, $3);
+ $$ = list_make3($1, $3, $5);
}
- | a_expr substr_for substr_from
+ | a_expr FOR a_expr FROM a_expr
{
- /* not legal per SQL99, but might as well allow it */
- $$ = list_make3($1, $3, $2);
+ /* not legal per SQL, but might as well allow it */
+ $$ = list_make3($1, $5, $3);
}
- | a_expr substr_from
+ | a_expr FROM a_expr
{
- $$ = list_make2($1, $2);
+ $$ = list_make2($1, $3);
}
- | a_expr substr_for
+ | a_expr FOR a_expr
{
+ /* not legal per SQL */
+
/*
* Since there are no cases where this syntax allows
* a textual FOR value, we forcibly cast the argument
@@ -14504,9 +14480,13 @@ substr_list:
* is unknown or doesn't have an implicit cast to int4.
*/
$$ = list_make3($1, makeIntConst(1, -1),
- makeTypeCast($2,
+ makeTypeCast($3,
SystemTypeName("int4"), -1));
}
+ /*
+ * We also want to support generic substring functions that
+ * accept the usual generic list of arguments.
+ */
| expr_list
{
$$ = $1;
@@ -14515,13 +14495,6 @@ substr_list:
{ $$ = NIL; }
;
-substr_from:
- FROM a_expr { $$ = $2; }
- ;
-
-substr_for: FOR a_expr { $$ = $2; }
- ;
-
trim_list: a_expr FROM expr_list { $$ = lappend($3, $1); }
| FROM expr_list { $$ = $2; }
| expr_list { $$ = $1; }
--
2.27.0
0002-Add-current-substring-regular-expression-syntax.patchtext/plain; charset=UTF-8; name=0002-Add-current-substring-regular-expression-syntax.patch; x-mac-creator=0; x-mac-type=0Download
From 32b6d132232d40308c47faad111e75d5c03f6b63 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <peter@eisentraut.org>
Date: Fri, 19 Jun 2020 11:14:10 +0200
Subject: [PATCH 2/2] Add current substring regular expression syntax
SQL99 had syntax
SUBSTRING(text FROM pattern FOR escapechar)
but this was replaced in SQL:2003 by the more clear
SUBSTRING(text SIMILAR pattern ESCAPE escapechar)
but this was never implemented in PostgreSQL. This patch adds that
new syntax as an alternative in the parser, and updates documentation
and tests to indicate that this is the preferred alternative now.
---
doc/src/sgml/func.sgml | 20 ++++++++++++-----
src/backend/parser/gram.y | 26 +++++++++++++++++++++-
src/test/regress/expected/strings.out | 31 ++++++++++++++++-----------
src/test/regress/sql/strings.sql | 26 +++++++++++-----------
4 files changed, 73 insertions(+), 30 deletions(-)
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 9d71678029..eedf189546 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -2669,15 +2669,21 @@ <title><acronym>SQL</acronym> String Functions and Operators</title>
<row>
<entry role="func_table_entry"><para role="func_signature">
+ <function>substring</function> ( <parameter>string</parameter> <type>text</type> <literal>SIMILAR</literal> <parameter>pattern</parameter> <type>text</type> <literal>ESCAPE</literal> <parameter>escape</parameter> <type>text</type> )
+ <returnvalue>text</returnvalue>
+ </para>
+ <para role="func_signature">
<function>substring</function> ( <parameter>string</parameter> <type>text</type> <literal>FROM</literal> <parameter>pattern</parameter> <type>text</type> <literal>FOR</literal> <parameter>escape</parameter> <type>text</type> )
<returnvalue>text</returnvalue>
</para>
<para>
Extracts substring matching <acronym>SQL</acronym> regular expression;
- see <xref linkend="functions-similarto-regexp"/>.
+ see <xref linkend="functions-similarto-regexp"/>. The first form has
+ specified since SQL:2003; the second form was only in SQL:1999 and
+ should be considered obsolete.
</para>
<para>
- <literal>substring('Thomas' from '%#"o_a#"_' for '#')</literal>
+ <literal>substring('Thomas' similar '%#"o_a#"_' escape '#')</literal>
<returnvalue>oma</returnvalue>
</para></entry>
</row>
@@ -5160,7 +5166,11 @@ <title><function>SIMILAR TO</function> Regular Expressions</title>
The <function>substring</function> function with three parameters
provides extraction of a substring that matches an SQL
regular expression pattern. The function can be written according
- to SQL99 syntax:
+ to standard SQL syntax:
+<synopsis>
+substring(<replaceable>string</replaceable> similar <replaceable>pattern</replaceable> escape <replaceable>escape-character</replaceable>)
+</synopsis>
+ or using the now obsolete SQL99 syntax:
<synopsis>
substring(<replaceable>string</replaceable> from <replaceable>pattern</replaceable> for <replaceable>escape-character</replaceable>)
</synopsis>
@@ -5201,8 +5211,8 @@ <title><function>SIMILAR TO</function> Regular Expressions</title>
<para>
Some examples, with <literal>#"</literal> delimiting the return string:
<programlisting>
-substring('foobar' from '%#"o_b#"%' for '#') <lineannotation>oob</lineannotation>
-substring('foobar' from '#"o_b#"%' for '#') <lineannotation>NULL</lineannotation>
+substring('foobar' similar '%#"o_b#"%' escape '#') <lineannotation>oob</lineannotation>
+substring('foobar' similar '#"o_b#"%' escape '#') <lineannotation>NULL</lineannotation>
</programlisting>
</para>
</sect2>
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index 1a843049f0..5f21039b11 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -14451,7 +14451,27 @@ position_list:
| /*EMPTY*/ { $$ = NIL; }
;
-/* SUBSTRING() arguments */
+/*
+ * SUBSTRING() arguments
+ *
+ * Note that SQL99 has both
+ *
+ * text FROM int FOR int
+ *
+ * and
+ *
+ * text FROM pattern FOR escape
+ *
+ * In the parser we map them both to a call to the substring() function and
+ * rely on type resolution to pick the right one.
+ *
+ * In SQL:2003, the second variant was changed to
+ *
+ * text SIMILAR pattern ESCAPE escape
+ *
+ * We could in theory map that to a different function internally, but
+ * since we still support the SQL99 version, we don't.
+ */
substr_list:
a_expr FROM a_expr FOR a_expr
{
@@ -14483,6 +14503,10 @@ substr_list:
makeTypeCast($3,
SystemTypeName("int4"), -1));
}
+ | a_expr SIMILAR a_expr ESCAPE a_expr
+ {
+ $$ = list_make3($1, $3, $5);
+ }
/*
* We also want to support generic substring functions that
* accept the usual generic list of arguments.
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index 6e98d183f6..8c034c9599 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -397,6 +397,13 @@ SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456";
(1 row)
-- T581 regular expression substring (with SQL's bizarre regexp syntax)
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"(b_d)#"%' ESCAPE '#') AS "bcd";
+ bcd
+-----
+ bcd
+(1 row)
+
+-- obsolete SQL99 syntax
SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd";
bcd
-----
@@ -404,75 +411,75 @@ SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd";
(1 row)
-- No match should return NULL
-SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True";
+SELECT SUBSTRING('abcdefg' SIMILAR '#"(b_d)#"%' ESCAPE '#') IS NULL AS "True";
True
------
t
(1 row)
-- Null inputs should return NULL
-SELECT SUBSTRING('abcdefg' FROM '%' FOR NULL) IS NULL AS "True";
+SELECT SUBSTRING('abcdefg' SIMILAR '%' ESCAPE NULL) IS NULL AS "True";
True
------
t
(1 row)
-SELECT SUBSTRING(NULL FROM '%' FOR '#') IS NULL AS "True";
+SELECT SUBSTRING(NULL SIMILAR '%' ESCAPE '#') IS NULL AS "True";
True
------
t
(1 row)
-SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True";
+SELECT SUBSTRING('abcdefg' SIMILAR NULL ESCAPE '#') IS NULL AS "True";
True
------
t
(1 row)
-- The first and last parts should act non-greedy
-SELECT SUBSTRING('abcdefg' FROM 'a#"%#"g' FOR '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%#"g' ESCAPE '#') AS "bcdef";
bcdef
-------
bcdef
(1 row)
-SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*' FOR '#') AS "abcdefg";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a*#"%#"g*' ESCAPE '#') AS "abcdefg";
abcdefg
---------
abcdefg
(1 row)
-- Vertical bar in any part affects only that part
-SELECT SUBSTRING('abcdefg' FROM 'a|b#"%#"g' FOR '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a|b#"%#"g' ESCAPE '#') AS "bcdef";
bcdef
-------
bcdef
(1 row)
-SELECT SUBSTRING('abcdefg' FROM 'a#"%#"x|g' FOR '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%#"x|g' ESCAPE '#') AS "bcdef";
bcdef
-------
bcdef
(1 row)
-SELECT SUBSTRING('abcdefg' FROM 'a#"%|ab#"g' FOR '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%|ab#"g' ESCAPE '#') AS "bcdef";
bcdef
-------
bcdef
(1 row)
-- Can't have more than two part separators
-SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*#"x' FOR '#') AS "error";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a*#"%#"g*#"x' ESCAPE '#') AS "error";
ERROR: SQL regular expression may not contain more than two escape-double-quote separators
CONTEXT: SQL function "substring" statement 1
-- Postgres extension: with 0 or 1 separator, assume parts 1 and 3 are empty
-SELECT SUBSTRING('abcdefg' FROM 'a#"%g' FOR '#') AS "bcdefg";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%g' ESCAPE '#') AS "bcdefg";
bcdefg
--------
bcdefg
(1 row)
-SELECT SUBSTRING('abcdefg' FROM 'a%g' FOR '#') AS "abcdefg";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a%g' ESCAPE '#') AS "abcdefg";
abcdefg
---------
abcdefg
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index 3e89159a4f..14901a2692 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -132,31 +132,33 @@
SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456";
-- T581 regular expression substring (with SQL's bizarre regexp syntax)
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"(b_d)#"%' ESCAPE '#') AS "bcd";
+-- obsolete SQL99 syntax
SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd";
-- No match should return NULL
-SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True";
+SELECT SUBSTRING('abcdefg' SIMILAR '#"(b_d)#"%' ESCAPE '#') IS NULL AS "True";
-- Null inputs should return NULL
-SELECT SUBSTRING('abcdefg' FROM '%' FOR NULL) IS NULL AS "True";
-SELECT SUBSTRING(NULL FROM '%' FOR '#') IS NULL AS "True";
-SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True";
+SELECT SUBSTRING('abcdefg' SIMILAR '%' ESCAPE NULL) IS NULL AS "True";
+SELECT SUBSTRING(NULL SIMILAR '%' ESCAPE '#') IS NULL AS "True";
+SELECT SUBSTRING('abcdefg' SIMILAR NULL ESCAPE '#') IS NULL AS "True";
-- The first and last parts should act non-greedy
-SELECT SUBSTRING('abcdefg' FROM 'a#"%#"g' FOR '#') AS "bcdef";
-SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*' FOR '#') AS "abcdefg";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%#"g' ESCAPE '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a*#"%#"g*' ESCAPE '#') AS "abcdefg";
-- Vertical bar in any part affects only that part
-SELECT SUBSTRING('abcdefg' FROM 'a|b#"%#"g' FOR '#') AS "bcdef";
-SELECT SUBSTRING('abcdefg' FROM 'a#"%#"x|g' FOR '#') AS "bcdef";
-SELECT SUBSTRING('abcdefg' FROM 'a#"%|ab#"g' FOR '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a|b#"%#"g' ESCAPE '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%#"x|g' ESCAPE '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%|ab#"g' ESCAPE '#') AS "bcdef";
-- Can't have more than two part separators
-SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*#"x' FOR '#') AS "error";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a*#"%#"g*#"x' ESCAPE '#') AS "error";
-- Postgres extension: with 0 or 1 separator, assume parts 1 and 3 are empty
-SELECT SUBSTRING('abcdefg' FROM 'a#"%g' FOR '#') AS "bcdefg";
-SELECT SUBSTRING('abcdefg' FROM 'a%g' FOR '#') AS "abcdefg";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%g' ESCAPE '#') AS "bcdefg";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a%g' ESCAPE '#') AS "abcdefg";
-- substring() with just two arguments is not allowed by SQL spec;
-- we accept it, but we interpret the pattern as a POSIX regexp not SQL
--
2.27.0
pá 19. 6. 2020 v 11:42 odesílatel Peter Eisentraut <
peter.eisentraut@2ndquadrant.com> napsal:
At
<
https://wiki.postgresql.org/wiki/PostgreSQL_vs_SQL_Standard#Obsolete_syntax_for_substring.28.29>it is described that the substring pattern matching syntax in PostgreSQL
does not conform to the current standard. PostgreSQL implementsSUBSTRING(text FROM pattern FOR escapechar)
whereas the current standard says
SUBSTRING(text SIMILAR pattern ESCAPE escapechar)
The former was in SQL99, but the latter has been there since SQL:2003.
It's pretty easy to implement the second form also, so here is a patch
that does that.
+1
Pavel
Show quoted text
--
Peter Eisentraut http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
On 6/19/20 11:42 AM, Peter Eisentraut wrote:
At
<https://wiki.postgresql.org/wiki/PostgreSQL_vs_SQL_Standard#Obsolete_syntax_for_substring.28.29>
it is described that the substring pattern matching syntax in PostgreSQL
does not conform to the current standard. PostgreSQL implementsSUBSTRING(text FROM pattern FOR escapechar)
whereas the current standard says
SUBSTRING(text SIMILAR pattern ESCAPE escapechar)
The former was in SQL99, but the latter has been there since SQL:2003.
It's pretty easy to implement the second form also, so here is a patch
that does that.
Oh good, this was on my list (I added that item to the wiki).
The patches look straightforward to me. The grammar cleanup patch makes
things easier to read indeed. At first I didn't see a test left over
for the old syntax, but it's there so this is all LGTM.
Thanks for doing this!
--
Vik Fearing
Hello Peter,
whereas the current standard says
SUBSTRING(text SIMILAR pattern ESCAPE escapechar)
The former was in SQL99, but the latter has been there since SQL:2003.
It's pretty easy to implement the second form also, so here is a patch that
does that.
Patches apply cleanly, compile and "make check" is ok. doc gen is ok as
well.
Grammar cleanup is a definite improvement as it makes the grammar closer
to the actual syntax.
I cannot say I'm a fan of this kind of keywords added for some arguments.
I guess that it allows distinguishing between variants. I do not have the
standard at hand: I wanted to check whether these keywords could be
reordered, i.e. whether SUBSTRING(text ESCAPE ec SIMILAR part) was legal.
I guess not.
Maybe the doc could advertise more systematically whether a features
conforms fully or partially to some SQL standards, or is pg specific. The
added documentation refers both to SQL:1999 and SQL99. I'd suggest to
chose one, possibly the former, and use it everywhere consistently.
It seems that two instances where not updated to the new syntax, see in
./src/backend/catalog/information_schema.sql and
./contrib/citext/sql/citext.sql.
--
Fabien.
On 2020-06-20 09:08, Fabien COELHO wrote:
I cannot say I'm a fan of this kind of keywords added for some arguments.
I guess that it allows distinguishing between variants. I do not have the
standard at hand: I wanted to check whether these keywords could be
reordered, i.e. whether SUBSTRING(text ESCAPE ec SIMILAR part) was legal.
I guess not.
It is not.
Maybe the doc could advertise more systematically whether a features
conforms fully or partially to some SQL standards, or is pg specific.
I think that would be useful, but it's probably a broader topic than
just for this specific function.
The
added documentation refers both to SQL:1999 and SQL99. I'd suggest to
chose one, possibly the former, and use it everywhere consistently.
fixed
It seems that two instances where not updated to the new syntax, see in
./src/backend/catalog/information_schema.sql and
./contrib/citext/sql/citext.sql.
done
--
Peter Eisentraut http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Attachments:
v2-0001-Clean-up-grammar-a-bit.patchtext/plain; charset=UTF-8; name=v2-0001-Clean-up-grammar-a-bit.patch; x-mac-creator=0; x-mac-type=0Download
From 38a4f5bb3bc74ef033dd141031cbc89450e3bd06 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <peter@eisentraut.org>
Date: Fri, 19 Jun 2020 11:14:10 +0200
Subject: [PATCH v2 1/2] Clean up grammar a bit
Simplify the grammar specification of substring() and overlay() a bit,
simplify and update some comments.
Discussion: https://www.postgresql.org/message-id/flat/a15db31c-d0f8-8ce0-9039-578a31758adb%402ndquadrant.com
---
src/backend/parser/gram.y | 73 ++++++++++++---------------------------
1 file changed, 23 insertions(+), 50 deletions(-)
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index e669d75a5a..1a843049f0 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -452,7 +452,6 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
%type <list> extract_list overlay_list position_list
%type <list> substr_list trim_list
%type <list> opt_interval interval_second
-%type <node> overlay_placing substr_from substr_for
%type <str> unicode_normal_form
%type <boolean> opt_instead
@@ -13797,11 +13796,6 @@ func_expr_common_subexpr:
}
| OVERLAY '(' overlay_list ')'
{
- /* overlay(A PLACING B FROM C FOR D) is converted to
- * overlay(A, B, C, D)
- * overlay(A PLACING B FROM C) is converted to
- * overlay(A, B, C)
- */
$$ = (Node *) makeFuncCall(SystemFuncName("overlay"), $3, @1);
}
| POSITION '(' position_list ')'
@@ -14437,63 +14431,45 @@ unicode_normal_form:
| NFKD { $$ = "nfkd"; }
;
-/* OVERLAY() arguments
- * SQL99 defines the OVERLAY() function:
- * o overlay(text placing text from int for int)
- * o overlay(text placing text from int)
- * and similarly for binary strings
- */
+/* OVERLAY() arguments */
overlay_list:
- a_expr overlay_placing substr_from substr_for
+ a_expr PLACING a_expr FROM a_expr FOR a_expr
{
- $$ = list_make4($1, $2, $3, $4);
+ /* overlay(A PLACING B FROM C FOR D) is converted to overlay(A, B, C, D) */
+ $$ = list_make4($1, $3, $5, $7);
}
- | a_expr overlay_placing substr_from
+ | a_expr PLACING a_expr FROM a_expr
{
- $$ = list_make3($1, $2, $3);
+ /* overlay(A PLACING B FROM C) is converted to overlay(A, B, C) */
+ $$ = list_make3($1, $3, $5);
}
;
-overlay_placing:
- PLACING a_expr
- { $$ = $2; }
- ;
-
/* position_list uses b_expr not a_expr to avoid conflict with general IN */
-
position_list:
b_expr IN_P b_expr { $$ = list_make2($3, $1); }
| /*EMPTY*/ { $$ = NIL; }
;
-/* SUBSTRING() arguments
- * SQL9x defines a specific syntax for arguments to SUBSTRING():
- * o substring(text from int for int)
- * o substring(text from int) get entire string from starting point "int"
- * o substring(text for int) get first "int" characters of string
- * o substring(text from pattern) get entire string matching pattern
- * o substring(text from pattern for escape) same with specified escape char
- * We also want to support generic substring functions which accept
- * the usual generic list of arguments. So we will accept both styles
- * here, and convert the SQL9x style to the generic list for further
- * processing. - thomas 2000-11-28
- */
+/* SUBSTRING() arguments */
substr_list:
- a_expr substr_from substr_for
+ a_expr FROM a_expr FOR a_expr
{
- $$ = list_make3($1, $2, $3);
+ $$ = list_make3($1, $3, $5);
}
- | a_expr substr_for substr_from
+ | a_expr FOR a_expr FROM a_expr
{
- /* not legal per SQL99, but might as well allow it */
- $$ = list_make3($1, $3, $2);
+ /* not legal per SQL, but might as well allow it */
+ $$ = list_make3($1, $5, $3);
}
- | a_expr substr_from
+ | a_expr FROM a_expr
{
- $$ = list_make2($1, $2);
+ $$ = list_make2($1, $3);
}
- | a_expr substr_for
+ | a_expr FOR a_expr
{
+ /* not legal per SQL */
+
/*
* Since there are no cases where this syntax allows
* a textual FOR value, we forcibly cast the argument
@@ -14504,9 +14480,13 @@ substr_list:
* is unknown or doesn't have an implicit cast to int4.
*/
$$ = list_make3($1, makeIntConst(1, -1),
- makeTypeCast($2,
+ makeTypeCast($3,
SystemTypeName("int4"), -1));
}
+ /*
+ * We also want to support generic substring functions that
+ * accept the usual generic list of arguments.
+ */
| expr_list
{
$$ = $1;
@@ -14515,13 +14495,6 @@ substr_list:
{ $$ = NIL; }
;
-substr_from:
- FROM a_expr { $$ = $2; }
- ;
-
-substr_for: FOR a_expr { $$ = $2; }
- ;
-
trim_list: a_expr FROM expr_list { $$ = lappend($3, $1); }
| FROM expr_list { $$ = $2; }
| expr_list { $$ = $1; }
--
2.27.0
v2-0002-Add-current-substring-regular-expression-syntax.patchtext/plain; charset=UTF-8; name=v2-0002-Add-current-substring-regular-expression-syntax.patch; x-mac-creator=0; x-mac-type=0Download
From 50aa82f006d9f22a724c94ebdedcc711fbb600af Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <peter@eisentraut.org>
Date: Sat, 27 Jun 2020 11:05:10 +0200
Subject: [PATCH v2 2/2] Add current substring regular expression syntax
SQL:1999 had syntax
SUBSTRING(text FROM pattern FOR escapechar)
but this was replaced in SQL:2003 by the more clear
SUBSTRING(text SIMILAR pattern ESCAPE escapechar)
but this was never implemented in PostgreSQL. This patch adds that
new syntax as an alternative in the parser, and updates documentation
and tests to indicate that this is the preferred alternative now.
Discussion: https://www.postgresql.org/message-id/flat/a15db31c-d0f8-8ce0-9039-578a31758adb%402ndquadrant.com
---
contrib/citext/expected/citext.out | 2 +-
contrib/citext/expected/citext_1.out | 2 +-
contrib/citext/sql/citext.sql | 2 +-
doc/src/sgml/func.sgml | 20 ++++++++++----
src/backend/catalog/information_schema.sql | 2 +-
src/backend/parser/gram.y | 26 +++++++++++++++++-
src/test/regress/expected/strings.out | 31 +++++++++++++---------
src/test/regress/sql/strings.sql | 26 +++++++++---------
8 files changed, 77 insertions(+), 34 deletions(-)
diff --git a/contrib/citext/expected/citext.out b/contrib/citext/expected/citext.out
index 96800be9c0..ec99aaed5d 100644
--- a/contrib/citext/expected/citext.out
+++ b/contrib/citext/expected/citext.out
@@ -1602,7 +1602,7 @@ SELECT substring('Thomas'::citext from '...$') = 'mas' AS t;
t
(1 row)
-SELECT substring('Thomas'::citext from '%#"o_a#"_' for '#') = 'oma' AS t;
+SELECT substring('Thomas'::citext similar '%#"o_a#"_' escape '#') = 'oma' AS t;
t
---
t
diff --git a/contrib/citext/expected/citext_1.out b/contrib/citext/expected/citext_1.out
index 33e3676d3c..75fd08b7cc 100644
--- a/contrib/citext/expected/citext_1.out
+++ b/contrib/citext/expected/citext_1.out
@@ -1602,7 +1602,7 @@ SELECT substring('Thomas'::citext from '...$') = 'mas' AS t;
t
(1 row)
-SELECT substring('Thomas'::citext from '%#"o_a#"_' for '#') = 'oma' AS t;
+SELECT substring('Thomas'::citext similar '%#"o_a#"_' escape '#') = 'oma' AS t;
t
---
t
diff --git a/contrib/citext/sql/citext.sql b/contrib/citext/sql/citext.sql
index 261b73cfa6..10232f5a9f 100644
--- a/contrib/citext/sql/citext.sql
+++ b/contrib/citext/sql/citext.sql
@@ -564,7 +564,7 @@ CREATE TABLE caster (
SELECT substring('Thomas'::citext from 2 for 3) = 'hom' AS t;
SELECT substring('Thomas'::citext from 2) = 'homas' AS t;
SELECT substring('Thomas'::citext from '...$') = 'mas' AS t;
-SELECT substring('Thomas'::citext from '%#"o_a#"_' for '#') = 'oma' AS t;
+SELECT substring('Thomas'::citext similar '%#"o_a#"_' escape '#') = 'oma' AS t;
SELECT trim(' trim '::citext) = 'trim' AS t;
SELECT trim('xxxxxtrimxxxx'::citext, 'x'::citext) = 'trim' AS t;
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 7119f0b2ca..af4179b311 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -2669,15 +2669,21 @@ <title><acronym>SQL</acronym> String Functions and Operators</title>
<row>
<entry role="func_table_entry"><para role="func_signature">
+ <function>substring</function> ( <parameter>string</parameter> <type>text</type> <literal>SIMILAR</literal> <parameter>pattern</parameter> <type>text</type> <literal>ESCAPE</literal> <parameter>escape</parameter> <type>text</type> )
+ <returnvalue>text</returnvalue>
+ </para>
+ <para role="func_signature">
<function>substring</function> ( <parameter>string</parameter> <type>text</type> <literal>FROM</literal> <parameter>pattern</parameter> <type>text</type> <literal>FOR</literal> <parameter>escape</parameter> <type>text</type> )
<returnvalue>text</returnvalue>
</para>
<para>
Extracts substring matching <acronym>SQL</acronym> regular expression;
- see <xref linkend="functions-similarto-regexp"/>.
+ see <xref linkend="functions-similarto-regexp"/>. The first form has
+ specified since SQL:2003; the second form was only in SQL:1999 and
+ should be considered obsolete.
</para>
<para>
- <literal>substring('Thomas' from '%#"o_a#"_' for '#')</literal>
+ <literal>substring('Thomas' similar '%#"o_a#"_' escape '#')</literal>
<returnvalue>oma</returnvalue>
</para></entry>
</row>
@@ -5160,7 +5166,11 @@ <title><function>SIMILAR TO</function> Regular Expressions</title>
The <function>substring</function> function with three parameters
provides extraction of a substring that matches an SQL
regular expression pattern. The function can be written according
- to SQL99 syntax:
+ to standard SQL syntax:
+<synopsis>
+substring(<replaceable>string</replaceable> similar <replaceable>pattern</replaceable> escape <replaceable>escape-character</replaceable>)
+</synopsis>
+ or using the now obsolete SQL:1999 syntax:
<synopsis>
substring(<replaceable>string</replaceable> from <replaceable>pattern</replaceable> for <replaceable>escape-character</replaceable>)
</synopsis>
@@ -5201,8 +5211,8 @@ <title><function>SIMILAR TO</function> Regular Expressions</title>
<para>
Some examples, with <literal>#"</literal> delimiting the return string:
<programlisting>
-substring('foobar' from '%#"o_b#"%' for '#') <lineannotation>oob</lineannotation>
-substring('foobar' from '#"o_b#"%' for '#') <lineannotation>NULL</lineannotation>
+substring('foobar' similar '%#"o_b#"%' escape '#') <lineannotation>oob</lineannotation>
+substring('foobar' similar '#"o_b#"%' escape '#') <lineannotation>NULL</lineannotation>
</programlisting>
</para>
</sect2>
diff --git a/src/backend/catalog/information_schema.sql b/src/backend/catalog/information_schema.sql
index 3e07fb107e..5ab47e7743 100644
--- a/src/backend/catalog/information_schema.sql
+++ b/src/backend/catalog/information_schema.sql
@@ -182,7 +182,7 @@ CREATE FUNCTION _pg_interval_type(typid oid, mod int4) RETURNS text
AS
$$SELECT
CASE WHEN $1 IN (1186) /* interval */
- THEN pg_catalog.upper(substring(pg_catalog.format_type($1, $2) from 'interval[()0-9]* #"%#"' for '#'))
+ THEN pg_catalog.upper(substring(pg_catalog.format_type($1, $2) similar 'interval[()0-9]* #"%#"' escape '#'))
ELSE null
END$$;
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index 1a843049f0..4ff35095b8 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -14451,7 +14451,27 @@ position_list:
| /*EMPTY*/ { $$ = NIL; }
;
-/* SUBSTRING() arguments */
+/*
+ * SUBSTRING() arguments
+ *
+ * Note that SQL:1999 has both
+ *
+ * text FROM int FOR int
+ *
+ * and
+ *
+ * text FROM pattern FOR escape
+ *
+ * In the parser we map them both to a call to the substring() function and
+ * rely on type resolution to pick the right one.
+ *
+ * In SQL:2003, the second variant was changed to
+ *
+ * text SIMILAR pattern ESCAPE escape
+ *
+ * We could in theory map that to a different function internally, but
+ * since we still support the SQL:1999 version, we don't.
+ */
substr_list:
a_expr FROM a_expr FOR a_expr
{
@@ -14483,6 +14503,10 @@ substr_list:
makeTypeCast($3,
SystemTypeName("int4"), -1));
}
+ | a_expr SIMILAR a_expr ESCAPE a_expr
+ {
+ $$ = list_make3($1, $3, $5);
+ }
/*
* We also want to support generic substring functions that
* accept the usual generic list of arguments.
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index 6e98d183f6..8c034c9599 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -397,6 +397,13 @@ SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456";
(1 row)
-- T581 regular expression substring (with SQL's bizarre regexp syntax)
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"(b_d)#"%' ESCAPE '#') AS "bcd";
+ bcd
+-----
+ bcd
+(1 row)
+
+-- obsolete SQL99 syntax
SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd";
bcd
-----
@@ -404,75 +411,75 @@ SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd";
(1 row)
-- No match should return NULL
-SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True";
+SELECT SUBSTRING('abcdefg' SIMILAR '#"(b_d)#"%' ESCAPE '#') IS NULL AS "True";
True
------
t
(1 row)
-- Null inputs should return NULL
-SELECT SUBSTRING('abcdefg' FROM '%' FOR NULL) IS NULL AS "True";
+SELECT SUBSTRING('abcdefg' SIMILAR '%' ESCAPE NULL) IS NULL AS "True";
True
------
t
(1 row)
-SELECT SUBSTRING(NULL FROM '%' FOR '#') IS NULL AS "True";
+SELECT SUBSTRING(NULL SIMILAR '%' ESCAPE '#') IS NULL AS "True";
True
------
t
(1 row)
-SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True";
+SELECT SUBSTRING('abcdefg' SIMILAR NULL ESCAPE '#') IS NULL AS "True";
True
------
t
(1 row)
-- The first and last parts should act non-greedy
-SELECT SUBSTRING('abcdefg' FROM 'a#"%#"g' FOR '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%#"g' ESCAPE '#') AS "bcdef";
bcdef
-------
bcdef
(1 row)
-SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*' FOR '#') AS "abcdefg";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a*#"%#"g*' ESCAPE '#') AS "abcdefg";
abcdefg
---------
abcdefg
(1 row)
-- Vertical bar in any part affects only that part
-SELECT SUBSTRING('abcdefg' FROM 'a|b#"%#"g' FOR '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a|b#"%#"g' ESCAPE '#') AS "bcdef";
bcdef
-------
bcdef
(1 row)
-SELECT SUBSTRING('abcdefg' FROM 'a#"%#"x|g' FOR '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%#"x|g' ESCAPE '#') AS "bcdef";
bcdef
-------
bcdef
(1 row)
-SELECT SUBSTRING('abcdefg' FROM 'a#"%|ab#"g' FOR '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%|ab#"g' ESCAPE '#') AS "bcdef";
bcdef
-------
bcdef
(1 row)
-- Can't have more than two part separators
-SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*#"x' FOR '#') AS "error";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a*#"%#"g*#"x' ESCAPE '#') AS "error";
ERROR: SQL regular expression may not contain more than two escape-double-quote separators
CONTEXT: SQL function "substring" statement 1
-- Postgres extension: with 0 or 1 separator, assume parts 1 and 3 are empty
-SELECT SUBSTRING('abcdefg' FROM 'a#"%g' FOR '#') AS "bcdefg";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%g' ESCAPE '#') AS "bcdefg";
bcdefg
--------
bcdefg
(1 row)
-SELECT SUBSTRING('abcdefg' FROM 'a%g' FOR '#') AS "abcdefg";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a%g' ESCAPE '#') AS "abcdefg";
abcdefg
---------
abcdefg
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index 3e89159a4f..14901a2692 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -132,31 +132,33 @@
SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456";
-- T581 regular expression substring (with SQL's bizarre regexp syntax)
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"(b_d)#"%' ESCAPE '#') AS "bcd";
+-- obsolete SQL99 syntax
SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd";
-- No match should return NULL
-SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True";
+SELECT SUBSTRING('abcdefg' SIMILAR '#"(b_d)#"%' ESCAPE '#') IS NULL AS "True";
-- Null inputs should return NULL
-SELECT SUBSTRING('abcdefg' FROM '%' FOR NULL) IS NULL AS "True";
-SELECT SUBSTRING(NULL FROM '%' FOR '#') IS NULL AS "True";
-SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True";
+SELECT SUBSTRING('abcdefg' SIMILAR '%' ESCAPE NULL) IS NULL AS "True";
+SELECT SUBSTRING(NULL SIMILAR '%' ESCAPE '#') IS NULL AS "True";
+SELECT SUBSTRING('abcdefg' SIMILAR NULL ESCAPE '#') IS NULL AS "True";
-- The first and last parts should act non-greedy
-SELECT SUBSTRING('abcdefg' FROM 'a#"%#"g' FOR '#') AS "bcdef";
-SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*' FOR '#') AS "abcdefg";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%#"g' ESCAPE '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a*#"%#"g*' ESCAPE '#') AS "abcdefg";
-- Vertical bar in any part affects only that part
-SELECT SUBSTRING('abcdefg' FROM 'a|b#"%#"g' FOR '#') AS "bcdef";
-SELECT SUBSTRING('abcdefg' FROM 'a#"%#"x|g' FOR '#') AS "bcdef";
-SELECT SUBSTRING('abcdefg' FROM 'a#"%|ab#"g' FOR '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a|b#"%#"g' ESCAPE '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%#"x|g' ESCAPE '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%|ab#"g' ESCAPE '#') AS "bcdef";
-- Can't have more than two part separators
-SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*#"x' FOR '#') AS "error";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a*#"%#"g*#"x' ESCAPE '#') AS "error";
-- Postgres extension: with 0 or 1 separator, assume parts 1 and 3 are empty
-SELECT SUBSTRING('abcdefg' FROM 'a#"%g' FOR '#') AS "bcdefg";
-SELECT SUBSTRING('abcdefg' FROM 'a%g' FOR '#') AS "abcdefg";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%g' ESCAPE '#') AS "bcdefg";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a%g' ESCAPE '#') AS "abcdefg";
-- substring() with just two arguments is not allowed by SQL spec;
-- we accept it, but we interpret the pattern as a POSIX regexp not SQL
--
2.27.0
Hallo Peter,
v2 patches apply cleanly, compile, global check ok, citext check ok, doc
gen ok. No further comments.
As I did not find an entry in the CF, so I did nothing about tagging it
"ready".
--
Fabien.
On 2020-06-28 08:13, Fabien COELHO wrote:
v2 patches apply cleanly, compile, global check ok, citext check ok, doc
gen ok. No further comments.
committed, thanks
As I did not find an entry in the CF, so I did nothing about tagging it
"ready".
Right, I had not registered it yet.
--
Peter Eisentraut http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services