From e0d49c4e81054e5c66be1f78cb57ad2ae87fa02d Mon Sep 17 00:00:00 2001
From: Karina Litskevich <litskevichkarina@gmail.com>
Date: Wed, 28 Aug 2024 11:52:25 +0300
Subject: [PATCH v2] Improve error message for rejecting trailing junk after
 numeric literals

Rejecting trailing junk after numeric literals was introduced in
commit 2549f066 to prevent scanning a number immediately followed
by an identifier without whitespace as number and identifier.
Unfortunately, all the tokens made to catch such numeric literals
followed by non-digits match a numeric literal and the next byte. The
lexemes found by these tokens are broken in case the next symbol after
a numeric literal is presented by several bytes as only the first byte
of the symbol gets to the lexeme. When this lexeme is then printed as
a part of an error message that message became broken too along with
the whole log file where it goes.

This commit fixes the problem by using tokens that match a numeric
literal immediately followed by an identifier, not only one byte. This
also improves error messages in cases with English letters. For 123abc
the error message now will say that the error appeared at or near
"123abc" instead of "123a".

Since anything that matches hexinteger, octinteger or bininteger
followed by identifier also matches decinteger followed by identifier,
use one common token for that.
---
 src/backend/parser/scan.l                | 25 +++++-------------------
 src/fe_utils/psqlscan.l                  | 22 +++++----------------
 src/interfaces/ecpg/preproc/pgc.l        | 22 +++++----------------
 src/test/regress/expected/numerology.out | 14 ++++++++-----
 src/test/regress/sql/numerology.sql      |  1 +
 5 files changed, 25 insertions(+), 59 deletions(-)

diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index f74059e7b0..028f938789 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -412,16 +412,13 @@ numericfail		{decinteger}\.\.
 real			({decinteger}|{numeric})[Ee][-+]?{decinteger}
 realfail		({decinteger}|{numeric})[Ee][-+]
 
-decinteger_junk	{decinteger}{ident_start}
-hexinteger_junk	{hexinteger}{ident_start}
-octinteger_junk	{octinteger}{ident_start}
-bininteger_junk	{bininteger}{ident_start}
-numeric_junk	{numeric}{ident_start}
-real_junk		{real}{ident_start}
+integer_junk	{decinteger}{identifier}
+numeric_junk	{numeric}{identifier}
+real_junk		{real}{identifier}
 
 /* Positional parameters don't accept underscores. */
 param			\${decdigit}+
-param_junk		\${decdigit}+{ident_start}
+param_junk		\${decdigit}+{identifier}
 
 other			.
 
@@ -1055,19 +1052,7 @@ other			.
 					SET_YYLLOC();
 					yyerror("trailing junk after numeric literal");
 				}
-{decinteger_junk}	{
-					SET_YYLLOC();
-					yyerror("trailing junk after numeric literal");
-				}
-{hexinteger_junk}	{
-					SET_YYLLOC();
-					yyerror("trailing junk after numeric literal");
-				}
-{octinteger_junk}	{
-					SET_YYLLOC();
-					yyerror("trailing junk after numeric literal");
-				}
-{bininteger_junk}	{
+{integer_junk}	{
 					SET_YYLLOC();
 					yyerror("trailing junk after numeric literal");
 				}
diff --git a/src/fe_utils/psqlscan.l b/src/fe_utils/psqlscan.l
index ddc4658b92..70b6561548 100644
--- a/src/fe_utils/psqlscan.l
+++ b/src/fe_utils/psqlscan.l
@@ -348,16 +348,13 @@ numericfail		{decinteger}\.\.
 real			({decinteger}|{numeric})[Ee][-+]?{decinteger}
 realfail		({decinteger}|{numeric})[Ee][-+]
 
-decinteger_junk	{decinteger}{ident_start}
-hexinteger_junk	{hexinteger}{ident_start}
-octinteger_junk	{octinteger}{ident_start}
-bininteger_junk	{bininteger}{ident_start}
-numeric_junk	{numeric}{ident_start}
-real_junk		{real}{ident_start}
+integer_junk	{decinteger}{identifier}
+numeric_junk	{numeric}{identifier}
+real_junk		{real}{identifier}
 
 /* Positional parameters don't accept underscores. */
 param			\${decdigit}+
-param_junk		\${decdigit}+{ident_start}
+param_junk		\${decdigit}+{identifier}
 
 /* psql-specific: characters allowed in variable names */
 variable_char	[A-Za-z\200-\377_0-9]
@@ -898,16 +895,7 @@ other			.
 {realfail}		{
 					ECHO;
 				}
-{decinteger_junk}	{
-					ECHO;
-				}
-{hexinteger_junk}	{
-					ECHO;
-				}
-{octinteger_junk}	{
-					ECHO;
-				}
-{bininteger_junk}	{
+{integer_junk}	{
 					ECHO;
 				}
 {numeric_junk}	{
diff --git a/src/interfaces/ecpg/preproc/pgc.l b/src/interfaces/ecpg/preproc/pgc.l
index f363a34659..064af29b4d 100644
--- a/src/interfaces/ecpg/preproc/pgc.l
+++ b/src/interfaces/ecpg/preproc/pgc.l
@@ -381,16 +381,13 @@ numericfail		{decinteger}\.\.
 real			({decinteger}|{numeric})[Ee][-+]?{decinteger}
 realfail		({decinteger}|{numeric})[Ee][-+]
 
-decinteger_junk	{decinteger}{ident_start}
-hexinteger_junk	{hexinteger}{ident_start}
-octinteger_junk	{octinteger}{ident_start}
-bininteger_junk	{bininteger}{ident_start}
-numeric_junk	{numeric}{ident_start}
-real_junk		{real}{ident_start}
+integer_junk	{decinteger}{identifier}
+numeric_junk	{numeric}{identifier}
+real_junk		{real}{identifier}
 
 /* Positional parameters don't accept underscores. */
 param			\${decdigit}+
-param_junk		\${decdigit}+{ident_start}
+param_junk		\${decdigit}+{identifier}
 
 /* special characters for other dbms */
 /* we have to react differently in compat mode */
@@ -1023,16 +1020,7 @@ cppline			{space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
 	 * Note that some trailing junk is valid in C (such as 100LL), so we
 	 * contain this to SQL mode.
 	 */
-{decinteger_junk} {
-					mmfatal(PARSE_ERROR, "trailing junk after numeric literal");
-				}
-{hexinteger_junk} {
-					mmfatal(PARSE_ERROR, "trailing junk after numeric literal");
-				}
-{octinteger_junk} {
-					mmfatal(PARSE_ERROR, "trailing junk after numeric literal");
-				}
-{bininteger_junk} {
+{integer_junk}	{
 					mmfatal(PARSE_ERROR, "trailing junk after numeric literal");
 				}
 {numeric_junk}	{
diff --git a/src/test/regress/expected/numerology.out b/src/test/regress/expected/numerology.out
index 717a237df9..f4bd6fbfee 100644
--- a/src/test/regress/expected/numerology.out
+++ b/src/test/regress/expected/numerology.out
@@ -171,9 +171,13 @@ SELECT -0x8000000000000001;
 
 -- error cases
 SELECT 123abc;
-ERROR:  trailing junk after numeric literal at or near "123a"
+ERROR:  trailing junk after numeric literal at or near "123abc"
 LINE 1: SELECT 123abc;
                ^
+SELECT 1ä;
+ERROR:  trailing junk after numeric literal at or near "1ä"
+LINE 1: SELECT 1ä;
+               ^
 SELECT 0x0o;
 ERROR:  trailing junk after numeric literal at or near "0x0o"
 LINE 1: SELECT 0x0o;
@@ -322,7 +326,7 @@ ERROR:  trailing junk after numeric literal at or near "100_"
 LINE 1: SELECT 100_;
                ^
 SELECT 100__000;
-ERROR:  trailing junk after numeric literal at or near "100_"
+ERROR:  trailing junk after numeric literal at or near "100__000"
 LINE 1: SELECT 100__000;
                ^
 SELECT _1_000.5;
@@ -334,7 +338,7 @@ ERROR:  trailing junk after numeric literal at or near "1_000_"
 LINE 1: SELECT 1_000_.5;
                ^
 SELECT 1_000._5;
-ERROR:  trailing junk after numeric literal at or near "1_000._"
+ERROR:  trailing junk after numeric literal at or near "1_000._5"
 LINE 1: SELECT 1_000._5;
                ^
 SELECT 1_000.5_;
@@ -342,11 +346,11 @@ ERROR:  trailing junk after numeric literal at or near "1_000.5_"
 LINE 1: SELECT 1_000.5_;
                ^
 SELECT 1_000.5e_1;
-ERROR:  trailing junk after numeric literal at or near "1_000.5e"
+ERROR:  trailing junk after numeric literal at or near "1_000.5e_1"
 LINE 1: SELECT 1_000.5e_1;
                ^
 PREPARE p1 AS SELECT $0_1;
-ERROR:  trailing junk after parameter at or near "$0_"
+ERROR:  trailing junk after parameter at or near "$0_1"
 LINE 1: PREPARE p1 AS SELECT $0_1;
                              ^
 --
diff --git a/src/test/regress/sql/numerology.sql b/src/test/regress/sql/numerology.sql
index 3ae491cc98..d1a42b046c 100644
--- a/src/test/regress/sql/numerology.sql
+++ b/src/test/regress/sql/numerology.sql
@@ -44,6 +44,7 @@ SELECT -0x8000000000000001;
 
 -- error cases
 SELECT 123abc;
+SELECT 1ä;
 SELECT 0x0o;
 SELECT 0.a;
 SELECT 0.0a;
-- 
2.34.1

