From 2387de10cf241f86f6987ba310f59594cae6b64f Mon Sep 17 00:00:00 2001
From: Nikita Glukhov <n.gluhov@postgrespro.ru>
Date: Fri, 22 Mar 2019 15:15:38 +0300
Subject: [PATCH] Fix parsing of identifiers in jsonpath

---
 src/backend/utils/adt/jsonpath.c       |  11 ++-
 src/backend/utils/adt/jsonpath_gram.y  |   6 +-
 src/backend/utils/adt/jsonpath_scan.l  | 125 +++++++++++++++--------------
 src/test/regress/expected/jsonpath.out | 138 +++++++++++++++++++++++++++++----
 src/test/regress/sql/jsonpath.sql      |  21 +++++
 5 files changed, 216 insertions(+), 85 deletions(-)

diff --git a/src/backend/utils/adt/jsonpath.c b/src/backend/utils/adt/jsonpath.c
index 7f32248..f43aeef 100644
--- a/src/backend/utils/adt/jsonpath.c
+++ b/src/backend/utils/adt/jsonpath.c
@@ -494,9 +494,14 @@ printJsonPathItem(StringInfo buf, JsonPathItem *v, bool inKey,
 			escape_json(buf, jspGetString(v, NULL));
 			break;
 		case jpiVariable:
-			appendStringInfoChar(buf, '$');
-			escape_json(buf, jspGetString(v, NULL));
-			break;
+			{
+				int32		len;
+				char	   *name = jspGetString(v, &len);
+
+				appendStringInfoChar(buf, '$');
+				appendBinaryStringInfo(buf, name, len);
+				break;
+			}
 		case jpiNumeric:
 			appendStringInfoString(buf,
 								   DatumGetCString(DirectFunctionCall1(numeric_out,
diff --git a/src/backend/utils/adt/jsonpath_gram.y b/src/backend/utils/adt/jsonpath_gram.y
index 1725502..196a191 100644
--- a/src/backend/utils/adt/jsonpath_gram.y
+++ b/src/backend/utils/adt/jsonpath_gram.y
@@ -334,8 +334,10 @@ makeItemVariable(JsonPathString *s)
 	JsonPathParseItem  *v;
 
 	v = makeItemType(jpiVariable);
-	v->value.string.val = s->val;
-	v->value.string.len = s->len;
+
+	/* skip leading '$' */
+	v->value.string.val = &s->val[1];
+	v->value.string.len = s->len - 1;
 
 	return v;
 }
diff --git a/src/backend/utils/adt/jsonpath_scan.l b/src/backend/utils/adt/jsonpath_scan.l
index 2165ffc..2c79bf0 100644
--- a/src/backend/utils/adt/jsonpath_scan.l
+++ b/src/backend/utils/adt/jsonpath_scan.l
@@ -20,6 +20,8 @@
 #include "mb/pg_wchar.h"
 #include "nodes/pg_list.h"
 
+#define JSONPATH_SPECIAL_CHARS "?%.[]{}()|&!=<>@#,*:-+/~`;\\\"' \b\f\n\r\t\v"
+
 static JsonPathString scanstring;
 
 /* Handles to the buffer that the lexer uses internally */
@@ -63,7 +65,7 @@ fprintf_to_ereport(const char *fmt, const char *msg)
  * quoted variable names and C-tyle comments.
  * Exclusive states:
  *  <xq> - quoted strings
- *  <xnq> - non-quoted strings
+ *  <xnq> - non-quoted identifiers
  *  <xvq> - quoted variable names
  *  <xsq> - single-quoted strings
  *  <xc> - C-style comment
@@ -75,9 +77,12 @@ fprintf_to_ereport(const char *fmt, const char *msg)
 %x xsq
 %x xc
 
-special		 [\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/]
-any			[^\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\\\"\' \t\n\r\f]
-blank		[ \t\n\r\f]
+special		 [\?\%\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\~\`\;]
+id_start	[^\?\%\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\~\`\;\\\"\' \b\f\n\r\t\v(0-9)]
+blank		[ \b\f\n\r\t\v]
+
+id_cont		({id_start}|[0-9])
+id			{id_start}{id_cont}*
 
 digit		[0-9]
 integer		(0|[1-9]{digit}*)
@@ -95,68 +100,60 @@ hex_fail	\\x{hex_dig}{0,1}
 
 %%
 
-<xnq>{any}+						{
-									addstring(false, yytext, yyleng);
-								}
+<xq,xvq,xsq>\\[\"\'\\]			{ addchar(false, yytext[1]); }
 
-<xnq>{blank}+					{
-									yylval->str = scanstring;
-									BEGIN INITIAL;
-									return checkKeyword();
-								}
+<xq,xvq,xsq>\\b					{ addchar(false, '\b'); }
 
+<xq,xvq,xsq>\\f					{ addchar(false, '\f'); }
 
-<xnq>\/\*						{
-									yylval->str = scanstring;
-									BEGIN xc;
-								}
+<xq,xvq,xsq>\\n					{ addchar(false, '\n'); }
 
-<xnq>({special}|\"|\')			{
-									yylval->str = scanstring;
-									yyless(0);
-									BEGIN INITIAL;
-									return checkKeyword();
-								}
+<xq,xvq,xsq>\\r					{ addchar(false, '\r'); }
 
-<xnq><<EOF>>					{
-									yylval->str = scanstring;
-									BEGIN INITIAL;
-									return checkKeyword();
-								}
+<xq,xvq,xsq>\\t					{ addchar(false, '\t'); }
 
-<xnq,xq,xvq,xsq>\\[\"\'\\]		{ addchar(false, yytext[1]); }
+<xq,xvq,xsq>\\v					{ addchar(false, '\v'); }
 
-<xnq,xq,xvq,xsq>\\b				{ addchar(false, '\b'); }
+<xq,xnq,xvq,xsq>{unicode}+		{ parseUnicode(yytext, yyleng); }
 
-<xnq,xq,xvq,xsq>\\f				{ addchar(false, '\f'); }
+<xq,xvq,xsq>{hex_char}			{ parseHexChar(yytext); }
 
-<xnq,xq,xvq,xsq>\\n				{ addchar(false, '\n'); }
+<xq,xvq,xsq>{hex_fail}			{ yyerror(NULL, "invalid hex character sequence"); }
 
-<xnq,xq,xvq,xsq>\\r				{ addchar(false, '\r'); }
+<INITIAL,xq,xnq,xvq,xsq>{unicode}*{unicodefail}	{
+									yyerror(NULL, "invalid unicode sequence");
+								}
 
-<xnq,xq,xvq,xsq>\\t				{ addchar(false, '\t'); }
+<xq,xnq,xvq,xsq>{unicode}+\\	{
+									/* throw back the \\, and treat as unicode; not active in INITIAL, where scanstring is not yet reset */
+									yyless(yyleng - 1);
+									parseUnicode(yytext, yyleng);
+								}
 
-<xnq,xq,xvq,xsq>\\v				{ addchar(false, '\v'); }
+<xq,xnq,xvq,xsq>\\.				{ yyerror(NULL, "escape sequence is invalid"); }
 
-<xnq,xq,xvq,xsq>{unicode}+		{ parseUnicode(yytext, yyleng); }
+<xq,xnq,xvq,xsq>\\				{ yyerror(NULL, "unexpected end after backslash"); }
 
-<xnq,xq,xvq,xsq>{hex_char}		{ parseHexChar(yytext); }
+<xq,xvq,xsq><<EOF>>				{ yyerror(NULL, "unexpected end of quoted string"); }
 
-<xnq,xq,xvq,xsq>{unicode}*{unicodefail}	{ yyerror(NULL, "invalid unicode sequence"); }
 
-<xnq,xq,xvq,xsq>{hex_fail}		{ yyerror(NULL, "invalid hex character sequence"); }
+<xnq>{id_cont}+					{ addstring(false, yytext, yyleng); }
 
-<xnq,xq,xvq,xsq>{unicode}+\\	{
-									/* throw back the \\, and treat as unicode */
+<xnq>({special}|{blank}|[\'\"])	{
+									/* throw back the special symbol and return id */
 									yyless(yyleng - 1);
-									parseUnicode(yytext, yyleng);
+									addchar(false, '\0');
+									yylval->str = scanstring;
+									BEGIN INITIAL;
+									return checkKeyword();
 								}
 
-<xnq,xq,xvq,xsq>\\.				{ yyerror(NULL, "escape sequence is invalid"); }
-
-<xnq,xq,xvq,xsq>\\				{ yyerror(NULL, "unexpected end after backslash"); }
-
-<xq,xvq,xsq><<EOF>>				{ yyerror(NULL, "unexpected end of quoted string"); }
+<xnq><<EOF>>					{
+									addchar(false, '\0');
+									yylval->str = scanstring;
+									BEGIN INITIAL;
+									return checkKeyword();
+								}
 
 <xq>\"							{
 									yylval->str = scanstring;
@@ -210,18 +207,6 @@ hex_fail	\\x{hex_dig}{0,1}
 
 \>								{ return GREATER_P; }
 
-\${any}+						{
-									addstring(true, yytext + 1, yyleng - 1);
-									addchar(false, '\0');
-									yylval->str = scanstring;
-									return VARIABLE_P;
-								}
-
-\$\"							{
-									addchar(true, '\0');
-									BEGIN xvq;
-								}
-
 {special}						{ return *yytext; }
 
 {blank}+						{ /* ignore */ }
@@ -263,11 +248,18 @@ hex_fail	\\x{hex_dig}{0,1}
 
 ({realfail1}|{realfail2})		{ yyerror(NULL, "invalid floating point number"); }
 
-{any}+							{
+
+{id_start}{id_cont}*			{
 									addstring(true, yytext, yyleng);
 									BEGIN xnq;
 								}
 
+{unicode}+						{
+									addstring(true, "", 0);
+									parseUnicode(yytext, yyleng);
+									BEGIN xnq;
+								}
+
 \"								{
 									addchar(true, '\0');
 									BEGIN xq;
@@ -278,11 +270,7 @@ hex_fail	\\x{hex_dig}{0,1}
 									BEGIN xsq;
 								}
 
-\\								{
-									yyless(0);
-									addchar(true, '\0');
-									BEGIN xnq;
-								}
+\\								{ yyerror(NULL, "invalid escape sequence"); }
 
 <<EOF>>							{ yyterminate(); }
 
@@ -354,6 +342,12 @@ checkKeyword()
 						   *StopHigh = keywords + lengthof(keywords),
 						   *StopMiddle;
 
+	if (strcspn(scanstring.val, JSONPATH_SPECIAL_CHARS) < scanstring.len)
+		jsonpath_yyerror(NULL, "invalid characters in identifier");
+
+	if (scanstring.val[0] == '$')
+		return scanstring.len == 1 ? '$' : VARIABLE_P;
+
 	if (scanstring.len > keywords[lengthof(keywords) - 1].len)
 		return res;
 
@@ -604,6 +598,9 @@ parseUnicode(char *s, int l)
 			while (s[++i] != '}' && i < l)
 				ch = (ch << 4) | hexval(s[i]);
 			i++;	/* skip '}' */
+
+			if (ch > 0x10FFFF)
+				jsonpath_yyerror(NULL, "invalid Unicode escape value");
 		}
 		else		/* parse '\uXXXX' */
 		{
diff --git a/src/test/regress/expected/jsonpath.out b/src/test/regress/expected/jsonpath.out
index ea42ae3..14221d4 100644
--- a/src/test/regress/expected/jsonpath.out
+++ b/src/test/regress/expected/jsonpath.out
@@ -190,11 +190,75 @@ select '''\x50\u0067\u{53}\u{051}\u{00004C}'''::jsonpath;
 (1 row)
 
 select '$.foo\x50\u0067\u{53}\u{051}\u{00004C}\t\"bar'::jsonpath;
+ERROR:  escape sequence is invalid at or near "\x" of jsonpath input
+LINE 1: select '$.foo\x50\u0067\u{53}\u{051}\u{00004C}\t\"bar'::json...
+               ^
+select '$."foo\x50\u0067\u{53}\u{051}\u{00004C}\t\"bar"'::jsonpath;
       jsonpath       
 ---------------------
  $."fooPgSQL\t\"bar"
 (1 row)
 
+select '$.\u12345.a\u1234.b\u{12}34'::jsonpath;
+        jsonpath         
+-------------------------
+ $."ሴ5"."aሴ"."b\u001234"
+(1 row)
+
+select '$.\u123'::jsonpath;
+ERROR:  invalid unicode sequence at or near "\u123" of jsonpath input
+LINE 1: select '$.\u123'::jsonpath;
+               ^
+select '$.\u{}'::jsonpath;
+ERROR:  invalid unicode sequence at or near "\u{" of jsonpath input
+LINE 1: select '$.\u{}'::jsonpath;
+               ^
+select '$.\u{1}'::jsonpath;
+  jsonpath  
+------------
+ $."\u0001"
+(1 row)
+
+select '$.\u{20}'::jsonpath;
+ERROR:  invalid characters in identifier at end of jsonpath input
+LINE 1: select '$.\u{20}'::jsonpath;
+               ^
+select '$."\u{20}"'::jsonpath;
+ jsonpath 
+----------
+ $." "
+(1 row)
+
+select '$.\u{21}'::jsonpath;
+ERROR:  invalid characters in identifier at end of jsonpath input
+LINE 1: select '$.\u{21}'::jsonpath;
+               ^
+select '$."\u{21}"'::jsonpath;
+ jsonpath 
+----------
+ $."!"
+(1 row)
+
+select '$.\u{12345}'::jsonpath;
+ jsonpath 
+----------
+ $."𒍅"
+(1 row)
+
+select '$.\u{10FFFF}'::jsonpath;
+ jsonpath 
+----------
+ $.""
+(1 row)
+
+select '$.\u{110000}'::jsonpath;
+ERROR:  invalid Unicode escape value at or near "\u{110000}" of jsonpath input
+LINE 1: select '$.\u{110000}'::jsonpath;
+               ^
+select '$.\x123'::jsonpath;
+ERROR:  invalid escape sequence at or near "\" of jsonpath input
+LINE 1: select '$.\x123'::jsonpath;
+               ^
 select '$.g ? ($.a == 1)'::jsonpath;
       jsonpath      
 --------------------
@@ -282,25 +346,59 @@ select '$.g ? (+@.x >= +-(+@.a + 2))'::jsonpath;
 select '$a'::jsonpath;
  jsonpath 
 ----------
- $"a"
+ $a
+(1 row)
+
+select '$_'::jsonpath;
+ jsonpath 
+----------
+ $_
+(1 row)
+
+select '$123'::jsonpath;
+ jsonpath 
+----------
+ $123
+(1 row)
+
+select '$$$'::jsonpath;
+ jsonpath 
+----------
+ $$$
+(1 row)
+
+select '$_$$1_3a'::jsonpath;
+ jsonpath 
+----------
+ $_$$1_3a
+(1 row)
+
+select '$\u12345'::jsonpath;
+ jsonpath 
+----------
+ $ሴ5
 (1 row)
 
+select '$\u{20}'::jsonpath;
+ERROR:  invalid characters in identifier at end of jsonpath input
+LINE 1: select '$\u{20}'::jsonpath;
+               ^
 select '$a.b'::jsonpath;
  jsonpath 
 ----------
- $"a"."b"
+ $a."b"
 (1 row)
 
 select '$a[*]'::jsonpath;
  jsonpath 
 ----------
- $"a"[*]
+ $a[*]
 (1 row)
 
 select '$.g ? (@.zip == $zip)'::jsonpath;
-         jsonpath          
----------------------------
- $."g"?(@."zip" == $"zip")
+        jsonpath         
+-------------------------
+ $."g"?(@."zip" == $zip)
 (1 row)
 
 select '$.a[1,2, 3 to 16]'::jsonpath;
@@ -310,9 +408,9 @@ select '$.a[1,2, 3 to 16]'::jsonpath;
 (1 row)
 
 select '$.a[$a + 1, ($b[*]) to -($[0] * 2)]'::jsonpath;
-                jsonpath                
-----------------------------------------
- $."a"[$"a" + 1,$"b"[*] to -($[0] * 2)]
+              jsonpath              
+------------------------------------
+ $."a"[$a + 1,$b[*] to -($[0] * 2)]
 (1 row)
 
 select '$.a[$.a.size() - 3]'::jsonpath;
@@ -408,9 +506,9 @@ select '$ ? (@ starts with "abc")'::jsonpath;
 (1 row)
 
 select '$ ? (@ starts with $var)'::jsonpath;
-         jsonpath         
---------------------------
- $?(@ starts with $"var")
+        jsonpath        
+------------------------
+ $?(@ starts with $var)
 (1 row)
 
 select '$ ? (@ like_regex "(invalid pattern")'::jsonpath;
@@ -481,9 +579,9 @@ select '$ < 1'::jsonpath;
 (1 row)
 
 select '($ < 1) || $.a.b <= $x'::jsonpath;
-           jsonpath           
-------------------------------
- ($ < 1 || $."a"."b" <= $"x")
+          jsonpath          
+----------------------------
+ ($ < 1 || $."a"."b" <= $x)
 (1 row)
 
 select '@ + 1'::jsonpath;
@@ -815,9 +913,17 @@ select '0'::jsonpath;
 (1 row)
 
 select '00'::jsonpath;
-ERROR:  syntax error, unexpected IDENT_P at end of jsonpath input
+ERROR:  syntax error, unexpected INT_P, expecting $end at or near "0" of jsonpath input
 LINE 1: select '00'::jsonpath;
                ^
+select '$.00'::jsonpath;
+ERROR:  syntax error, unexpected INT_P at or near "0" of jsonpath input
+LINE 1: select '$.00'::jsonpath;
+               ^
+select '$.0a'::jsonpath;
+ERROR:  syntax error, unexpected INT_P at or near "0" of jsonpath input
+LINE 1: select '$.0a'::jsonpath;
+               ^
 select '0.0'::jsonpath;
  jsonpath 
 ----------
diff --git a/src/test/regress/sql/jsonpath.sql b/src/test/regress/sql/jsonpath.sql
index 29ea77a..9e44f1d 100644
--- a/src/test/regress/sql/jsonpath.sql
+++ b/src/test/regress/sql/jsonpath.sql
@@ -34,6 +34,19 @@ select '''\b\f\r\n\t\v\"\''\\'''::jsonpath;
 select '"\x50\u0067\u{53}\u{051}\u{00004C}"'::jsonpath;
 select '''\x50\u0067\u{53}\u{051}\u{00004C}'''::jsonpath;
 select '$.foo\x50\u0067\u{53}\u{051}\u{00004C}\t\"bar'::jsonpath;
+select '$."foo\x50\u0067\u{53}\u{051}\u{00004C}\t\"bar"'::jsonpath;
+select '$.\u12345.a\u1234.b\u{12}34'::jsonpath;
+select '$.\u123'::jsonpath;
+select '$.\u{}'::jsonpath;
+select '$.\u{1}'::jsonpath;
+select '$.\u{20}'::jsonpath;
+select '$."\u{20}"'::jsonpath;
+select '$.\u{21}'::jsonpath;
+select '$."\u{21}"'::jsonpath;
+select '$.\u{12345}'::jsonpath;
+select '$.\u{10FFFF}'::jsonpath;
+select '$.\u{110000}'::jsonpath;
+select '$.\x123'::jsonpath;
 
 select '$.g ? ($.a == 1)'::jsonpath;
 select '$.g ? (@ == 1)'::jsonpath;
@@ -51,6 +64,12 @@ select '$.g ? ((@.x >= 123 || @.a == 4) && exists (@.x ? (@ == 14)))'::jsonpath;
 select '$.g ? (+@.x >= +-(+@.a + 2))'::jsonpath;
 
 select '$a'::jsonpath;
+select '$_'::jsonpath;
+select '$123'::jsonpath;
+select '$$$'::jsonpath;
+select '$_$$1_3a'::jsonpath;
+select '$\u12345'::jsonpath;
+select '$\u{20}'::jsonpath;
 select '$a.b'::jsonpath;
 select '$a[*]'::jsonpath;
 select '$.g ? (@.zip == $zip)'::jsonpath;
@@ -153,6 +172,8 @@ select '$ ? (@.a < +10.1e+1)'::jsonpath;
 
 select '0'::jsonpath;
 select '00'::jsonpath;
+select '$.00'::jsonpath;
+select '$.0a'::jsonpath;
 select '0.0'::jsonpath;
 select '0.000'::jsonpath;
 select '0.000e1'::jsonpath;
-- 
2.7.4

