From 5ea5886fb44e8bc85753400ea4b1375daf8b2d2d Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@2ndquadrant.com>
Date: Fri, 12 Jul 2019 13:16:44 +0700
Subject: [PATCH v6 1/2] Reduce the number of states in the core scanner table

Previously, the core scanner had 37045 states, which required Flex
to use 32-bit types in the yy_transition array. Refactor the Flex
rules to reduce the number of states to 22331. With 16-bit types,
this shrinks the backend binary by about 200kB.

1. When the scanner encounters a closing quote while inside any kind of
quoted string, it saves the current start condition and enters a single
new one (xqs) to look ahead for a possible string continuation.

2. Unify xusend and xuiend into a single start condition (xuend) that
detects a possible UESCAPE. If one is found, enter a new start condition
(xuchar) to scan the escape character literal.

Sync psql and ECPG scanners to match.
---
 src/backend/parser/scan.l           | 265 ++++++++++++++++------------
 src/fe_utils/psqlscan.l             | 169 ++++++++++--------
 src/include/fe_utils/psqlscan_int.h |   1 +
 src/include/parser/scanner.h        |   1 +
 src/interfaces/ecpg/preproc/pgc.l   | 263 +++++++++++++++++++--------
 5 files changed, 436 insertions(+), 263 deletions(-)
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index e1cae859e8..d2ccb438f6 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -168,12 +168,14 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner);
  *  <xd> delimited identifiers (double-quoted identifiers)
  *  <xh> hexadecimal numeric string
  *  <xq> standard quoted strings
+ *  <xqs> quote stop (detect continued strings)
  *  <xe> extended quoted strings (support backslash escape sequences)
  *  <xdolq> $foo$ quoted strings
  *  <xui> quoted identifier with Unicode escapes
- *  <xuiend> end of a quoted identifier with Unicode escapes, UESCAPE can follow
  *  <xus> quoted string with Unicode escapes
- *  <xusend> end of a quoted string with Unicode escapes, UESCAPE can follow
+ *  <xuend> end of a quoted string or identifier with Unicode escapes,
+ *    UESCAPE can follow
+ *  <xuchar> expecting escape character literal after UESCAPE
  *  <xeu> Unicode surrogate pair in extended quoted string
  *
  * Remember to add an <<EOF>> case whenever you add a new exclusive state!
@@ -185,12 +187,13 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner);
 %x xd
 %x xh
 %x xq
+%x xqs
 %x xe
 %x xdolq
 %x xui
-%x xuiend
 %x xus
-%x xusend
+%x xuend
+%x xuchar
 %x xeu
 
 /*
@@ -231,19 +234,18 @@ special_whitespace		({space}+|{comment}{newline})
 horiz_whitespace		({horiz_space}|{comment})
 whitespace_with_newline	({horiz_whitespace}*{newline}{special_whitespace}*)
 
+quote			'
+/* If we see {quote} then {quotecontinue}, the quoted string continues */
+quotecontinue	{whitespace_with_newline}{quote}
+
 /*
- * To ensure that {quotecontinue} can be scanned without having to back up
- * if the full pattern isn't matched, we include trailing whitespace in
- * {quotestop}.  This matches all cases where {quotecontinue} fails to match,
- * except for {quote} followed by whitespace and just one "-" (not two,
- * which would start a {comment}).  To cover that we have {quotefail}.
- * The actions for {quotestop} and {quotefail} must throw back characters
- * beyond the quote proper.
+ * {quotecontinuefail} is needed to avoid lexer backup when we fail to match
+ * {quotecontinue}.  It might seem that this could just be {whitespace}*,
+ * but if there's a dash after {whitespace_with_newline}, it must be consumed
+ * to see if there's another dash --- which would start a {comment} and thus
+ * allow continuation of the {quotecontinue} token.
  */
-quote			'
-quotestop		{quote}{whitespace}*
-quotecontinue	{quote}{whitespace_with_newline}{quote}
-quotefail		{quote}{whitespace}*"-"
+quotecontinuefail	{whitespace}*"-"?
 
 /* Bit string
  * It is tempting to scan the string for only those characters
@@ -304,10 +306,15 @@ xdstop			{dquote}
 xddouble		{dquote}{dquote}
 xdinside		[^"]+
 
-/* Unicode escapes */
-uescape			[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
+/* Optional UESCAPE after a quoted string or identifier with Unicode escapes */
+uescape			[uU][eE][sS][cC][aA][pP][eE]
 /* error rule to avoid backup */
-uescapefail		[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]
+uescapefail		[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]
+
+/* escape character literal */
+uescchar		{quote}[^']{quote}
+/* error rule to avoid backup */
+uesccharfail	{quote}[^']|{quote}
 
 /* Quoted identifier with Unicode escapes */
 xuistart		[uU]&{dquote}
@@ -315,10 +322,6 @@ xuistart		[uU]&{dquote}
 /* Quoted string with Unicode escapes */
 xusstart		[uU]&{quote}
 
-/* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */
-xustop1		{uescapefail}?
-xustop2		{uescape}
-
 /* error rule to avoid backup */
 xufailed		[uU]&
 
@@ -476,21 +479,10 @@ other			.
 					startlit();
 					addlitchar('b', yyscanner);
 				}
-<xb>{quotestop}	|
-<xb>{quotefail} {
-					yyless(1);
-					BEGIN(INITIAL);
-					yylval->str = litbufdup(yyscanner);
-					return BCONST;
-				}
 <xh>{xhinside}	|
 <xb>{xbinside}	{
 					addlit(yytext, yyleng, yyscanner);
 				}
-<xh>{quotecontinue}	|
-<xb>{quotecontinue}	{
-					/* ignore */
-				}
 <xb><<EOF>>		{ yyerror("unterminated bit string literal"); }
 
 {xhstart}		{
@@ -505,13 +497,6 @@ other			.
 					startlit();
 					addlitchar('x', yyscanner);
 				}
-<xh>{quotestop}	|
-<xh>{quotefail} {
-					yyless(1);
-					BEGIN(INITIAL);
-					yylval->str = litbufdup(yyscanner);
-					return XCONST;
-				}
 <xh><<EOF>>		{ yyerror("unterminated hexadecimal string literal"); }
 
 {xnstart}		{
@@ -568,53 +553,71 @@ other			.
 					BEGIN(xus);
 					startlit();
 				}
-<xq,xe>{quotestop}	|
-<xq,xe>{quotefail} {
-					yyless(1);
-					BEGIN(INITIAL);
+
+<xb,xh,xq,xe,xus>{quote} {
 					/*
-					 * check that the data remains valid if it might have been
-					 * made invalid by unescaping any chars.
+					 * When we are scanning a quoted string and see an end
+					 * quote, we must look ahead for a possible continuation.
+					 * If we don't see one, we know the end quote was in fact
+					 * the end of the string.  To reduce the lexer table size,
+					 * we use a single "xqs" state to do the lookahead for all
+					 * types of strings.
 					 */
-					if (yyextra->saw_non_ascii)
-						pg_verifymbstr(yyextra->literalbuf,
-									   yyextra->literallen,
-									   false);
-					yylval->str = litbufdup(yyscanner);
-					return SCONST;
-				}
-<xus>{quotestop} |
-<xus>{quotefail} {
-					/* throw back all but the quote */
-					yyless(1);
-					/* xusend state looks for possible UESCAPE */
-					BEGIN(xusend);
+					yyextra->state_before_lit_stop = YYSTATE;
+					BEGIN(xqs);
 				}
-<xusend>{whitespace} {
-					/* stay in xusend state over whitespace */
+<xqs>{quotecontinue} {
+					/*
+					 * Found a quote continuation, so return to the in-quote
+					 * state and continue scanning the literal.
+					 */
+					BEGIN(yyextra->state_before_lit_stop);
 				}
-<xusend><<EOF>> |
-<xusend>{other} |
-<xusend>{xustop1} {
-					/* no UESCAPE after the quote, throw back everything */
+<xqs>{quotecontinuefail} |
+<xqs><<EOF>> |
+<xqs>{other}	{
+					/*
+					 * Failed to see a quote continuation.  Throw back
+					 * everything after the end quote, and handle the string
+					 * according to the state we were in previously.
+					 */
 					yyless(0);
-					BEGIN(INITIAL);
-					yylval->str = litbuf_udeescape('\\', yyscanner);
-					return SCONST;
-				}
-<xusend>{xustop2} {
-					/* found UESCAPE after the end quote */
-					BEGIN(INITIAL);
-					if (!check_uescapechar(yytext[yyleng - 2]))
+
+					switch (yyextra->state_before_lit_stop)
 					{
-						SET_YYLLOC();
-						ADVANCE_YYLLOC(yyleng - 2);
-						yyerror("invalid Unicode escape character");
+						case xb:
+							BEGIN(INITIAL);
+							yylval->str = litbufdup(yyscanner);
+							return BCONST;
+						case xh:
+							BEGIN(INITIAL);
+							yylval->str = litbufdup(yyscanner);
+							return XCONST;
+						case xq:
+							/* fallthrough */
+						case xe:
+							BEGIN(INITIAL);
+
+							/*
+							 * Check that the data remains valid if it
+							 * might have been made invalid by unescaping
+							 * any chars.
+							 */
+							if (yyextra->saw_non_ascii)
+								pg_verifymbstr(yyextra->literalbuf,
+											   yyextra->literallen,
+											   false);
+							yylval->str = litbufdup(yyscanner);
+							return SCONST;
+						case xus:
+							/* xuend state looks for possible UESCAPE */
+							BEGIN(xuend);
+							break;
+						default:
+							yyerror("unhandled previous state in xqs");
 					}
-					yylval->str = litbuf_udeescape(yytext[yyleng - 2],
-												   yyscanner);
-					return SCONST;
 				}
+
 <xq,xe,xus>{xqdouble} {
 					addlitchar('\'', yyscanner);
 				}
@@ -693,9 +696,6 @@ other			.
 					if (c == '\0' || IS_HIGHBIT_SET(c))
 						yyextra->saw_non_ascii = true;
 				}
-<xq,xe,xus>{quotecontinue} {
-					/* ignore */
-				}
 <xe>.			{
 					/* This is only needed for \ just before EOF */
 					addlitchar(yytext[0], yyscanner);
@@ -770,53 +770,88 @@ other			.
 					return IDENT;
 				}
 <xui>{dquote} {
-					yyless(1);
-					/* xuiend state looks for possible UESCAPE */
-					BEGIN(xuiend);
+					if (yyextra->literallen == 0)
+						yyerror("zero-length delimited identifier");
+
+					/* xuend state looks for possible UESCAPE */
+					yyextra->state_before_lit_stop = YYSTATE;
+					BEGIN(xuend);
 				}
-<xuiend>{whitespace} {
-					/* stay in xuiend state over whitespace */
+
+<xuend,xuchar>{whitespace} {
+					/* stay in xuend or xuchar state over whitespace */
 				}
-<xuiend><<EOF>> |
-<xuiend>{other} |
-<xuiend>{xustop1} {
+<xuend>{uescapefail} |
+<xuend><<EOF>> |
+<xuend>{other}	{
 					/* no UESCAPE after the quote, throw back everything */
-					char	   *ident;
-					int			identlen;
-
 					yyless(0);
 
-					BEGIN(INITIAL);
-					if (yyextra->literallen == 0)
-						yyerror("zero-length delimited identifier");
-					ident = litbuf_udeescape('\\', yyscanner);
-					identlen = strlen(ident);
-					if (identlen >= NAMEDATALEN)
-						truncate_identifier(ident, identlen, true);
-					yylval->str = ident;
-					return IDENT;
+					if (yyextra->state_before_lit_stop == xus)
+					{
+						BEGIN(INITIAL);
+						yylval->str = litbuf_udeescape('\\', yyscanner);
+						return SCONST;
+					}
+					else if (yyextra->state_before_lit_stop == xui)
+					{
+						char	   *ident;
+						int			identlen;
+
+						BEGIN(INITIAL);
+						ident = litbuf_udeescape('\\', yyscanner);
+						identlen = strlen(ident);
+						if (identlen >= NAMEDATALEN)
+							truncate_identifier(ident, identlen, true);
+						yylval->str = ident;
+						return IDENT;
+					}
+					else
+						yyerror("unhandled previous state in xuend");
 				}
-<xuiend>{xustop2}	{
+<xuend>{uescape} {
 					/* found UESCAPE after the end quote */
-					char	   *ident;
-					int			identlen;
-
-					BEGIN(INITIAL);
-					if (yyextra->literallen == 0)
-						yyerror("zero-length delimited identifier");
+					BEGIN(xuchar);
+				}
+<xuchar>{uescchar} {
+					/* found escape character literal after UESCAPE */
 					if (!check_uescapechar(yytext[yyleng - 2]))
 					{
 						SET_YYLLOC();
 						ADVANCE_YYLLOC(yyleng - 2);
 						yyerror("invalid Unicode escape character");
 					}
-					ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner);
-					identlen = strlen(ident);
-					if (identlen >= NAMEDATALEN)
-						truncate_identifier(ident, identlen, true);
-					yylval->str = ident;
-					return IDENT;
+
+					if (yyextra->state_before_lit_stop == xus)
+					{
+						BEGIN(INITIAL);
+						yylval->str = litbuf_udeescape(yytext[yyleng - 2],
+													   yyscanner);
+						return SCONST;
+					}
+					else if (yyextra->state_before_lit_stop == xui)
+					{
+						char	   *ident;
+						int			identlen;
+
+						BEGIN(INITIAL);
+						ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner);
+						identlen = strlen(ident);
+						if (identlen >= NAMEDATALEN)
+							truncate_identifier(ident, identlen, true);
+						yylval->str = ident;
+						return IDENT;
+					}
+					else
+						yyerror("unhandled previous state in xuchar");
 				}
+<xuchar>{uesccharfail} |
+<xuchar><<EOF>> |
+<xuchar>{other} {
+					SET_YYLLOC();
+					yyerror("missing or invalid Unicode escape character");
+				}
+
 <xd,xui>{xddouble}	{
 					addlitchar('"', yyscanner);
 				}
diff --git a/src/fe_utils/psqlscan.l b/src/fe_utils/psqlscan.l
index ce20936339..a66c0f4c6e 100644
--- a/src/fe_utils/psqlscan.l
+++ b/src/fe_utils/psqlscan.l
@@ -114,12 +114,14 @@ extern void psql_yyset_column(int column_no, yyscan_t yyscanner);
  *  <xd> delimited identifiers (double-quoted identifiers)
  *  <xh> hexadecimal numeric string
  *  <xq> standard quoted strings
+ *  <xqs> quote stop (detect continued strings)
  *  <xe> extended quoted strings (support backslash escape sequences)
  *  <xdolq> $foo$ quoted strings
  *  <xui> quoted identifier with Unicode escapes
- *  <xuiend> end of a quoted identifier with Unicode escapes, UESCAPE can follow
  *  <xus> quoted string with Unicode escapes
- *  <xusend> end of a quoted string with Unicode escapes, UESCAPE can follow
+ *  <xuend> end of a quoted string or identifier with Unicode escapes,
+ *    UESCAPE can follow
+ *  <xuchar> expecting escape character literal after UESCAPE
  *
  * Note: we intentionally don't mimic the backend's <xeu> state; we have
  * no need to distinguish it from <xe> state, and no good way to get out
@@ -132,12 +134,13 @@ extern void psql_yyset_column(int column_no, yyscan_t yyscanner);
 %x xd
 %x xh
 %x xq
+%x xqs
 %x xe
 %x xdolq
 %x xui
-%x xuiend
 %x xus
-%x xusend
+%x xuend
+%x xuchar
 
 /*
  * In order to make the world safe for Windows and Mac clients as well as
@@ -177,19 +180,18 @@ special_whitespace		({space}+|{comment}{newline})
 horiz_whitespace		({horiz_space}|{comment})
 whitespace_with_newline	({horiz_whitespace}*{newline}{special_whitespace}*)
 
+quote			'
+/* If we see {quote} then {quotecontinue}, the quoted string continues */
+quotecontinue	{whitespace_with_newline}{quote}
+
 /*
- * To ensure that {quotecontinue} can be scanned without having to back up
- * if the full pattern isn't matched, we include trailing whitespace in
- * {quotestop}.  This matches all cases where {quotecontinue} fails to match,
- * except for {quote} followed by whitespace and just one "-" (not two,
- * which would start a {comment}).  To cover that we have {quotefail}.
- * The actions for {quotestop} and {quotefail} must throw back characters
- * beyond the quote proper.
+ * {quotecontinuefail} is needed to avoid lexer backup when we fail to match
+ * {quotecontinue}.  It might seem that this could just be {whitespace}*,
+ * but if there's a dash after {whitespace_with_newline}, it must be consumed
+ * to see if there's another dash --- which would start a {comment} and thus
+ * allow continuation of the {quotecontinue} token.
  */
-quote			'
-quotestop		{quote}{whitespace}*
-quotecontinue	{quote}{whitespace_with_newline}{quote}
-quotefail		{quote}{whitespace}*"-"
+quotecontinuefail	{whitespace}*"-"?
 
 /* Bit string
  * It is tempting to scan the string for only those characters
@@ -250,10 +252,15 @@ xdstop			{dquote}
 xddouble		{dquote}{dquote}
 xdinside		[^"]+
 
-/* Unicode escapes */
-uescape			[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
+/* Optional UESCAPE after a quoted string or identifier with Unicode escapes */
+uescape			[uU][eE][sS][cC][aA][pP][eE]
 /* error rule to avoid backup */
-uescapefail		[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]
+uescapefail		[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]
+
+/* escape character literal */
+uescchar		{quote}[^']{quote}
+/* error rule to avoid backup */
+uesccharfail	{quote}[^']|{quote}
 
 /* Quoted identifier with Unicode escapes */
 xuistart		[uU]&{dquote}
@@ -261,10 +268,6 @@ xuistart		[uU]&{dquote}
 /* Quoted string with Unicode escapes */
 xusstart		[uU]&{quote}
 
-/* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */
-xustop1		{uescapefail}?
-xustop2		{uescape}
-
 /* error rule to avoid backup */
 xufailed		[uU]&
 
@@ -438,20 +441,10 @@ other			.
 					BEGIN(xb);
 					ECHO;
 				}
-<xb>{quotestop}	|
-<xb>{quotefail} {
-					yyless(1);
-					BEGIN(INITIAL);
-					ECHO;
-				}
 <xh>{xhinside}	|
 <xb>{xbinside}	{
 					ECHO;
 				}
-<xh>{quotecontinue}	|
-<xb>{quotecontinue}	{
-					ECHO;
-				}
 
 {xhstart}		{
 					/* Hexadecimal bit type.
@@ -463,12 +456,6 @@ other			.
 					BEGIN(xh);
 					ECHO;
 				}
-<xh>{quotestop}	|
-<xh>{quotefail} {
-					yyless(1);
-					BEGIN(INITIAL);
-					ECHO;
-				}
 
 {xnstart}		{
 					yyless(1);	/* eat only 'n' this time */
@@ -490,32 +477,59 @@ other			.
 					BEGIN(xus);
 					ECHO;
 				}
-<xq,xe>{quotestop}	|
-<xq,xe>{quotefail} {
-					yyless(1);
-					BEGIN(INITIAL);
-					ECHO;
-				}
-<xus>{quotestop} |
-<xus>{quotefail} {
-					/* throw back all but the quote */
-					yyless(1);
-					BEGIN(xusend);
+
+<xb,xh,xq,xe,xus>{quote} {
+					/*
+					 * When we are scanning a quoted string and see an end
+					 * quote, we must look ahead for a possible continuation.
+					 * If we don't see one, we know the end quote was in fact
+					 * the end of the string.  To reduce the lexer table size,
+					 * we use a single "xqs" state to do the lookahead for all
+					 * types of strings.
+					 */
+					cur_state->state_before_lit_stop = YYSTATE;
+					BEGIN(xqs);
 					ECHO;
 				}
-<xusend>{whitespace} {
+<xqs>{quotecontinue} {
+					/*
+					 * Found a quote continuation, so return to the in-quote
+					 * state and continue scanning the literal.
+					 */
+					BEGIN(cur_state->state_before_lit_stop);
 					ECHO;
 				}
-<xusend>{other} |
-<xusend>{xustop1} {
+<xqs>{quotecontinuefail} |
+<xqs>{other}	{
+					/*
+					 * Failed to see a quote continuation.  Throw back
+					 * everything after the end quote, and enter the start
+					 * condition implied by the state we were in previously.
+					 */
 					yyless(0);
-					BEGIN(INITIAL);
-					ECHO;
-				}
-<xusend>{xustop2} {
-					BEGIN(INITIAL);
-					ECHO;
+
+					switch (cur_state->state_before_lit_stop)
+					{
+						case xb:
+							BEGIN(INITIAL);
+							break;
+						case xh:
+							BEGIN(INITIAL);
+							break;
+						case xq:
+							/* fallthrough */
+						case xe:
+							BEGIN(INITIAL);
+							break;
+						case xus:
+							/* xuend state looks for possible UESCAPE */
+							BEGIN(xuend);
+							break;
+						default:
+							fprintf(stderr, "unhandled previous state in xqs\n");
+					}
 				}
+
 <xq,xe,xus>{xqdouble} {
 					ECHO;
 				}
@@ -540,9 +554,6 @@ other			.
 <xe>{xehexesc}  {
 					ECHO;
 				}
-<xq,xe,xus>{quotecontinue} {
-					ECHO;
-				}
 <xe>.			{
 					/* This is only needed for \ just before EOF */
 					ECHO;
@@ -600,23 +611,39 @@ other			.
 					ECHO;
 				}
 <xui>{dquote} {
-					yyless(1);
-					BEGIN(xuiend);
+					/* xuend state looks for possible UESCAPE */
+					cur_state->state_before_lit_stop = YYSTATE;
+					BEGIN(xuend);
 					ECHO;
 				}
-<xuiend>{whitespace} {
+
+<xuend,xuchar>{whitespace} {
+					/* stay in xuend or xuchar state over whitespace */
 					ECHO;
 				}
-<xuiend>{other} |
-<xuiend>{xustop1} {
+<xuend>{uescapefail} |
+<xuend>{other}	{
+					/* no UESCAPE after the quote, throw back everything */
 					yyless(0);
 					BEGIN(INITIAL);
 					ECHO;
 				}
-<xuiend>{xustop2}	{
+<xuend>{uescape} {
+					/* found UESCAPE after the end quote */
+					BEGIN(xuchar);
+					ECHO;
+				}
+<xuchar>{uescchar} {
+					/* found escape character literal after UESCAPE */
 					BEGIN(INITIAL);
 					ECHO;
 				}
+<xuchar>{uesccharfail} |
+<xuchar>{other} {
+					BEGIN(INITIAL);
+					ECHO;
+				}
+
 <xd,xui>{xddouble}	{
 					ECHO;
 				}
@@ -1084,8 +1111,9 @@ psql_scan(PsqlScanState state,
 			switch (state->start_state)
 			{
 				case INITIAL:
-				case xuiend:	/* we treat these like INITIAL */
-				case xusend:
+				case xqs:		/* we treat these like INITIAL */
+				case xuend:
+				case xuchar:
 					if (state->paren_depth > 0)
 					{
 						result = PSCAN_INCOMPLETE;
@@ -1240,7 +1268,8 @@ psql_scan_reselect_sql_lexer(PsqlScanState state)
 bool
 psql_scan_in_quote(PsqlScanState state)
 {
-	return state->start_state != INITIAL;
+	return state->start_state != INITIAL &&
+			state->start_state != xqs;
 }
 
 /*
diff --git a/src/include/fe_utils/psqlscan_int.h b/src/include/fe_utils/psqlscan_int.h
index 2acb380078..00567c1b1e 100644
--- a/src/include/fe_utils/psqlscan_int.h
+++ b/src/include/fe_utils/psqlscan_int.h
@@ -110,6 +110,7 @@ typedef struct PsqlScanStateData
 	 * and updated with its finishing state on exit.
 	 */
 	int			start_state;	/* yylex's starting/finishing state */
+	int			state_before_lit_stop;	/* start cond. before end quote */
 	int			paren_depth;	/* depth of nesting in parentheses */
 	int			xcdepth;		/* depth of nesting in slash-star comments */
 	char	   *dolqstart;		/* current $foo$ quote start string */
diff --git a/src/include/parser/scanner.h b/src/include/parser/scanner.h
index 731a2bd264..256c1570bf 100644
--- a/src/include/parser/scanner.h
+++ b/src/include/parser/scanner.h
@@ -99,6 +99,7 @@ typedef struct core_yy_extra_type
 	int			literallen;		/* actual current string length */
 	int			literalalloc;	/* current allocated buffer size */
 
+	int			state_before_lit_stop;	/* start cond. before end quote */
 	int			xcdepth;		/* depth of nesting in slash-star comments */
 	char	   *dolqstart;		/* current $foo$ quote start string */
 
diff --git a/src/interfaces/ecpg/preproc/pgc.l b/src/interfaces/ecpg/preproc/pgc.l
index 488c89b7f4..1eefbc05f6 100644
--- a/src/interfaces/ecpg/preproc/pgc.l
+++ b/src/interfaces/ecpg/preproc/pgc.l
@@ -6,6 +6,9 @@
  *
  * This is a modified version of src/backend/parser/scan.l
  *
+ * The ecpg scanner is not backup-free, so the fail rules are
+ * only here to simplify syncing this file with scan.l.
+ *
  *
  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
@@ -61,7 +64,10 @@ static bool isdefine(void);
 static bool isinformixdefine(void);
 
 char *token_start;
-static int state_before;
+
+/* vars to keep track of start conditions when scanning literals */
+static int state_before_lit_start;
+static int state_before_lit_stop;
 
 struct _yy_buffer
 {
@@ -112,14 +118,21 @@ static struct _if_value
  *  <xh> hexadecimal numeric string
  *  <xn> national character quoted strings
  *  <xq> standard quoted strings
+ *  <xqs> quote stop (detect continued strings)
  *  <xe> extended quoted strings (support backslash escape sequences)
  *  <xqc> single-quoted strings in C
  *  <xdolq> $foo$ quoted strings
  *  <xui> quoted identifier with Unicode escapes
  *  <xus> quoted string with Unicode escapes
+ *  <xuend> end of a quoted string or identifier with Unicode escapes,
+ *    UESCAPE can follow
+ *  <xuchar> expecting escape character literal after UESCAPE
  *  <xcond> condition of an EXEC SQL IFDEF construct
  *  <xskip> skipping the inactive part of an EXEC SQL IFDEF construct
  *
+ * Note: we intentionally don't mimic the backend's <xeu> state; we have
+ * no need to distinguish it from <xe> state.
+ *
  * Remember to add an <<EOF>> case whenever you add a new exclusive state!
  * The default one is probably not the right thing.
  */
@@ -132,11 +145,14 @@ static struct _if_value
 %x xh
 %x xn
 %x xq
+%x xqs
 %x xe
 %x xqc
 %x xdolq
 %x xui
 %x xus
+%x xuend
+%x xuchar
 %x xcond
 %x xskip
 
@@ -181,9 +197,17 @@ horiz_whitespace		({horiz_space}|{comment})
 whitespace_with_newline	({horiz_whitespace}*{newline}{whitespace}*)
 
 quote			'
-quotestop		{quote}{whitespace}*
-quotecontinue	{quote}{whitespace_with_newline}{quote}
-quotefail		{quote}{whitespace}*"-"
+/* If we see {quote} then {quotecontinue}, the quoted string continues */
+quotecontinue	{whitespace_with_newline}{quote}
+
+/*
+ * {quotecontinuefail} is needed to avoid lexer backup when we fail to match
+ * {quotecontinue}.  It might seem that this could just be {whitespace}*,
+ * but if there's a dash after {whitespace_with_newline}, it must be consumed
+ * to see if there's another dash --- which would start a {comment} and thus
+ * allow continuation of the {quotecontinue} token.
+ */
+quotecontinuefail	{whitespace}*"-"?
 
 /* Bit string
  */
@@ -237,19 +261,21 @@ xdstop			{dquote}
 xddouble		{dquote}{dquote}
 xdinside		[^"]+
 
-/* Unicode escapes */
-/* (The ecpg scanner is not backup-free, so the fail rules in scan.l are
- * not needed here, but could be added if desired.)
- */
-uescape			[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
+/* Optional UESCAPE after a quoted string or identifier with Unicode escapes */
+uescape			[uU][eE][sS][cC][aA][pP][eE]
+/* error rule to avoid backup */
+uescapefail		[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]
+
+/* escape character literal */
+uescchar		{quote}[^']{quote}
+/* error rule to avoid backup */
+uesccharfail	{quote}[^']|{quote}
 
 /* Quoted identifier with Unicode escapes */
 xuistart		[uU]&{dquote}
-xuistop			{dquote}({whitespace}*{uescape})?
 
 /* Quoted string with Unicode escapes */
 xusstart		[uU]&{quote}
-xusstop			{quote}({whitespace}*{uescape})?
 
 /* special stuff for C strings */
 xdcqq			\\\\
@@ -411,7 +437,7 @@ cppline			{space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
 
 {xcstart}		{
 					token_start = yytext;
-					state_before = YYSTATE;
+					state_before_lit_start = YYSTATE;
 					xcdepth = 0;
 					BEGIN(xcsql);
 					/* Put back any characters past slash-star; see above */
@@ -422,7 +448,7 @@ cppline			{space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
 
 <C>{xcstart}	{
 					token_start = yytext;
-					state_before = YYSTATE;
+					state_before_lit_start = YYSTATE;
 					xcdepth = 0;
 					BEGIN(xcc);
 					/* Put back any characters past slash-star; see above */
@@ -440,7 +466,7 @@ cppline			{space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
 					if (xcdepth <= 0)
 					{
 						ECHO;
-						BEGIN(state_before);
+						BEGIN(state_before_lit_start);
 						token_start = NULL;
 					}
 					else
@@ -451,7 +477,7 @@ cppline			{space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
 				}
 <xcc>{xcstop}	{
 					ECHO;
-					BEGIN(state_before);
+					BEGIN(state_before_lit_start);
 					token_start = NULL;
 				}
 
@@ -482,23 +508,10 @@ cppline			{space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
 				}
 } /* <SQL> */
 
-<xb>{quotestop}	|
-<xb>{quotefail} {
-					yyless(1);
-					BEGIN(SQL);
-					if (literalbuf[strspn(literalbuf, "01") + 1] != '\0')
-						mmerror(PARSE_ERROR, ET_ERROR, "invalid bit string literal");
-					base_yylval.str = mm_strdup(literalbuf);
-					return BCONST;
-				}
 <xh>{xhinside}	|
 <xb>{xbinside}	{
 					addlit(yytext, yyleng);
 				}
-<xh>{quotecontinue}	|
-<xb>{quotecontinue}	{
-					/* ignore */
-				}
 <xb><<EOF>>		{ mmfatal(PARSE_ERROR, "unterminated bit string literal"); }
 
 <SQL>{xhstart}	{
@@ -507,19 +520,11 @@ cppline			{space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
 					startlit();
 					addlitchar('x');
 				}
-<xh>{quotestop}	|
-<xh>{quotefail} {
-					yyless(1);
-					BEGIN(SQL);
-					base_yylval.str = mm_strdup(literalbuf);
-					return XCONST;
-				}
-
 <xh><<EOF>>		{ mmfatal(PARSE_ERROR, "unterminated hexadecimal string literal"); }
 
 <C>{xqstart}	{
 					token_start = yytext;
-					state_before = YYSTATE;
+					state_before_lit_start = YYSTATE;
 					BEGIN(xqc);
 					startlit();
 				}
@@ -530,59 +535,98 @@ cppline			{space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
 					 * Transfer it as-is to the backend.
 					 */
 					token_start = yytext;
-					state_before = YYSTATE;
+					state_before_lit_start = YYSTATE;
 					BEGIN(xn);
 					startlit();
 				}
 
 {xqstart}		{
 					token_start = yytext;
-					state_before = YYSTATE;
+					state_before_lit_start = YYSTATE;
 					BEGIN(xq);
 					startlit();
 				}
 {xestart}		{
 					token_start = yytext;
-					state_before = YYSTATE;
+					state_before_lit_start = YYSTATE;
 					BEGIN(xe);
 					startlit();
 				}
 {xusstart}		{
 					token_start = yytext;
-					state_before = YYSTATE;
+					state_before_lit_start = YYSTATE;
 					BEGIN(xus);
 					startlit();
 					addlit(yytext, yyleng);
 				}
 } /* <SQL> */
 
-<xq,xqc>{quotestop} |
-<xq,xqc>{quotefail} {
-					yyless(1);
-					BEGIN(state_before);
-					base_yylval.str = mm_strdup(literalbuf);
-					return SCONST;
-				}
-<xe>{quotestop} |
-<xe>{quotefail} {
-					yyless(1);
-					BEGIN(state_before);
-					base_yylval.str = mm_strdup(literalbuf);
-					return ECONST;
+<xb,xh,xq,xqc,xe,xn,xus>{quote} {
+					/*
+					 * When we are scanning a quoted string and see an end
+					 * quote, we must look ahead for a possible continuation.
+					 * If we don't see one, we know the end quote was in fact
+					 * the end of the string.  To reduce the lexer table size,
+					 * we use a single "xqs" state to do the lookahead for all
+					 * types of strings.
+					 */
+					state_before_lit_stop = YYSTATE;
+					BEGIN(xqs);
 				}
-<xn>{quotestop} |
-<xn>{quotefail} {
-					yyless(1);
-					BEGIN(state_before);
-					base_yylval.str = mm_strdup(literalbuf);
-					return NCONST;
+<xqs>{quotecontinue} {
+					/*
+					 * Found a quote continuation, so return to the in-quote
+					 * state and continue scanning the literal.
+					 */
+					BEGIN(state_before_lit_stop);
 				}
-<xus>{xusstop} {
-					addlit(yytext, yyleng);
-					BEGIN(state_before);
-					base_yylval.str = mm_strdup(literalbuf);
-					return UCONST;
+<xqs>{quotecontinuefail} |
+<xqs><<EOF>> |
+<xqs>{other}	{
+					/*
+					 * Failed to see a quote continuation.  Throw back
+					 * everything after the end quote, and handle the string
+					 * according to the state we were in previously.
+					 */
+					yyless(0);
+
+					switch (state_before_lit_stop)
+					{
+						case xb:
+							BEGIN(state_before_lit_start);
+							if (literalbuf[strspn(literalbuf, "01") + 1] != '\0')
+								mmerror(PARSE_ERROR, ET_ERROR, "invalid bit string literal");
+							base_yylval.str = mm_strdup(literalbuf);
+							return BCONST;
+						case xh:
+							BEGIN(state_before_lit_start);
+							base_yylval.str = mm_strdup(literalbuf);
+							return XCONST;
+						case xq:
+							/* fallthrough */
+						case xqc:
+							BEGIN(state_before_lit_start);
+							base_yylval.str = mm_strdup(literalbuf);
+							return SCONST;
+						case xe:
+							BEGIN(state_before_lit_start);
+							base_yylval.str = mm_strdup(literalbuf);
+							return ECONST;
+						case xn:
+							BEGIN(state_before_lit_start);
+							base_yylval.str = mm_strdup(literalbuf);
+							return NCONST;
+						case xus:
+							/* xuend state looks for possible UESCAPE */
+							BEGIN(xuend);
+							/* add end quote for the backend */
+							addlitchar('\'');
+							break;
+						default:
+							mmfatal(PARSE_ERROR, "unhandled previous state in xqs\n");
+					}
+				}
+
 <xq,xe,xn,xus>{xqdouble}	{ addlitchar('\''); }
 <xqc>{xqcquote}	{
 					addlitchar('\\');
@@ -604,9 +648,6 @@ cppline			{space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
 <xe>{xehexesc}  {
 					addlit(yytext, yyleng);
 				}
-<xq,xqc,xe,xn,xus>{quotecontinue}	{
-					/* ignore */
-				}
 <xe>.			{
 					/* This is only needed for \ just before EOF */
 					addlitchar(yytext[0]);
@@ -666,12 +707,12 @@ cppline			{space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
 
 <SQL>{
 {xdstart}		{
-					state_before = YYSTATE;
+					state_before_lit_start = YYSTATE;
 					BEGIN(xd);
 					startlit();
 				}
 {xuistart}		{
-					state_before = YYSTATE;
+					state_before_lit_start = YYSTATE;
 					BEGIN(xui);
 					startlit();
 					addlit(yytext, yyleng);
@@ -679,7 +720,7 @@ cppline			{space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
 } /* <SQL> */
 
 <xd>{xdstop}	{
-					BEGIN(state_before);
+					BEGIN(state_before_lit_start);
 					if (literallen == 0)
 						mmerror(PARSE_ERROR, ET_ERROR, "zero-length delimited identifier");
 					/* The backend will truncate the identifier here. We do not as it does not change the result. */
@@ -687,19 +728,85 @@ cppline			{space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
 					return CSTRING;
 				}
 <xdc>{xdstop}	{
-					BEGIN(state_before);
+					BEGIN(state_before_lit_start);
 					base_yylval.str = mm_strdup(literalbuf);
 					return CSTRING;
 				}
-<xui>{xuistop}	{
-					BEGIN(state_before);
+
+<xui>{dquote}	{
 					if (literallen == 2) /* "U&" */
 						mmerror(PARSE_ERROR, ET_ERROR, "zero-length delimited identifier");
-					/* The backend will truncate the identifier here. We do not as it does not change the result. */
+					/* closing quote seen: remember that we came from xui, then let xuend look for a possible UESCAPE */
+					state_before_lit_stop = YYSTATE;
+					BEGIN(xuend);
 					addlit(yytext, yyleng);
-					base_yylval.str = mm_strdup(literalbuf);
-					return UIDENT;
 				}
+
+<xuend,xuchar>{whitespace} {
+					/* skip whitespace without leaving xuend or xuchar; it is not added to the literal */
+				}
+<xuend>{uescapefail} |
+<xuend><<EOF>> |
+<xuend>{other}	{
+					/* no UESCAPE after the quote: throw back everything and finish the literal as it stands */
+					yyless(0);
+					BEGIN(state_before_lit_start);	/* matched text is rescanned in the state saved at literal start */
+
+					if (state_before_lit_stop == xus)	/* quoted string with Unicode escapes */
+					{
+						base_yylval.str = mm_strdup(literalbuf);
+						return UCONST;
+					}
+					else if (state_before_lit_stop == xui)	/* quoted identifier with Unicode escapes */
+					{
+						/*
+						 * The backend will truncate the identifier here.
+						 * We do not as it does not change the result.
+						 */
+						base_yylval.str = mm_strdup(literalbuf);
+						return UIDENT;
+					}
+					else
+						mmfatal(PARSE_ERROR, "unhandled previous state in xuend");
+				}
+<xuend>{uescape} {
+					/* found UESCAPE after the end quote; xuchar scans the escape character next */
+					BEGIN(xuchar);
+					/* normalize whitespace: a single space, then the matched UESCAPE text */
+					addlitchar(' ');
+					addlit(yytext, yyleng);
+				}
+<xuchar>{uescchar} {
+					/* found escape character literal after UESCAPE: the literal is now complete */
+					BEGIN(state_before_lit_start);
+					/* normalize whitespace: a single space, then the escape character literal */
+					addlitchar(' ');
+					addlit(yytext, yyleng);
+
+					if (state_before_lit_stop == xus)	/* quoted string with Unicode escapes */
+					{
+						base_yylval.str = mm_strdup(literalbuf);
+						return UCONST;
+					}
+					else if (state_before_lit_stop == xui)	/* quoted identifier with Unicode escapes */
+					{
+						/*
+						 * The backend will truncate the identifier here.
+						 * We do not as it does not change the result.
+						 */
+						base_yylval.str = mm_strdup(literalbuf);
+						return UIDENT;
+					}
+					else
+						mmfatal(PARSE_ERROR, "unhandled previous state in xuchar");
+				}
+<xuchar>{uesccharfail} |
+<xuchar><<EOF>> |
+<xuchar>{other} {
+					BEGIN(state_before_lit_start);	/* give up on this literal; no token is returned from this action */
+					mmerror(PARSE_ERROR, ET_ERROR, "missing or invalid Unicode escape character");
+				}
+
 <xd,xui>{xddouble}	{
 					addlitchar('"');
 				}
@@ -708,7 +815,7 @@ cppline			{space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
 				}
 <xd,xui><<EOF>>	{ mmfatal(PARSE_ERROR, "unterminated quoted identifier"); }
 <C>{xdstart}	{
-					state_before = YYSTATE;
+					state_before_lit_start = YYSTATE;
 					BEGIN(xdc);
 					startlit();
 				}
-- 
2.17.2 (Apple Git-113)