From 5ea5886fb44e8bc85753400ea4b1375daf8b2d2d Mon Sep 17 00:00:00 2001 From: John Naylor Date: Fri, 12 Jul 2019 13:16:44 +0700 Subject: [PATCH v6 1/2] Reduce the number of states in the core scanner table Previously, the core scanner had 37045 states, which required Flex to use 32-bit types in the yy_transition array. Refactor the Flex rules to reduce the number of states to 22331. With 16-bit types, this shrinks the backend binary by about 200kB. 1. When Flex encounters a quote while inside any kind of quoted string, it saves the current start condition and enters a new one in order to detect possible string continuations. 2. Unify xusend and xuiend into a single start condition to detect a possible UESCAPE. If one is found, enter a new start condition to scan the escape character. Sync psql and ECPG scanners to match. --- src/backend/parser/scan.l | 265 ++++++++++++++++------------ src/fe_utils/psqlscan.l | 169 ++++++++++-------- src/include/fe_utils/psqlscan_int.h | 1 + src/include/parser/scanner.h | 1 + src/interfaces/ecpg/preproc/pgc.l | 263 +++++++++++++++++++-------- 5 files changed, 436 insertions(+), 263 deletions(-) diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index e1cae859e8..d2ccb438f6 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -168,12 +168,14 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner); * delimited identifiers (double-quoted identifiers) * hexadecimal numeric string * standard quoted strings + * quote stop (detect continued strings) * extended quoted strings (support backslash escape sequences) * $foo$ quoted strings * quoted identifier with Unicode escapes - * end of a quoted identifier with Unicode escapes, UESCAPE can follow * quoted string with Unicode escapes - * end of a quoted string with Unicode escapes, UESCAPE can follow + * end of a quoted string or identifier with Unicode escapes, + * UESCAPE can follow + * expecting escape character literal after UESCAPE * 
Unicode surrogate pair in extended quoted string * * Remember to add an <<EOF>> case whenever you add a new exclusive state! @@ -185,12 +187,13 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner); %x xd %x xh %x xq +%x xqs %x xe %x xdolq %x xui -%x xuiend %x xus -%x xusend +%x xuend +%x xuchar %x xeu /* @@ -231,19 +234,18 @@ special_whitespace ({space}+|{comment}{newline}) horiz_whitespace ({horiz_space}|{comment}) whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*) +quote ' +/* If we see {quote} then {quotecontinue}, the quoted string continues */ +quotecontinue {whitespace_with_newline}{quote} + /* - * To ensure that {quotecontinue} can be scanned without having to back up - * if the full pattern isn't matched, we include trailing whitespace in - * {quotestop}. This matches all cases where {quotecontinue} fails to match, - * except for {quote} followed by whitespace and just one "-" (not two, - * which would start a {comment}). To cover that we have {quotefail}. - * The actions for {quotestop} and {quotefail} must throw back characters - * beyond the quote proper. + * {quotecontinuefail} is needed to avoid lexer backup when we fail to match + * {quotecontinue}. It might seem that this could just be {whitespace}*, + * but if there's a dash after {whitespace_with_newline}, it must be consumed + * to see if there's another dash --- which would start a {comment} and thus + * allow continuation of the {quotecontinue} token. */ -quote ' -quotestop {quote}{whitespace}* -quotecontinue {quote}{whitespace_with_newline}{quote} -quotefail {quote}{whitespace}*"-" +quotecontinuefail {whitespace}*"-"?
/* Bit string * It is tempting to scan the string for only those characters @@ -304,10 +306,15 @@ xdstop {dquote} xddouble {dquote}{dquote} xdinside [^"]+ -/* Unicode escapes */ -uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote} +/* Optional UESCAPE after a quoted string or identifier with Unicode escapes */ +uescape [uU][eE][sS][cC][aA][pP][eE] /* error rule to avoid backup */ -uescapefail [uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU] +uescapefail [uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU] + +/* escape character literal */ +uescchar {quote}[^']{quote} +/* error rule to avoid backup */ +uesccharfail {quote}[^']|{quote} /* Quoted identifier with Unicode escapes */ xuistart [uU]&{dquote} @@ -315,10 +322,6 @@ xuistart [uU]&{dquote} /* Quoted string with Unicode escapes */ xusstart [uU]&{quote} -/* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */ -xustop1 {uescapefail}? -xustop2 {uescape} - /* error rule to avoid backup */ xufailed [uU]& @@ -476,21 +479,10 @@ other . startlit(); addlitchar('b', yyscanner); } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(INITIAL); - yylval->str = litbufdup(yyscanner); - return BCONST; - } {xhinside} | {xbinside} { addlit(yytext, yyleng, yyscanner); } -{quotecontinue} | -{quotecontinue} { - /* ignore */ - } <> { yyerror("unterminated bit string literal"); } {xhstart} { @@ -505,13 +497,6 @@ other . startlit(); addlitchar('x', yyscanner); } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(INITIAL); - yylval->str = litbufdup(yyscanner); - return XCONST; - } <> { yyerror("unterminated hexadecimal string literal"); } {xnstart} { @@ -568,53 +553,71 @@ other . 
BEGIN(xus); startlit(); } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(INITIAL); + +{quote} { /* - * check that the data remains valid if it might have been - * made invalid by unescaping any chars. + * When we are scanning a quoted string and see an end + * quote, we must look ahead for a possible continuation. + * If we don't see one, we know the end quote was in fact + * the end of the string. To reduce the lexer table size, + * we use a single "xqs" state to do the lookahead for all + * types of strings. */ - if (yyextra->saw_non_ascii) - pg_verifymbstr(yyextra->literalbuf, - yyextra->literallen, - false); - yylval->str = litbufdup(yyscanner); - return SCONST; - } -{quotestop} | -{quotefail} { - /* throw back all but the quote */ - yyless(1); - /* xusend state looks for possible UESCAPE */ - BEGIN(xusend); + yyextra->state_before_lit_stop = YYSTATE; + BEGIN(xqs); } -{whitespace} { - /* stay in xusend state over whitespace */ +{quotecontinue} { + /* + * Found a quote continuation, so return to the in-quote + * state and continue scanning the literal. + */ + BEGIN(yyextra->state_before_lit_stop); } -<> | -{other} | -{xustop1} { - /* no UESCAPE after the quote, throw back everything */ +{quotecontinuefail} | +<> | +{other} { + /* + * Failed to see a quote continuation. Throw back + * everything after the end quote, and handle the string + * according to the state we were in previously. 
+ */ yyless(0); - BEGIN(INITIAL); - yylval->str = litbuf_udeescape('\\', yyscanner); - return SCONST; - } -{xustop2} { - /* found UESCAPE after the end quote */ - BEGIN(INITIAL); - if (!check_uescapechar(yytext[yyleng - 2])) + + switch (yyextra->state_before_lit_stop) { - SET_YYLLOC(); - ADVANCE_YYLLOC(yyleng - 2); - yyerror("invalid Unicode escape character"); + case xb: + BEGIN(INITIAL); + yylval->str = litbufdup(yyscanner); + return BCONST; + case xh: + BEGIN(INITIAL); + yylval->str = litbufdup(yyscanner); + return XCONST; + case xq: + /* fallthrough */ + case xe: + BEGIN(INITIAL); + + /* + * Check that the data remains valid if it + * might have been made invalid by unescaping + * any chars. + */ + if (yyextra->saw_non_ascii) + pg_verifymbstr(yyextra->literalbuf, + yyextra->literallen, + false); + yylval->str = litbufdup(yyscanner); + return SCONST; + case xus: + /* xuend state looks for possible UESCAPE */ + BEGIN(xuend); + break; + default: + yyerror("unhandled previous state in xqs"); } - yylval->str = litbuf_udeescape(yytext[yyleng - 2], - yyscanner); - return SCONST; } + {xqdouble} { addlitchar('\'', yyscanner); } @@ -693,9 +696,6 @@ other . if (c == '\0' || IS_HIGHBIT_SET(c)) yyextra->saw_non_ascii = true; } -{quotecontinue} { - /* ignore */ - } . { /* This is only needed for \ just before EOF */ addlitchar(yytext[0], yyscanner); @@ -770,53 +770,88 @@ other . 
return IDENT; } {dquote} { - yyless(1); - /* xuiend state looks for possible UESCAPE */ - BEGIN(xuiend); + if (yyextra->literallen == 0) + yyerror("zero-length delimited identifier"); + + /* xuend state looks for possible UESCAPE */ + yyextra->state_before_lit_stop = YYSTATE; + BEGIN(xuend); } -{whitespace} { - /* stay in xuiend state over whitespace */ + +{whitespace} { + /* stay in xuend or xuchar state over whitespace */ } -<> | -{other} | -{xustop1} { +{uescapefail} | +<> | +{other} { /* no UESCAPE after the quote, throw back everything */ - char *ident; - int identlen; - yyless(0); - BEGIN(INITIAL); - if (yyextra->literallen == 0) - yyerror("zero-length delimited identifier"); - ident = litbuf_udeescape('\\', yyscanner); - identlen = strlen(ident); - if (identlen >= NAMEDATALEN) - truncate_identifier(ident, identlen, true); - yylval->str = ident; - return IDENT; + if (yyextra->state_before_lit_stop == xus) + { + BEGIN(INITIAL); + yylval->str = litbuf_udeescape('\\', yyscanner); + return SCONST; + } + else if (yyextra->state_before_lit_stop == xui) + { + char *ident; + int identlen; + + BEGIN(INITIAL); + ident = litbuf_udeescape('\\', yyscanner); + identlen = strlen(ident); + if (identlen >= NAMEDATALEN) + truncate_identifier(ident, identlen, true); + yylval->str = ident; + return IDENT; + } + else + yyerror("unhandled previous state in xuend"); } -{xustop2} { +{uescape} { /* found UESCAPE after the end quote */ - char *ident; - int identlen; - - BEGIN(INITIAL); - if (yyextra->literallen == 0) - yyerror("zero-length delimited identifier"); + BEGIN(xuchar); + } +{uescchar} { + /* found escape character literal after UESCAPE */ if (!check_uescapechar(yytext[yyleng - 2])) { SET_YYLLOC(); ADVANCE_YYLLOC(yyleng - 2); yyerror("invalid Unicode escape character"); } - ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner); - identlen = strlen(ident); - if (identlen >= NAMEDATALEN) - truncate_identifier(ident, identlen, true); - yylval->str = ident; - return IDENT; + + 
if (yyextra->state_before_lit_stop == xus) + { + BEGIN(INITIAL); + yylval->str = litbuf_udeescape(yytext[yyleng - 2], + yyscanner); + return SCONST; + } + else if (yyextra->state_before_lit_stop == xui) + { + char *ident; + int identlen; + + BEGIN(INITIAL); + ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner); + identlen = strlen(ident); + if (identlen >= NAMEDATALEN) + truncate_identifier(ident, identlen, true); + yylval->str = ident; + return IDENT; + } + else + yyerror("unhandled previous state in xuchar"); } +{uesccharfail} | +<> | +{other} { + SET_YYLLOC(); + yyerror("missing or invalid Unicode escape character"); + } + {xddouble} { addlitchar('"', yyscanner); } diff --git a/src/fe_utils/psqlscan.l b/src/fe_utils/psqlscan.l index ce20936339..a66c0f4c6e 100644 --- a/src/fe_utils/psqlscan.l +++ b/src/fe_utils/psqlscan.l @@ -114,12 +114,14 @@ extern void psql_yyset_column(int column_no, yyscan_t yyscanner); * delimited identifiers (double-quoted identifiers) * hexadecimal numeric string * standard quoted strings + * quote stop (detect continued strings) * extended quoted strings (support backslash escape sequences) * $foo$ quoted strings * quoted identifier with Unicode escapes - * end of a quoted identifier with Unicode escapes, UESCAPE can follow * quoted string with Unicode escapes - * end of a quoted string with Unicode escapes, UESCAPE can follow + * end of a quoted string or identifier with Unicode escapes, + * UESCAPE can follow + * expecting escape character literal after UESCAPE * * Note: we intentionally don't mimic the backend's state; we have * no need to distinguish it from state, and no good way to get out @@ -132,12 +134,13 @@ extern void psql_yyset_column(int column_no, yyscan_t yyscanner); %x xd %x xh %x xq +%x xqs %x xe %x xdolq %x xui -%x xuiend %x xus -%x xusend +%x xuend +%x xuchar /* * In order to make the world safe for Windows and Mac clients as well as @@ -177,19 +180,18 @@ special_whitespace ({space}+|{comment}{newline}) 
horiz_whitespace ({horiz_space}|{comment}) whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*) +quote ' +/* If we see {quote} then {quotecontinue}, the quoted string continues */ +quotecontinue {whitespace_with_newline}{quote} + /* - * To ensure that {quotecontinue} can be scanned without having to back up - * if the full pattern isn't matched, we include trailing whitespace in - * {quotestop}. This matches all cases where {quotecontinue} fails to match, - * except for {quote} followed by whitespace and just one "-" (not two, - * which would start a {comment}). To cover that we have {quotefail}. - * The actions for {quotestop} and {quotefail} must throw back characters - * beyond the quote proper. + * {quotecontinuefail} is needed to avoid lexer backup when we fail to match + * {quotecontinue}. It might seem that this could just be {whitespace}*, + * but if there's a dash after {whitespace_with_newline}, it must be consumed + * to see if there's another dash --- which would start a {comment} and thus + * allow continuation of the {quotecontinue} token. */ -quote ' -quotestop {quote}{whitespace}* -quotecontinue {quote}{whitespace_with_newline}{quote} -quotefail {quote}{whitespace}*"-" +quotecontinuefail {whitespace}*"-"? 
/* Bit string * It is tempting to scan the string for only those characters @@ -250,10 +252,15 @@ xdstop {dquote} xddouble {dquote}{dquote} xdinside [^"]+ -/* Unicode escapes */ -uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote} +/* Optional UESCAPE after a quoted string or identifier with Unicode escapes */ +uescape [uU][eE][sS][cC][aA][pP][eE] /* error rule to avoid backup */ -uescapefail [uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU] +uescapefail [uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU] + +/* escape character literal */ +uescchar {quote}[^']{quote} +/* error rule to avoid backup */ +uesccharfail {quote}[^']|{quote} /* Quoted identifier with Unicode escapes */ xuistart [uU]&{dquote} @@ -261,10 +268,6 @@ xuistart [uU]&{dquote} /* Quoted string with Unicode escapes */ xusstart [uU]&{quote} -/* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */ -xustop1 {uescapefail}? -xustop2 {uescape} - /* error rule to avoid backup */ xufailed [uU]& @@ -438,20 +441,10 @@ other . BEGIN(xb); ECHO; } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(INITIAL); - ECHO; - } {xhinside} | {xbinside} { ECHO; } -{quotecontinue} | -{quotecontinue} { - ECHO; - } {xhstart} { /* Hexadecimal bit type. @@ -463,12 +456,6 @@ other . BEGIN(xh); ECHO; } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(INITIAL); - ECHO; - } {xnstart} { yyless(1); /* eat only 'n' this time */ @@ -490,32 +477,59 @@ other . 
BEGIN(xus); ECHO; } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(INITIAL); - ECHO; - } -{quotestop} | -{quotefail} { - /* throw back all but the quote */ - yyless(1); - BEGIN(xusend); + +{quote} { + /* + * When we are scanning a quoted string and see an end + * quote, we must look ahead for a possible continuation. + * If we don't see one, we know the end quote was in fact + * the end of the string. To reduce the lexer table size, + * we use a single "xqs" state to do the lookahead for all + * types of strings. + */ + cur_state->state_before_lit_stop = YYSTATE; + BEGIN(xqs); ECHO; } -{whitespace} { +{quotecontinue} { + /* + * Found a quote continuation, so return to the in-quote + * state and continue scanning the literal. + */ + BEGIN(cur_state->state_before_lit_stop); ECHO; } -{other} | -{xustop1} { +{quotecontinuefail} | +{other} { + /* + * Failed to see a quote continuation. Throw back + * everything after the end quote, and enter start condition + * according to the state we were in previously. + */ yyless(0); - BEGIN(INITIAL); - ECHO; - } -{xustop2} { - BEGIN(INITIAL); - ECHO; + + switch (cur_state->state_before_lit_stop) + { + case xb: + BEGIN(INITIAL); + break; + case xh: + BEGIN(INITIAL); + break; + case xq: + /* fallthrough */ + case xe: + BEGIN(INITIAL); + break; + case xus: + /* xuend state looks for possible UESCAPE */ + BEGIN(xuend); + break; + default: + fprintf(stderr, "unhandled previous state in xqs\n"); + } } + {xqdouble} { ECHO; } @@ -540,9 +554,6 @@ other . {xehexesc} { ECHO; } -{quotecontinue} { - ECHO; - } . { /* This is only needed for \ just before EOF */ ECHO; @@ -600,23 +611,39 @@ other .
ECHO; } {dquote} { - yyless(1); - BEGIN(xuiend); + /* xuend state looks for possible UESCAPE */ + cur_state->state_before_lit_stop = YYSTATE; + BEGIN(xuend); ECHO; } -{whitespace} { + +{whitespace} { + /* stay in xuend or xuchar state over whitespace */ ECHO; } -{other} | -{xustop1} { +{uescapefail} | +{other} { + /* no UESCAPE after the quote, throw back everything */ yyless(0); BEGIN(INITIAL); ECHO; } -{xustop2} { +{uescape} { + /* found UESCAPE after the end quote */ + BEGIN(xuchar); + ECHO; + } +{uescchar} { + /* found escape character literal after UESCAPE */ BEGIN(INITIAL); ECHO; } +{uesccharfail} | +{other} { + BEGIN(INITIAL); + ECHO; + } + {xddouble} { ECHO; } @@ -1084,8 +1111,9 @@ psql_scan(PsqlScanState state, switch (state->start_state) { case INITIAL: - case xuiend: /* we treat these like INITIAL */ - case xusend: + case xqs: /* we treat these like INITIAL */ + case xuend: + case xuchar: if (state->paren_depth > 0) { result = PSCAN_INCOMPLETE; @@ -1240,7 +1268,8 @@ psql_scan_reselect_sql_lexer(PsqlScanState state) bool psql_scan_in_quote(PsqlScanState state) { - return state->start_state != INITIAL; + return state->start_state != INITIAL && + state->start_state != xqs; } /* diff --git a/src/include/fe_utils/psqlscan_int.h b/src/include/fe_utils/psqlscan_int.h index 2acb380078..00567c1b1e 100644 --- a/src/include/fe_utils/psqlscan_int.h +++ b/src/include/fe_utils/psqlscan_int.h @@ -110,6 +110,7 @@ typedef struct PsqlScanStateData * and updated with its finishing state on exit. */ int start_state; /* yylex's starting/finishing state */ + int state_before_lit_stop; /* start cond.
before end quote */ int paren_depth; /* depth of nesting in parentheses */ int xcdepth; /* depth of nesting in slash-star comments */ char *dolqstart; /* current $foo$ quote start string */ diff --git a/src/include/parser/scanner.h b/src/include/parser/scanner.h index 731a2bd264..256c1570bf 100644 --- a/src/include/parser/scanner.h +++ b/src/include/parser/scanner.h @@ -99,6 +99,7 @@ typedef struct core_yy_extra_type int literallen; /* actual current string length */ int literalalloc; /* current allocated buffer size */ + int state_before_lit_stop; /* start cond. before end quote */ int xcdepth; /* depth of nesting in slash-star comments */ char *dolqstart; /* current $foo$ quote start string */ diff --git a/src/interfaces/ecpg/preproc/pgc.l b/src/interfaces/ecpg/preproc/pgc.l index 488c89b7f4..1eefbc05f6 100644 --- a/src/interfaces/ecpg/preproc/pgc.l +++ b/src/interfaces/ecpg/preproc/pgc.l @@ -6,6 +6,9 @@ * * This is a modified version of src/backend/parser/scan.l * + * The ecpg scanner is not backup-free, so the fail rules are + * only here to simplify syncing this file with scan.l. 
+ * * * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -61,7 +64,10 @@ static bool isdefine(void); static bool isinformixdefine(void); char *token_start; -static int state_before; + +/* vars to keep track of start conditions when scanning literals */ +static int state_before_lit_start; +static int state_before_lit_stop; struct _yy_buffer { @@ -112,14 +118,21 @@ static struct _if_value * <xh> hexadecimal numeric string * <xn> national character quoted strings * <xq> standard quoted strings + * <xqs> quote stop (detect continued strings) * <xe> extended quoted strings (support backslash escape sequences) * <xqc> single-quoted strings in C * <xdolq> $foo$ quoted strings * <xui> quoted identifier with Unicode escapes * <xus> quoted string with Unicode escapes + * <xuend> end of a quoted string or identifier with Unicode escapes, + * UESCAPE can follow + * <xuchar> expecting escape character literal after UESCAPE * <xcond> condition of an EXEC SQL IFDEF construct * <xskip> skipping the inactive part of an EXEC SQL IFDEF construct * + * Note: we intentionally don't mimic the backend's <xeu> state; we have + * no need to distinguish it from <xe> state. + * * Remember to add an <<EOF>> case whenever you add a new exclusive state! * The default one is probably not the right thing. */ @@ -132,11 +145,14 @@ static struct _if_value %x xh %x xn %x xq +%x xqs %x xe %x xqc %x xdolq %x xui %x xus +%x xuend +%x xuchar %x xcond %x xskip @@ -181,9 +197,17 @@ horiz_whitespace ({horiz_space}|{comment}) whitespace_with_newline ({horiz_whitespace}*{newline}{whitespace}*) quote ' -quotestop {quote}{whitespace}* -quotecontinue {quote}{whitespace_with_newline}{quote} -quotefail {quote}{whitespace}*"-" +/* If we see {quote} then {quotecontinue}, the quoted string continues */ +quotecontinue {whitespace_with_newline}{quote} + +/* + * {quotecontinuefail} is needed to avoid lexer backup when we fail to match + * {quotecontinue}.
It might seem that this could just be {whitespace}*, + * but if there's a dash after {whitespace_with_newline}, it must be consumed + * to see if there's another dash --- which would start a {comment} and thus + * allow continuation of the {quotecontinue} token. + */ +quotecontinuefail {whitespace}*"-"? /* Bit string */ @@ -237,19 +261,21 @@ xdstop {dquote} xddouble {dquote}{dquote} xdinside [^"]+ -/* Unicode escapes */ -/* (The ecpg scanner is not backup-free, so the fail rules in scan.l are - * not needed here, but could be added if desired.) - */ -uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote} +/* Optional UESCAPE after a quoted string or identifier with Unicode escapes */ +uescape [uU][eE][sS][cC][aA][pP][eE] +/* error rule to avoid backup */ +uescapefail [uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU] + +/* escape character literal */ +uescchar {quote}[^']{quote} +/* error rule to avoid backup */ +uesccharfail {quote}[^']|{quote} /* Quoted identifier with Unicode escapes */ xuistart [uU]&{dquote} -xuistop {dquote}({whitespace}*{uescape})? /* Quoted string with Unicode escapes */ xusstart [uU]&{quote} -xusstop {quote}({whitespace}*{uescape})? 
/* special stuff for C strings */ xdcqq \\\\ @@ -411,7 +437,7 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ {xcstart} { token_start = yytext; - state_before = YYSTATE; + state_before_lit_start = YYSTATE; xcdepth = 0; BEGIN(xcsql); /* Put back any characters past slash-star; see above */ @@ -422,7 +448,7 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ {xcstart} { token_start = yytext; - state_before = YYSTATE; + state_before_lit_start = YYSTATE; xcdepth = 0; BEGIN(xcc); /* Put back any characters past slash-star; see above */ @@ -440,7 +466,7 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ if (xcdepth <= 0) { ECHO; - BEGIN(state_before); + BEGIN(state_before_lit_start); token_start = NULL; } else @@ -451,7 +477,7 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ } {xcstop} { ECHO; - BEGIN(state_before); + BEGIN(state_before_lit_start); token_start = NULL; } @@ -482,23 +508,10 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ } } /* */ -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(SQL); - if (literalbuf[strspn(literalbuf, "01") + 1] != '\0') - mmerror(PARSE_ERROR, ET_ERROR, "invalid bit string literal"); - base_yylval.str = mm_strdup(literalbuf); - return BCONST; - } {xhinside} | {xbinside} { addlit(yytext, yyleng); } -{quotecontinue} | -{quotecontinue} { - /* ignore */ - } <> { mmfatal(PARSE_ERROR, "unterminated bit string literal"); } {xhstart} { @@ -507,19 +520,11 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ startlit(); addlitchar('x'); } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(SQL); - base_yylval.str = mm_strdup(literalbuf); - return XCONST; - } - <> { mmfatal(PARSE_ERROR, "unterminated hexadecimal string literal"); } {xqstart} { token_start = yytext; - state_before = YYSTATE; + state_before_lit_start = YYSTATE; BEGIN(xqc); startlit(); } @@ 
-530,59 +535,98 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ * Transfer it as-is to the backend. */ token_start = yytext; - state_before = YYSTATE; + state_before_lit_start = YYSTATE; BEGIN(xn); startlit(); } {xqstart} { token_start = yytext; - state_before = YYSTATE; + state_before_lit_start = YYSTATE; BEGIN(xq); startlit(); } {xestart} { token_start = yytext; - state_before = YYSTATE; + state_before_lit_start = YYSTATE; BEGIN(xe); startlit(); } {xusstart} { token_start = yytext; - state_before = YYSTATE; + state_before_lit_start = YYSTATE; BEGIN(xus); startlit(); addlit(yytext, yyleng); } } /* */ -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(state_before); - base_yylval.str = mm_strdup(literalbuf); - return SCONST; - } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(state_before); - base_yylval.str = mm_strdup(literalbuf); - return ECONST; +{quote} { + /* + * When we are scanning a quoted string and see an end + * quote, we must look ahead for a possible continuation. + * If we don't see one, we know the end quote was in fact + * the end of the string. To reduce the lexer table size, + * we use a single "xqs" state to do the lookahead for all + * types of strings. + */ + state_before_lit_stop = YYSTATE; + BEGIN(xqs); } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(state_before); - base_yylval.str = mm_strdup(literalbuf); - return NCONST; +{quotecontinue} { + /* + * Found a quote continuation, so return to the in-quote + * state and continue scanning the literal. + */ + BEGIN(state_before_lit_stop); } -{xusstop} { - addlit(yytext, yyleng); - BEGIN(state_before); - base_yylval.str = mm_strdup(literalbuf); - return UCONST; +{quotecontinuefail} | +<> | +{other} { + /* + * Failed to see a quote continuation. Throw back + * everything after the end quote, and handle the string + * according to the state we were in previously. 
+ */ + yyless(0); + + switch (state_before_lit_stop) + { + case xb: + BEGIN(state_before_lit_start); + if (literalbuf[strspn(literalbuf, "01") + 1] != '\0') + mmerror(PARSE_ERROR, ET_ERROR, "invalid bit string literal"); + base_yylval.str = mm_strdup(literalbuf); + return BCONST; + case xh: + BEGIN(state_before_lit_start); + base_yylval.str = mm_strdup(literalbuf); + return XCONST; + case xq: + /* fallthrough */ + case xqc: + BEGIN(state_before_lit_start); + base_yylval.str = mm_strdup(literalbuf); + return SCONST; + case xe: + BEGIN(state_before_lit_start); + base_yylval.str = mm_strdup(literalbuf); + return ECONST; + case xn: + BEGIN(state_before_lit_start); + base_yylval.str = mm_strdup(literalbuf); + return NCONST; + case xus: + /* xuend state looks for possible UESCAPE */ + BEGIN(xuend); + /* add end quote for the backend */ + addlitchar('\''); + break; + default: + mmfatal(PARSE_ERROR, "unhandled previous state in xqs"); + } } + {xqdouble} { addlitchar('\''); } {xqcquote} { addlitchar('\\'); @@ -604,9 +648,6 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ {xehexesc} { addlit(yytext, yyleng); } -{quotecontinue} { - /* ignore */ - } . { /* This is only needed for \ just before EOF */ addlitchar(yytext[0]); @@ -666,12 +707,12 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ { {xdstart} { - state_before = YYSTATE; + state_before_lit_start = YYSTATE; BEGIN(xd); startlit(); } {xuistart} { - state_before = YYSTATE; + state_before_lit_start = YYSTATE; BEGIN(xui); startlit(); addlit(yytext, yyleng); @@ -679,7 +720,7 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ } /* */ {xdstop} { - BEGIN(state_before); + BEGIN(state_before_lit_start); if (literallen == 0) mmerror(PARSE_ERROR, ET_ERROR, "zero-length delimited identifier"); /* The backend will truncate the identifier here. We do not as it does not change the result.
*/ @@ -687,19 +728,85 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ return CSTRING; } {xdstop} { - BEGIN(state_before); + BEGIN(state_before_lit_start); base_yylval.str = mm_strdup(literalbuf); return CSTRING; } -{xuistop} { - BEGIN(state_before); + +{dquote} { if (literallen == 2) /* "U&" */ mmerror(PARSE_ERROR, ET_ERROR, "zero-length delimited identifier"); - /* The backend will truncate the identifier here. We do not as it does not change the result. */ + /* xuend state looks for possible UESCAPE */ + state_before_lit_stop = YYSTATE; + BEGIN(xuend); addlit(yytext, yyleng); - base_yylval.str = mm_strdup(literalbuf); - return UIDENT; } + +{whitespace} { + /* stay in xuend or xuchar state over whitespace */ + } +{uescapefail} | +<> | +{other} { + /* no UESCAPE after the quote, throw back everything */ + yyless(0); + BEGIN(state_before_lit_start); + + if (state_before_lit_stop == xus) + { + base_yylval.str = mm_strdup(literalbuf); + return UCONST; + } + else if (state_before_lit_stop == xui) + { + /* + * The backend will truncate the identifier here. + * We do not as it does not change the result. + */ + base_yylval.str = mm_strdup(literalbuf); + return UIDENT; + } + else + mmfatal(PARSE_ERROR, "unhandled previous state in xuend"); + } +{uescape} { + /* found UESCAPE after the end quote */ + BEGIN(xuchar); + /* normalize whitespace */ + addlitchar(' '); + addlit(yytext, yyleng); + } +{uescchar} { + /* found escape character literal after UESCAPE */ + BEGIN(state_before_lit_start); + /* normalize whitespace */ + addlitchar(' '); + addlit(yytext, yyleng); + + if (state_before_lit_stop == xus) + { + base_yylval.str = mm_strdup(literalbuf); + return UCONST; + } + else if (state_before_lit_stop == xui) + { + /* + * The backend will truncate the identifier here. + * We do not as it does not change the result. 
+ */ + base_yylval.str = mm_strdup(literalbuf); + return UIDENT; + } + else + mmfatal(PARSE_ERROR, "unhandled previous state in xuchar"); + } +{uesccharfail} | +<> | +{other} { + BEGIN(state_before_lit_start); + mmerror(PARSE_ERROR, ET_ERROR, "missing or invalid Unicode escape character"); + } + {xddouble} { addlitchar('"'); } @@ -708,7 +815,7 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ } <> { mmfatal(PARSE_ERROR, "unterminated quoted identifier"); } {xdstart} { - state_before = YYSTATE; + state_before_lit_start = YYSTATE; BEGIN(xdc); startlit(); } -- 2.17.2 (Apple Git-113)