From a5ff502fceadc7c203b0d7a11b45c73f1b421f69 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 14 Mar 2013 19:00:09 +0200 Subject: [PATCH] Change the way UESCAPE is lexed, to reduce the size of the flex tables. The error rule used to avoid backtracking with the U&'...' UESCAPE 'x' syntax bloated the flex tables, so refactor that. This patch makes the error rule shorter, by introducing a new exclusive flex state that's entered after parsing U&'...'. This shrinks the postgres binary by about 220kB. --- src/backend/parser/scan.l | 81 ++++++++++++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 19 deletions(-) diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index 23c83c4fd9..92f38a2a07 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -97,6 +97,7 @@ static bool is_utf16_surrogate_first(pg_wchar c); static bool is_utf16_surrogate_second(pg_wchar c); static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second); static void addunicode(pg_wchar c, yyscan_t yyscanner); +static bool check_uescapechar(unsigned char escape); #define yyerror(msg) scanner_yyerror(msg, yyscanner) @@ -150,7 +151,9 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner); * extended quoted strings (support backslash escape sequences) * $foo$ quoted strings * quoted identifier with Unicode escapes + * end of a quoted identifier with Unicode escapes, UESCAPE can follow * quoted string with Unicode escapes + * end of a quoted string with Unicode escapes, UESCAPE can follow * Unicode surrogate pair in extended quoted string */ @@ -162,7 +165,9 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner); %x xq %x xdolq %x xui +%x xuiend %x xus +%x xusend %x xeu /* @@ -279,17 +284,17 @@ xdinside [^"]+ /* Unicode escapes */ uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote} /* error rule to avoid backup */ -uescapefail ("-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]) +uescapefail [uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU] /* Quoted identifier with Unicode escapes */ xuistart [uU]&{dquote} -xuistop1 {dquote}{whitespace}*{uescapefail}? -xuistop2 {dquote}{whitespace}*{uescape} /* Quoted string with Unicode escapes */ xusstart [uU]&{quote} -xusstop1 {quote}{whitespace}*{uescapefail}? -xusstop2 {quote}{whitespace}*{uescape} + +/* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */ +xustop1 {uescapefail}? +xustop2 {uescape} /* error rule to avoid backup */ xufailed [uU]& @@ -536,15 +541,31 @@ other . yylval->str = litbufdup(yyscanner); return SCONST; } -{xusstop1} { +{quotestop} | +{quotefail} { /* throw back all but the quote */ yyless(1); + /* handle possible UESCAPE in xusend mode */ + BEGIN(xusend); + } +{whitespace} +{other} | +{xustop1} { + /* no UESCAPE after the quote, throw back everything */ + yyless(0); BEGIN(INITIAL); yylval->str = litbuf_udeescape('\\', yyscanner); return SCONST; } -{xusstop2} { +{xustop2} { + /* found UESCAPE after the end quote */ BEGIN(INITIAL); + if (!check_uescapechar(yytext[yyleng-2])) + { + SET_YYLLOC(); + ADVANCE_YYLLOC(yyleng-2); + yyerror("invalid Unicode escape character"); + } yylval->str = litbuf_udeescape(yytext[yyleng-2], yyscanner); return SCONST; } @@ -702,9 +723,19 @@ other . yylval->str = ident; return IDENT; } -{xuistop1} { +{dquote} { + yyless(1); + /* handle possible UESCAPE in xuiend mode */ + BEGIN(xuiend); + } +{whitespace} { } +{other} | +{xustop1} { + /* no UESCAPE after the quote, throw back everything */ char *ident; + yyless(0); + BEGIN(INITIAL); if (yyextra->literallen == 0) yyerror("zero-length delimited identifier"); @@ -712,16 +743,21 @@ other . if (yyextra->literallen >= NAMEDATALEN) truncate_identifier(ident, yyextra->literallen, true); yylval->str = ident; - /* throw back all but the quote */ - yyless(1); return IDENT; } -{xuistop2} { +{xustop2} { + /* found UESCAPE after the end quote */ char *ident; BEGIN(INITIAL); if (yyextra->literallen == 0) yyerror("zero-length delimited identifier"); + if (!check_uescapechar(yytext[yyleng-2])) + { + SET_YYLLOC(); + ADVANCE_YYLLOC(yyleng-2); + yyerror("invalid Unicode escape character"); + } ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner); if (yyextra->literallen >= NAMEDATALEN) truncate_identifier(ident, yyextra->literallen, true); @@ -1203,22 +1239,29 @@ addunicode(pg_wchar c, core_yyscan_t yyscanner) addlit(buf, pg_mblen(buf), yyscanner); } -static char * -litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner) +/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */ +static bool +check_uescapechar(unsigned char escape) { - char *new; - char *litbuf, *in, *out; - pg_wchar pair_first = 0; - if (isxdigit(escape) || escape == '+' || escape == '\'' || escape == '"' || scanner_isspace(escape)) { - ADVANCE_YYLLOC(yyextra->literallen + yyleng + 1); - yyerror("invalid Unicode escape character"); + return false; } + else + return true; +} + +/* like litbufdup, but handle unicode escapes */ +static char * +litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner) +{ + char *new; + char *litbuf, *in, *out; + pg_wchar pair_first = 0; /* Make literalbuf null-terminated to simplify the scanning loop */ litbuf = yyextra->literalbuf;