From a5ff502fceadc7c203b0d7a11b45c73f1b421f69 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Thu, 14 Mar 2013 19:00:09 +0200
Subject: [PATCH] Change the way UESCAPE is lexed, to reduce the size of the
 flex tables.

The error rule used to avoid backtracking with the U&'...' UESCAPE 'x'
syntax bloated the flex tables, so refactor that. This patch makes the error
rule shorter, by introducing a new exclusive flex state that's entered after
parsing U&'...'. This shrinks the postgres binary by about 220kB.
---
 src/backend/parser/scan.l | 81 ++++++++++++++++++++++++++++++---------
 1 file changed, 62 insertions(+), 19 deletions(-)
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index 23c83c4fd9..92f38a2a07 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -97,6 +97,7 @@ static bool is_utf16_surrogate_first(pg_wchar c);
 static bool is_utf16_surrogate_second(pg_wchar c);
 static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second);
 static void addunicode(pg_wchar c, yyscan_t yyscanner);
+static bool check_uescapechar(unsigned char escape);
 
 #define yyerror(msg)  scanner_yyerror(msg, yyscanner)
 
@@ -150,7 +151,9 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner);
  *  <xe> extended quoted strings (support backslash escape sequences)
  *  <xdolq> $foo$ quoted strings
  *  <xui> quoted identifier with Unicode escapes
+ *  <xuiend> end of a quoted identifier with Unicode escapes, UESCAPE can follow
  *  <xus> quoted string with Unicode escapes
+ *  <xusend> end of a quoted string with Unicode escapes, UESCAPE can follow
  *  <xeu> Unicode surrogate pair in extended quoted string
  */
 
@@ -162,7 +165,9 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner);
 %x xq
 %x xdolq
 %x xui
+%x xuiend
 %x xus
+%x xusend
 %x xeu
 
 /*
@@ -279,17 +284,17 @@ xdinside		[^"]+
 /* Unicode escapes */
 uescape			[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
 /* error rule to avoid backup */
-uescapefail		("-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU])
+uescapefail		[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]
 
 /* Quoted identifier with Unicode escapes */
 xuistart		[uU]&{dquote}
-xuistop1		{dquote}{whitespace}*{uescapefail}?
-xuistop2		{dquote}{whitespace}*{uescape}
 
 /* Quoted string with Unicode escapes */
 xusstart		[uU]&{quote}
-xusstop1		{quote}{whitespace}*{uescapefail}?
-xusstop2		{quote}{whitespace}*{uescape}
+
+/* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */
+xustop1		{uescapefail}?
+xustop2		{uescape}
 
 /* error rule to avoid backup */
 xufailed		[uU]&
@@ -536,15 +541,32 @@ other			.
 					yylval->str = litbufdup(yyscanner);
 					return SCONST;
 				}
-<xus>{xusstop1} {
+<xus>{quotestop} |
+<xus>{quotefail} {
 					/* throw back all but the quote */
 					yyless(1);
+					/* handle possible UESCAPE in xusend mode */
+					BEGIN(xusend);
+				}
+<xusend>{whitespace} { /* stay in xusend state over whitespace */ }
+<xusend><<EOF>> |
+<xusend>{other} |
+<xusend>{xustop1} {
+					/* no UESCAPE after the quote (or end of input), throw back everything */
+					yyless(0);
 					BEGIN(INITIAL);
 					yylval->str = litbuf_udeescape('\\', yyscanner);
 					return SCONST;
 				}
-<xus>{xusstop2} {
+<xusend>{xustop2} {
+					/* found UESCAPE 'x' after the end quote; yytext[yyleng-2] is x */
 					BEGIN(INITIAL);
+					if (!check_uescapechar(yytext[yyleng-2]))
+					{
+						SET_YYLLOC();
+						ADVANCE_YYLLOC(yyleng-2);
+						yyerror("invalid Unicode escape character");
+					}
 					yylval->str = litbuf_udeescape(yytext[yyleng-2], yyscanner);
 					return SCONST;
 				}
@@ -702,9 +723,20 @@ other			.
 					yylval->str = ident;
 					return IDENT;
 				}
-<xui>{xuistop1}	{
+<xui>{dquote} {
+					yyless(1);
+					/* handle possible UESCAPE in xuiend mode */
+					BEGIN(xuiend);
+				}
+<xuiend>{whitespace} { /* stay in xuiend state over whitespace */ }
+<xuiend><<EOF>> |
+<xuiend>{other} |
+<xuiend>{xustop1} {
+					/* no UESCAPE after the quote (or end of input), throw back everything */
 					char		   *ident;
 
+					yyless(0);
+
 					BEGIN(INITIAL);
 					if (yyextra->literallen == 0)
 						yyerror("zero-length delimited identifier");
@@ -712,16 +743,21 @@ other			.
 					if (yyextra->literallen >= NAMEDATALEN)
 						truncate_identifier(ident, yyextra->literallen, true);
 					yylval->str = ident;
-					/* throw back all but the quote */
-					yyless(1);
 					return IDENT;
 				}
-<xui>{xuistop2}	{
+<xuiend>{xustop2}	{
+					/* found UESCAPE after the end quote */
 					char		   *ident;
 
 					BEGIN(INITIAL);
 					if (yyextra->literallen == 0)
 						yyerror("zero-length delimited identifier");
+					if (!check_uescapechar(yytext[yyleng-2]))
+					{
+						SET_YYLLOC();
+						ADVANCE_YYLLOC(yyleng-2);
+						yyerror("invalid Unicode escape character");
+					}
 					ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner);
 					if (yyextra->literallen >= NAMEDATALEN)
 						truncate_identifier(ident, yyextra->literallen, true);
@@ -1203,22 +1239,29 @@ addunicode(pg_wchar c, core_yyscan_t yyscanner)
 	addlit(buf, pg_mblen(buf), yyscanner);
 }
 
-static char *
-litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner)
+/* Is 'escape' acceptable as the Unicode escape character in UESCAPE syntax? */
+static bool
+check_uescapechar(unsigned char escape)
 {
-	char *new;
-	char *litbuf, *in, *out;
-	pg_wchar pair_first = 0;
-
 	if (isxdigit(escape)
 		|| escape == '+'
 		|| escape == '\''
 		|| escape == '"'
 		|| scanner_isspace(escape))
 	{
-		ADVANCE_YYLLOC(yyextra->literallen + yyleng + 1);
-		yyerror("invalid Unicode escape character");
+		return false;	/* hex digit, '+', quote, or scanner_isspace() char */
 	}
+	else
+		return true;
+}
+
+/* like litbufdup, but handle unicode escapes */
+static char *
+litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner)
+{
+	char *new;
+	char *litbuf, *in, *out;
+	pg_wchar pair_first = 0;
 
 	/* Make literalbuf null-terminated to simplify the scanning loop */
 	litbuf = yyextra->literalbuf;