*** pgsql/src/backend/parser/scan.l 2009/09/21 22:22:07 1.158 --- pgsql/src/backend/parser/scan.l 2009/09/22 23:52:53 1.159 *************** *** 24,30 **** * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION ! * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.157 2009/07/14 20:24:10 tgl Exp $ * *------------------------------------------------------------------------- */ --- 24,30 ---- * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION ! * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.158 2009/09/21 22:22:07 petere Exp $ * *------------------------------------------------------------------------- */ *************** static void addlitchar(unsigned char ych *** 80,85 **** --- 80,88 ---- static char *litbufdup(base_yyscan_t yyscanner); static char *litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner); static unsigned char unescape_single_char(unsigned char c, base_yyscan_t yyscanner); + static bool is_utf16_surrogate_first(pg_wchar c); + static bool is_utf16_surrogate_second(pg_wchar c); + static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second); #define yyerror(msg) scanner_yyerror(msg, yyscanner) *************** static void check_escape_warning(base_yy *** 97,102 **** --- 100,107 ---- extern int base_yyget_column(yyscan_t yyscanner); extern void base_yyset_column(int column_no, yyscan_t yyscanner); + static void addunicode(pg_wchar c, yyscan_t yyscanner); + %} %option reentrant *************** extern void base_yyset_column(int column *** 134,139 **** --- 139,145 ---- * $foo$ quoted strings * quoted identifier with Unicode escapes * quoted string with Unicode escapes + * Unicode surrogate pair in extended quoted string */ %x xb *************** extern void base_yyset_column(int column *** 145,150 **** --- 151,157 ---- %x xdolq %x xui %x xus + %x xeu /* * In order to make the world safe for Windows and Mac clients as well as *************** xeinside [^\\']+ 
*** 223,228 **** --- 230,237 ---- xeescape [\\][^0-7] xeoctesc [\\][0-7]{1,3} xehexesc [\\]x[0-9A-Fa-f]{1,2} + /* xeunicode: a backslash followed by u and exactly 4 hex digits, or by U and exactly 8 hex digits. xeunicodebad: a backslash followed by a bare u or U; its rule reports the invalid-Unicode-escape error. NOTE(review): the rules in the later hunk show no start-condition prefixes and one empty angle-bracket pattern; these look stripped during extraction - confirm against the real scan.l before relying on this text. */ + xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8}) + xeunicodebad [\\]([uU]) /* Extended quote * xqdouble implements embedded quote, '''' *************** other . *** 535,540 **** --- 544,588 ---- {xeinside} { addlit(yytext, yyleng, yyscanner); } + {xeunicode} { + pg_wchar c = strtoul(yytext+2, NULL, 16); + + check_escape_warning(yyscanner); + + if (is_utf16_surrogate_first(c)) + { + yyextra->utf16_first_part = c; + BEGIN(xeu); + } + else if (is_utf16_surrogate_second(c)) + yyerror("invalid Unicode surrogate pair"); + else + addunicode(c, yyscanner); + } + {xeunicode} { + pg_wchar c = strtoul(yytext+2, NULL, 16); + + if (!is_utf16_surrogate_second(c)) + yyerror("invalid Unicode surrogate pair"); + + c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c); + + addunicode(c, yyscanner); + + BEGIN(xe); + } + . | + \n | + <> { yyerror("invalid Unicode surrogate pair"); } + + {xeunicodebad} { + ereport(ERROR, + (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), + errmsg("invalid Unicode escape"), + errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."), + lexer_errposition())); + } + + {xeescape} { if (yytext[1] == '\'') { *************** base_yyfree(void *ptr, base_yyscan_t yys *** 1330,1332 **** --- 1378,1398 ---- if (ptr) pfree(ptr); } + + static void + addunicode(pg_wchar c, base_yyscan_t yyscanner) + { + char buf[8]; + + if (c == 0 || c > 0x10FFFF) + yyerror("invalid Unicode escape value"); + if (c > 0x7F) + { + if (GetDatabaseEncoding() != PG_UTF8) + yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8"); + yyextra->saw_non_ascii = true; + } + unicode_to_utf8(c, (unsigned char *)buf); + addlit(buf, pg_mblen(buf), yyscanner); + } +