summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Peter Eisentraut 2009-09-21 22:22:07 +0000
committer Peter Eisentraut 2009-09-21 22:22:07 +0000
commit 21db7b93aa05dddf967447cd069dbb5e6d88eff2 (patch)
tree a4547cea60117da5244b8f41aa016a01a3ec22ef
parent 8b46154738f1ed78c00096bb757a8be800d8b244 (diff)
Surrogate pair support for U& string and identifier syntax
This is mainly to make the functionality consistent with the proposed \u escape syntax.
-rw-r--r-- doc/src/sgml/syntax.sgml 8
-rw-r--r-- src/backend/parser/scan.l 75
2 files changed, 79 insertions, 4 deletions
diff --git a/doc/src/sgml/syntax.sgml b/doc/src/sgml/syntax.sgml
index 7637eab58f..2e20b735d9 100644
--- a/doc/src/sgml/syntax.sgml
+++ b/doc/src/sgml/syntax.sgml
@@ -238,6 +238,10 @@ U&"d!0061t!+000061" UESCAPE '!'
The Unicode escape syntax works only when the server encoding is
UTF8. When other server encodings are used, only code points in
the ASCII range (up to <literal>\007F</literal>) can be specified.
+ Both the 4-digit and the 6-digit form can be used to specify
+ UTF-16 surrogate pairs to compose characters with code points
+ larger than <literal>\FFFF</literal> (although the availability of
+ the 6-digit form technically makes this unnecessary).
</para>
<para>
@@ -497,6 +501,10 @@ U&amp;'d!0061t!+000061' UESCAPE '!'
UTF8. When other server encodings are used, only code points in
the ASCII range (up to <literal>\007F</literal>) can be
specified.
+ Both the 4-digit and the 6-digit form can be used to specify
+ UTF-16 surrogate pairs to compose characters with code points
+ larger than <literal>\FFFF</literal> (although the availability
+ of the 6-digit form technically makes this unnecessary).
</para>
<para>
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index f404f9dc8b..4dcebe8f8d 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -1097,11 +1097,30 @@ check_unicode_value(pg_wchar c, char *loc, base_yyscan_t yyscanner)
}
}
+static bool /* true iff c lies in 0xD800..0xDBFF, the UTF-16 high (leading) surrogate range */
+is_utf16_surrogate_first(pg_wchar c)
+{
+ return (c >= 0xD800 && c <= 0xDBFF); /* both bounds are inclusive */
+}
+
+static bool /* true iff c lies in 0xDC00..0xDFFF, the UTF-16 low (trailing) surrogate range */
+is_utf16_surrogate_second(pg_wchar c)
+{
+ return (c >= 0xDC00 && c <= 0xDFFF); /* both bounds are inclusive */
+}
+
+static pg_wchar /* combines a UTF-16 surrogate pair into a single code point; neither argument is range-checked here */
+surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
+{
+ return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF); /* low 10 bits of first -> bits 10..19, low 10 bits of second -> bits 0..9, plus the 0x10000 offset */
+}
+
static char *
litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
{
char *new;
char *litbuf, *in, *out;
+ pg_wchar pair_first = 0;
if (isxdigit(escape)
|| escape == '+'
@@ -1131,6 +1150,11 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
{
if (in[1] == escape)
{
+ if (pair_first)
+ {
+ ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
+ yyerror("invalid Unicode surrogate pair");
+ }
*out++ = escape;
in += 2;
}
@@ -1138,9 +1162,27 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
{
pg_wchar unicode = hexval(in[1]) * 16*16*16 + hexval(in[2]) * 16*16 + hexval(in[3]) * 16 + hexval(in[4]);
check_unicode_value(unicode, in, yyscanner);
- unicode_to_utf8(unicode, (unsigned char *) out);
+ if (pair_first)
+ {
+ if (is_utf16_surrogate_second(unicode))
+ {
+ unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+ pair_first = 0;
+ }
+ else
+ {
+ ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
+ yyerror("invalid Unicode surrogate pair");
+ }
+ }
+ if (is_utf16_surrogate_first(unicode))
+ pair_first = unicode;
+ else
+ {
+ unicode_to_utf8(unicode, (unsigned char *) out);
+ out += pg_mblen(out);
+ }
in += 5;
- out += pg_mblen(out);
}
else if (in[1] == '+'
&& isxdigit(in[2]) && isxdigit(in[3])
@@ -1150,9 +1192,27 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
pg_wchar unicode = hexval(in[2]) * 16*16*16*16*16 + hexval(in[3]) * 16*16*16*16 + hexval(in[4]) * 16*16*16
+ hexval(in[5]) * 16*16 + hexval(in[6]) * 16 + hexval(in[7]);
check_unicode_value(unicode, in, yyscanner);
- unicode_to_utf8(unicode, (unsigned char *) out);
+ if (pair_first)
+ {
+ if (is_utf16_surrogate_second(unicode))
+ {
+ unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+ pair_first = 0;
+ }
+ else
+ {
+ ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
+ yyerror("invalid Unicode surrogate pair");
+ }
+ }
+ if (is_utf16_surrogate_first(unicode))
+ pair_first = unicode;
+ else
+ {
+ unicode_to_utf8(unicode, (unsigned char *) out);
+ out += pg_mblen(out);
+ }
in += 8;
- out += pg_mblen(out);
}
else
{
@@ -1161,7 +1221,14 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
}
}
else
+ {
+ if (pair_first)
+ {
+ ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
+ yyerror("invalid Unicode surrogate pair");
+ }
*out++ = *in++;
+ }
}
*out = '\0';