Change type "char"'s I/O format for non-ASCII characters.

Previously, a byte with the high bit set was just transmitted as-is by charin() and charout(). This is problematic if the database encoding is multibyte, because the result of charout() won't be validly encoded, which breaks various stuff that expects all text strings to be validly encoded. We've previously decided to enforce encoding validity rather than try to individually harden each place that might have a problem with such strings, so it's time to do something about "char". To fix, represent high-bit-set characters as \ooo (backslash and three octal digits), following the ancient "escape" format for bytea. charin() will continue to accept the old way as well, though that is only reachable in single-byte encodings. Add some test cases just so there is coverage for this code. We'll otherwise leave this question undocumented as it was before, because we don't really want to encourage end-user use of "char". For the moment, back-patch into v15 so that this change appears in 15beta3. If there's not great pushback we should consider absorbing this change into the older branches. Discussion: https://postgr.es/m/[email protected]
author: Tom Lane 2022-08-02 14:29:35 +0000
committer: Tom Lane 2022-08-02 14:29:35 +0000
commit: ec62ce55a813db5c925d89a53b5b22baa509abb6 (patch)
tree: 382d4b8dd8c1e20245ba0210b803a5a5e99b4ba1
parent: 1349d2790bf48a4de072931c722f39337e72055e (diff)
6 files changed, 263 insertions, 28 deletions
diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml
index 8e30b82273c..4cc9e592708 100644
--- a/doc/src/sgml/datatype.sgml
+++ b/doc/src/sgml/datatype.sgml
@@ -1338,9 +1338,10 @@ SELECT b, char_length(b) FROM test2;
    <para>
     There are two other fixed-length character types in
     <productname>PostgreSQL</productname>, shown in <xref
-    linkend="datatype-character-special-table"/>. The <type>name</type>
-    type exists <emphasis>only</emphasis> for the storage of identifiers
-    in the internal system catalogs and is not intended for use by the general user. Its
+    linkend="datatype-character-special-table"/>.
+    These are not intended for general-purpose use, only for use
+    in the internal system catalogs.
+    The <type>name</type> type is used to store identifiers. Its
     length is currently defined as 64 bytes (63 usable characters plus
     terminator) but should be referenced using the constant
     <symbol>NAMEDATALEN</symbol> in <literal>C</literal> source code.
@@ -1348,7 +1349,8 @@ SELECT b, char_length(b) FROM test2;
     is therefore adjustable for special uses); the default maximum
     length might change in a future release. The type <type>"char"</type>
     (note the quotes) is different from <type>char(1)</type> in that it
-    only uses one byte of storage. It is internally used in the system
+    only uses one byte of storage, and therefore can store only a single
+    ASCII character. It is used in the system
     catalogs as a simplistic enumeration type.
    </para>
 
diff --git a/src/backend/utils/adt/char.c b/src/backend/utils/adt/char.c
index 0df41c22538..e50293bf14c 100644
--- a/src/backend/utils/adt/char.c
+++ b/src/backend/utils/adt/char.c
@@ -20,6 +20,11 @@
 #include "libpq/pqformat.h"
 #include "utils/builtins.h"
 
+#define ISOCTAL(c)   (((c) >= '0') && ((c) <= '7'))
+#define TOOCTAL(c)   ((c) + '0')
+#define FROMOCTAL(c) ((unsigned char) (c) - '0')
+
+
 /*****************************************************************************
  *	 USER I/O ROUTINES														 *
  *****************************************************************************/
@@ -27,31 +32,53 @@
 /*
  *		charin			- converts "x" to 'x'
  *
- * Note that an empty input string will implicitly be converted to \0.
+ * This accepts the formats charout produces.  If we have multibyte input
+ * that is not in the form '\ooo', then we take its first byte as the value
+ * and silently discard the rest; this is a backwards-compatibility provision.
  */
 Datum
 charin(PG_FUNCTION_ARGS)
 {
 	char	   *ch = PG_GETARG_CSTRING(0);
 
+	if (strlen(ch) == 4 && ch[0] == '\\' &&
+		ISOCTAL(ch[1]) && ISOCTAL(ch[2]) && ISOCTAL(ch[3]))
+		PG_RETURN_CHAR((FROMOCTAL(ch[1]) << 6) +
+					   (FROMOCTAL(ch[2]) << 3) +
+					   FROMOCTAL(ch[3]));
+	/* This will do the right thing for a zero-length input string */
 	PG_RETURN_CHAR(ch[0]);
 }
 
 /*
  *		charout			- converts 'x' to "x"
  *
- * Note that if the char value is \0, the resulting string will appear
- * to be empty (null-terminated after zero characters).  So this is the
- * inverse of the charin() function for such data.
+ * The possible output formats are:
+ * 1. 0x00 is represented as an empty string.
+ * 2. 0x01..0x7F are represented as a single ASCII byte.
+ * 3. 0x80..0xFF are represented as \ooo (backslash and 3 octal digits).
+ * Case 3 is meant to match the traditional "escape" format of bytea.
  */
 Datum
 charout(PG_FUNCTION_ARGS)
 {
 	char		ch = PG_GETARG_CHAR(0);
-	char	   *result = (char *) palloc(2);
+	char	   *result = (char *) palloc(5);
 
-	result[0] = ch;
-	result[1] = '\0';
+	if (IS_HIGHBIT_SET(ch))
+	{
+		result[0] = '\\';
+		result[1] = TOOCTAL(((unsigned char) ch) >> 6);
+		result[2] = TOOCTAL((((unsigned char) ch) >> 3) & 07);
+		result[3] = TOOCTAL(((unsigned char) ch) & 07);
+		result[4] = '\0';
+	}
+	else
+	{
+		/* This produces acceptable results for 0x00 as well */
+		result[0] = ch;
+		result[1] = '\0';
+	}
 	PG_RETURN_CSTRING(result);
 }
 
@@ -176,15 +203,20 @@ Datum
 text_char(PG_FUNCTION_ARGS)
 {
 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
+	char	   *ch = VARDATA_ANY(arg1);
 	char		result;
 
 	/*
-	 * An empty input string is converted to \0 (for consistency with charin).
-	 * If the input is longer than one character, the excess data is silently
-	 * discarded.
+	 * Conversion rules are the same as in charin(), but here we need to
+	 * handle the empty-string case honestly.
 	 */
-	if (VARSIZE_ANY_EXHDR(arg1) > 0)
-		result = *(VARDATA_ANY(arg1));
+	if (VARSIZE_ANY_EXHDR(arg1) == 4 && ch[0] == '\\' &&
+		ISOCTAL(ch[1]) && ISOCTAL(ch[2]) && ISOCTAL(ch[3]))
+		result = (FROMOCTAL(ch[1]) << 6) +
+			(FROMOCTAL(ch[2]) << 3) +
+			FROMOCTAL(ch[3]);
+	else if (VARSIZE_ANY_EXHDR(arg1) > 0)
+		result = ch[0];
 	else
 		result = '\0';
 
@@ -195,13 +227,21 @@ Datum
 char_text(PG_FUNCTION_ARGS)
 {
 	char		arg1 = PG_GETARG_CHAR(0);
-	text	   *result = palloc(VARHDRSZ + 1);
+	text	   *result = palloc(VARHDRSZ + 4);
 
 	/*
-	 * Convert \0 to an empty string, for consistency with charout (and
-	 * because the text stuff doesn't like embedded nulls all that well).
+	 * Conversion rules are the same as in charout(), but here we need to be
+	 * honest about converting 0x00 to an empty string.
 	 */
-	if (arg1 != '\0')
+	if (IS_HIGHBIT_SET(arg1))
+	{
+		SET_VARSIZE(result, VARHDRSZ + 4);
+		(VARDATA(result))[0] = '\\';
+		(VARDATA(result))[1] = TOOCTAL(((unsigned char) arg1) >> 6);
+		(VARDATA(result))[2] = TOOCTAL((((unsigned char) arg1) >> 3) & 07);
+		(VARDATA(result))[3] = TOOCTAL(((unsigned char) arg1) & 07);
+	}
+	else if (arg1 != '\0')
 	{
 		SET_VARSIZE(result, VARHDRSZ + 1);
 		*(VARDATA(result)) = arg1;
diff --git a/src/test/regress/expected/char.out b/src/test/regress/expected/char.out
index 2d78f90f3b9..ea9b0b8eeb3 100644
--- a/src/test/regress/expected/char.out
+++ b/src/test/regress/expected/char.out
@@ -1,8 +1,8 @@
 --
 -- CHAR
 --
--- fixed-length by value
--- internally passed by value if <= 4 bytes in storage
+-- Per SQL standard, CHAR means character(1), that is a varlena type
+-- with a constraint restricting it to one character (not byte)
 SELECT char 'c' = char 'c' AS true;
  true 
 ------
@@ -119,3 +119,62 @@ SELECT * FROM CHAR_TBL;
  abcd
 (4 rows)
 
+--
+-- Also test "char", which is an ad-hoc one-byte type.  It can only
+-- really store ASCII characters, but we allow high-bit-set characters
+-- to be accessed via bytea-like escapes.
+--
+SELECT 'a'::"char";
+ char 
+------
+ a
+(1 row)
+
+SELECT '\101'::"char";
+ char 
+------
+ A
+(1 row)
+
+SELECT '\377'::"char";
+ char 
+------
+ \377
+(1 row)
+
+SELECT 'a'::"char"::text;
+ text 
+------
+ a
+(1 row)
+
+SELECT '\377'::"char"::text;
+ text 
+------
+ \377
+(1 row)
+
+SELECT '\000'::"char"::text;
+ text 
+------
+ 
+(1 row)
+
+SELECT 'a'::text::"char";
+ char 
+------
+ a
+(1 row)
+
+SELECT '\377'::text::"char";
+ char 
+------
+ \377
+(1 row)
+
+SELECT ''::text::"char";
+ char 
+------
+ 
+(1 row)
+
diff --git a/src/test/regress/expected/char_1.out b/src/test/regress/expected/char_1.out
index fa6644d6927..ffd31551de5 100644
--- a/src/test/regress/expected/char_1.out
+++ b/src/test/regress/expected/char_1.out
@@ -1,8 +1,8 @@
 --
 -- CHAR
 --
--- fixed-length by value
--- internally passed by value if <= 4 bytes in storage
+-- Per SQL standard, CHAR means character(1), that is a varlena type
+-- with a constraint restricting it to one character (not byte)
 SELECT char 'c' = char 'c' AS true;
  true 
 ------
@@ -119,3 +119,62 @@ SELECT * FROM CHAR_TBL;
  abcd
 (4 rows)
 
+--
+-- Also test "char", which is an ad-hoc one-byte type.  It can only
+-- really store ASCII characters, but we allow high-bit-set characters
+-- to be accessed via bytea-like escapes.
+--
+SELECT 'a'::"char";
+ char 
+------
+ a
+(1 row)
+
+SELECT '\101'::"char";
+ char 
+------
+ A
+(1 row)
+
+SELECT '\377'::"char";
+ char 
+------
+ \377
+(1 row)
+
+SELECT 'a'::"char"::text;
+ text 
+------
+ a
+(1 row)
+
+SELECT '\377'::"char"::text;
+ text 
+------
+ \377
+(1 row)
+
+SELECT '\000'::"char"::text;
+ text 
+------
+ 
+(1 row)
+
+SELECT 'a'::text::"char";
+ char 
+------
+ a
+(1 row)
+
+SELECT '\377'::text::"char";
+ char 
+------
+ \377
+(1 row)
+
+SELECT ''::text::"char";
+ char 
+------
+ 
+(1 row)
+
diff --git a/src/test/regress/expected/char_2.out b/src/test/regress/expected/char_2.out
index 09434a44cdc..56818f824b5 100644
--- a/src/test/regress/expected/char_2.out
+++ b/src/test/regress/expected/char_2.out
@@ -1,8 +1,8 @@
 --
 -- CHAR
 --
--- fixed-length by value
--- internally passed by value if <= 4 bytes in storage
+-- Per SQL standard, CHAR means character(1), that is a varlena type
+-- with a constraint restricting it to one character (not byte)
 SELECT char 'c' = char 'c' AS true;
  true 
 ------
@@ -119,3 +119,62 @@ SELECT * FROM CHAR_TBL;
  abcd
 (4 rows)
 
+--
+-- Also test "char", which is an ad-hoc one-byte type.  It can only
+-- really store ASCII characters, but we allow high-bit-set characters
+-- to be accessed via bytea-like escapes.
+--
+SELECT 'a'::"char";
+ char 
+------
+ a
+(1 row)
+
+SELECT '\101'::"char";
+ char 
+------
+ A
+(1 row)
+
+SELECT '\377'::"char";
+ char 
+------
+ \377
+(1 row)
+
+SELECT 'a'::"char"::text;
+ text 
+------
+ a
+(1 row)
+
+SELECT '\377'::"char"::text;
+ text 
+------
+ \377
+(1 row)
+
+SELECT '\000'::"char"::text;
+ text 
+------
+ 
+(1 row)
+
+SELECT 'a'::text::"char";
+ char 
+------
+ a
+(1 row)
+
+SELECT '\377'::text::"char";
+ char 
+------
+ \377
+(1 row)
+
+SELECT ''::text::"char";
+ char 
+------
+ 
+(1 row)
+
diff --git a/src/test/regress/sql/char.sql b/src/test/regress/sql/char.sql
index 9c83c45e340..120fed53e5c 100644
--- a/src/test/regress/sql/char.sql
+++ b/src/test/regress/sql/char.sql
@@ -2,8 +2,8 @@
 -- CHAR
 --
 
--- fixed-length by value
--- internally passed by value if <= 4 bytes in storage
+-- Per SQL standard, CHAR means character(1), that is a varlena type
+-- with a constraint restricting it to one character (not byte)
 
 SELECT char 'c' = char 'c' AS true;
 
@@ -71,3 +71,19 @@ DROP TABLE CHAR_TBL;
 INSERT INTO CHAR_TBL (f1) VALUES ('abcde');
 
 SELECT * FROM CHAR_TBL;
+
+--
+-- Also test "char", which is an ad-hoc one-byte type.  It can only
+-- really store ASCII characters, but we allow high-bit-set characters
+-- to be accessed via bytea-like escapes.
+--
+
+SELECT 'a'::"char";
+SELECT '\101'::"char";
+SELECT '\377'::"char";
+SELECT 'a'::"char"::text;
+SELECT '\377'::"char"::text;
+SELECT '\000'::"char"::text;
+SELECT 'a'::text::"char";
+SELECT '\377'::text::"char";
+SELECT ''::text::"char";
author	Tom Lane	2022-08-02 14:29:35 +0000
committer	Tom Lane	2022-08-02 14:29:35 +0000
commit	ec62ce55a813db5c925d89a53b5b22baa509abb6 (patch)
tree	382d4b8dd8c1e20245ba0210b803a5a5e99b4ba1
parent	1349d2790bf48a4de072931c722f39337e72055e (diff)