From 604737aff7fb805ec91e283f0c8b9a257f9039bb Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Mon, 21 Aug 2023 16:06:42 +0900
Subject: [PATCH] unaccent: Add support for quoted translated characters

---
 doc/src/sgml/unaccent.sgml                  | 16 +++++
 contrib/unaccent/Makefile                   |  2 +-
 contrib/unaccent/custom_unaccent.rules      |  6 ++
 contrib/unaccent/expected/unaccent.out      | 74 ++++++++++++++++++++
 contrib/unaccent/generate_unaccent_rules.py |  4 ++
 contrib/unaccent/sql/unaccent.sql           | 15 +++++
 contrib/unaccent/unaccent--1.1.sql          |  5 ++
 contrib/unaccent/unaccent.c                 | 75 ++++++++++++++++++---
 contrib/unaccent/unaccent.rules             | 56 +++++++--------
 9 files changed, 216 insertions(+), 37 deletions(-)
 create mode 100644 contrib/unaccent/custom_unaccent.rules

diff --git a/doc/src/sgml/unaccent.sgml b/doc/src/sgml/unaccent.sgml
index f3ddc64bbc..94100ed260 100644
--- a/doc/src/sgml/unaccent.sgml
+++ b/doc/src/sgml/unaccent.sgml
@@ -84,6 +84,22 @@
     </para>
    </listitem>
 
+   <listitem>
+    <para>
+     Some characters, like numeric symbols, may require whitespace in their
+     translation rule. It is possible to use double quotes around the translated
+     characters in this case. A double quote needs to be escaped with a second
+     double quote when including one in the translated characters. For example:
+<programlisting>
+&frac14;      " 1/4"
+&frac12;      " 1/2"
+&frac34;      " 3/4"
+&ldquo;       """"
+&rdquo;       """"
+</programlisting>
+    </para>
+   </listitem>
+
    <listitem>
     <para>
      As with other <productname>PostgreSQL</productname> text search configuration files,
diff --git a/contrib/unaccent/Makefile b/contrib/unaccent/Makefile
index 652a3e774c..27c7b2ca6e 100644
--- a/contrib/unaccent/Makefile
+++ b/contrib/unaccent/Makefile
@@ -7,7 +7,7 @@ OBJS = \
 
 EXTENSION = unaccent
 DATA = unaccent--1.1.sql unaccent--1.0--1.1.sql
-DATA_TSEARCH = unaccent.rules
+DATA_TSEARCH = unaccent.rules custom_unaccent.rules
 PGFILEDESC = "unaccent - text search dictionary that removes accents"
 
 REGRESS = unaccent
diff --git a/contrib/unaccent/custom_unaccent.rules b/contrib/unaccent/custom_unaccent.rules
new file mode 100644
index 0000000000..d8791dd902
--- /dev/null
+++ b/contrib/unaccent/custom_unaccent.rules
@@ -0,0 +1,6 @@
+¼	" ""1/4""   "
+½	"""1/2""    "
+¾	"  ""3/4"""
+ʺ	" a  """" "
+“	" t  """
+”	""" b  "
diff --git a/contrib/unaccent/expected/unaccent.out b/contrib/unaccent/expected/unaccent.out
index f080707c4a..64402010ce 100644
--- a/contrib/unaccent/expected/unaccent.out
+++ b/contrib/unaccent/expected/unaccent.out
@@ -51,6 +51,18 @@ SELECT unaccent('℗'); -- sound recording copyright
  (P)
 (1 row)
 
+SELECT unaccent('1½'); -- math expression with whitespace
+ unaccent 
+----------
+ 1 1/2
+(1 row)
+
+SELECT unaccent('〝'); -- quote
+ unaccent 
+----------
+ "
+(1 row)
+
 SELECT unaccent('unaccent', 'foobar');
  unaccent 
 ----------
@@ -93,6 +105,56 @@ SELECT unaccent('unaccent', '℗');
  (P)
 (1 row)
 
+SELECT unaccent('unaccent', '1½');
+ unaccent 
+----------
+ 1 1/2
+(1 row)
+
+SELECT unaccent('unaccent', '〝');
+ unaccent 
+----------
+ "
+(1 row)
+
+-- XXX: Remove later.
+-- Just for the sake of checking the parsing logic.
+SELECT unaccent('custom_unaccent', '1¼');
+  unaccent  
+------------
+ 1 "1/4"   
+(1 row)
+
+SELECT unaccent('custom_unaccent', '1½');
+  unaccent  
+------------
+ 1"1/2"    
+(1 row)
+
+SELECT unaccent('custom_unaccent', '1¾');
+ unaccent 
+----------
+ 1  "3/4"
+(1 row)
+
+SELECT unaccent('custom_unaccent', 'ʺ');
+ unaccent 
+----------
+  a  "" 
+(1 row)
+
+SELECT unaccent('custom_unaccent', '“');
+ unaccent 
+----------
+  t  "
+(1 row)
+
+SELECT unaccent('custom_unaccent', '”');
+ unaccent 
+----------
+ " b  
+(1 row)
+
 SELECT ts_lexize('unaccent', 'foobar');
  ts_lexize 
 -----------
@@ -135,6 +197,18 @@ SELECT ts_lexize('unaccent', '℗');
  {(P)}
 (1 row)
 
+SELECT ts_lexize('unaccent', '1½');
+ ts_lexize 
+-----------
+ {"1 1/2"}
+(1 row)
+
+SELECT ts_lexize('unaccent', '〝');
+ ts_lexize 
+-----------
+ {"\""}
+(1 row)
+
 -- Controversial case.  Black-Letter Capital H (U+210C) is translated by
 -- Latin-ASCII.xml as 'x', but it should be 'H'.
 SELECT unaccent('ℌ');
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
index b4b4c38beb..cffb7db7ce 100644
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -58,6 +58,10 @@ COMBINING_MARK_RANGES = ((0x0300, 0x0362),   # Mn: Accents, IPA
 
 def print_record(codepoint, letter):
     if letter:
+        # If the letter contains whitespace or double quotes, escape the
+        # double quotes by doubling them and wrap the result in double quotes.
+        if (' ' in letter) or ('"' in letter):
+            letter = '"' + letter.replace('"', '""') + '"'
         output = chr(codepoint) + "\t" + letter
     else:
         output = chr(codepoint)
diff --git a/contrib/unaccent/sql/unaccent.sql b/contrib/unaccent/sql/unaccent.sql
index 663646c1ac..9ef2fc5010 100644
--- a/contrib/unaccent/sql/unaccent.sql
+++ b/contrib/unaccent/sql/unaccent.sql
@@ -20,6 +20,8 @@ SELECT unaccent('˃˖˗˜');
 SELECT unaccent('À');  -- Remove combining diacritical 0x0300
 SELECT unaccent('℃℉'); -- degree signs
 SELECT unaccent('℗'); -- sound recording copyright
+SELECT unaccent('1½'); -- math expression with whitespace
+SELECT unaccent('〝'); -- quote
 
 SELECT unaccent('unaccent', 'foobar');
 SELECT unaccent('unaccent', 'ёлка');
@@ -28,6 +30,17 @@ SELECT unaccent('unaccent', '˃˖˗˜');
 SELECT unaccent('unaccent', 'À');
 SELECT unaccent('unaccent', '℃℉');
 SELECT unaccent('unaccent', '℗');
+SELECT unaccent('unaccent', '1½');
+SELECT unaccent('unaccent', '〝');
+
+-- XXX: Remove later.
+-- Just for the sake of checking the parsing logic.
+SELECT unaccent('custom_unaccent', '1¼');
+SELECT unaccent('custom_unaccent', '1½');
+SELECT unaccent('custom_unaccent', '1¾');
+SELECT unaccent('custom_unaccent', 'ʺ');
+SELECT unaccent('custom_unaccent', '“');
+SELECT unaccent('custom_unaccent', '”');
 
 SELECT ts_lexize('unaccent', 'foobar');
 SELECT ts_lexize('unaccent', 'ёлка');
@@ -36,6 +49,8 @@ SELECT ts_lexize('unaccent', '˃˖˗˜');
 SELECT ts_lexize('unaccent', 'À');
 SELECT ts_lexize('unaccent', '℃℉');
 SELECT ts_lexize('unaccent', '℗');
+SELECT ts_lexize('unaccent', '1½');
+SELECT ts_lexize('unaccent', '〝');
 
 -- Controversial case.  Black-Letter Capital H (U+210C) is translated by
 -- Latin-ASCII.xml as 'x', but it should be 'H'.
diff --git a/contrib/unaccent/unaccent--1.1.sql b/contrib/unaccent/unaccent--1.1.sql
index ecc8651780..a821e1c37d 100644
--- a/contrib/unaccent/unaccent--1.1.sql
+++ b/contrib/unaccent/unaccent--1.1.sql
@@ -32,3 +32,8 @@ CREATE TEXT SEARCH DICTIONARY unaccent (
 	TEMPLATE = unaccent,
 	RULES    = 'unaccent'
 );
+
+CREATE TEXT SEARCH DICTIONARY custom_unaccent (
+	TEMPLATE = unaccent,
+	RULES    = 'custom_unaccent'
+);
diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c
index 64c879e547..75fb4032b2 100644
--- a/contrib/unaccent/unaccent.c
+++ b/contrib/unaccent/unaccent.c
@@ -127,14 +127,16 @@ initTrie(const char *filename)
 				 * src and trg are sequences of one or more non-whitespace
 				 * characters, separated by whitespace.  Whitespace at start
 				 * or end of line is ignored.  If trg is omitted, an empty
-				 * string is used as the replacement.
+				 * string is used as the replacement.  trg can optionally be
+				 * quoted, in which case whitespace is included in it.
 				 *
 				 * We use a simple state machine, with states
 				 *	0	initial (before src)
 				 *	1	in src
 				 *	2	in whitespace after src
-				 *	3	in trg
-				 *	4	in whitespace after trg
+				 *	3	in trg (non-quoted)
+				 *	4	in trg (quoted)
+				 *	5	in whitespace after trg
 				 *	-1	syntax error detected
 				 *----------
 				 */
@@ -142,9 +144,12 @@ initTrie(const char *filename)
 				char	   *ptr;
 				char	   *src = NULL;
 				char	   *trg = NULL;
+				char	   *trgstore = NULL;
 				int			ptrlen;
 				int			srclen = 0;
 				int			trglen = 0;
+				int			trgstorelen = 0;
+				bool		trgquoted = false;
 
 				state = 0;
 				for (ptr = line; *ptr; ptr += ptrlen)
@@ -156,8 +161,10 @@ initTrie(const char *filename)
 						if (state == 1)
 							state = 2;
 						else if (state == 3)
-							state = 4;
-						continue;
+							state = 5;
+						/* whitespace is allowed inside a quoted trg */
+						if (state != 4)
+							continue;
 					}
 					switch (state)
 					{
@@ -173,14 +180,41 @@ initTrie(const char *filename)
 							break;
 						case 2:
 							/* start of trg */
+							if (*ptr == '"')
+							{
+								trgquoted = true;
+								state = 4;
+							}
+							else
+								state = 3;
+
 							trg = ptr;
 							trglen = ptrlen;
-							state = 3;
 							break;
 						case 3:
-							/* continue trg */
+							/* continue non-quoted trg */
 							trglen += ptrlen;
 							break;
+						case 4:
+							/* continue quoted trg */
+							trglen += ptrlen;
+
+							/*
+							 * If this is a quote, consider it as the end of
+							 * trg except if the follow-up character is itself
+							 * a quote.
+							 */
+							if (*ptr == '"')
+							{
+								if (*(ptr + 1) == '"')
+								{
+									ptr++;
+									trglen += 1;
+								}
+								else
+									state = 5;
+							}
+							break;
 						default:
 							/* bogus line format */
 							state = -1;
@@ -195,10 +229,35 @@ initTrie(const char *filename)
 					trglen = 0;
 				}
 
+				/* If still in a quoted area, fall back to an error */
+				if (state == 4)
+					state = -1;
+
+				/* If trg was quoted, remove its quotes and unescape it */
+				if (trgquoted)
+				{
+					/* Ignore the opening and closing quotes */
+					trgstore = palloc0(sizeof(char *) * trglen - 2);
+					trgstorelen = 0;
+					for (int i = 1; i < trglen - 1; i++)
+					{
+						trgstore[trgstorelen] = trg[i];
+						trgstorelen++;
+						/* skip the second quote of an escaped pair */
+						if (trg[i] == '"' && trg[i + 1] == '"')
+							i++;
+					}
+				}
+				else
+				{
+					trgstore = trg;
+					trgstorelen = trglen;
+				}
+
 				if (state > 0)
 					rootTrie = placeChar(rootTrie,
 										 (unsigned char *) src, srclen,
-										 trg, trglen);
+										 trgstore, trgstorelen);
 				else if (state < 0)
 					ereport(WARNING,
 							(errcode(ERRCODE_CONFIG_FILE_ERROR),
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules
index 3030166ed6..ca6caa51f5 100644
--- a/contrib/unaccent/unaccent.rules
+++ b/contrib/unaccent/unaccent.rules
@@ -5,9 +5,9 @@
 ®	(R)
 ±	+/-
 »	>>
-¼	 1/4
-½	 1/2
-¾	 3/4
+¼	" 1/4"
+½	" 1/2"
+¾	" 3/4"
 ¿	?
 À	A
 Á	A
@@ -403,7 +403,7 @@
 ʪ	ls
 ʫ	lz
 ʹ	'
-ʺ	"
+ʺ	""""
 ʻ	'
 ʼ	'
 ʽ	'
@@ -1058,15 +1058,15 @@
 ’	'
 ‚	,
 ‛	'
-“	"
-”	"
+“	""""
+”	""""
 „	,,
-‟	"
+‟	""""
 ․	.
 ‥	..
 …	...
 ′	'
-″	"
+″	""""
 ‹	<
 ›	>
 ‼	!!
@@ -1134,22 +1134,22 @@
 ⅇ	e
 ⅈ	i
 ⅉ	j
-⅐	 1/7
-⅑	 1/9
-⅒	 1/10
-⅓	 1/3
-⅔	 2/3
-⅕	 1/5
-⅖	 2/5
-⅗	 3/5
-⅘	 4/5
-⅙	 1/6
-⅚	 5/6
-⅛	 1/8
-⅜	 3/8
-⅝	 5/8
-⅞	 7/8
-⅟	 1/
+⅐	" 1/7"
+⅑	" 1/9"
+⅒	" 1/10"
+⅓	" 1/3"
+⅔	" 2/3"
+⅕	" 1/5"
+⅖	" 2/5"
+⅗	" 3/5"
+⅘	" 4/5"
+⅙	" 1/6"
+⅚	" 5/6"
+⅛	" 1/8"
+⅜	" 3/8"
+⅝	" 5/8"
+⅞	" 7/8"
+⅟	" 1/"
 Ⅰ	I
 Ⅱ	II
 Ⅲ	III
@@ -1182,7 +1182,7 @@
 ⅽ	c
 ⅾ	d
 ⅿ	m
-↉	 0/3
+↉	" 0/3"
 −	-
 ∕	/
 ∖	\
@@ -1296,8 +1296,8 @@
 〙	]
 〚	[
 〛	]
-〝	"
-〞	"
+〝	""""
+〞	""""
 ㍱	hPa
 ㍲	da
 ㍳	AU
@@ -1512,7 +1512,7 @@
 ﹪	%
 ﹫	@
 ！	!
-＂	"
+＂	""""
 ＃	#
 ＄	$
 ％	%
-- 
2.40.1

