Update unaccent rules with release 34 of CLDR for Latin-ASCII.xml

michaelpq · michaelpq · commit e1c1d5444e43 · 2019-01-10T14:10:21.000+09:00
This required an update of the Python script generating the rules, as the format of Latin-ASCII.xml changed in release 29. This release has also added new punctuation and symbols, and a new set of rules has been generated to include them. The way to find the newest versions of Latin-ASCII.xml is now also documented more clearly. Author: Hugh Ranalli, Michael Paquier Discussion: https://postgr.es/m/15548-cef1b3f8de190d4f@postgresql.org
diff --git a/contrib/unaccent/expected/unaccent.out b/contrib/unaccent/expected/unaccent.out
@@ -25,6 +25,12 @@ SELECT unaccent('ЁЖИК');
  ЕЖИК
 (1 row)
 
+SELECT unaccent('˃˖˗˜');
+ unaccent 
+----------
+ >+-~
+(1 row)
+
 SELECT unaccent('unaccent', 'foobar');
  unaccent 
 ----------
@@ -43,6 +49,12 @@ SELECT unaccent('unaccent', 'ЁЖИК');
  ЕЖИК
 (1 row)
 
+SELECT unaccent('unaccent', '˃˖˗˜');
+ unaccent 
+----------
+ >+-~
+(1 row)
+
 SELECT ts_lexize('unaccent', 'foobar');
  ts_lexize 
 -----------
@@ -61,3 +73,9 @@ SELECT ts_lexize('unaccent', 'ЁЖИК');
  {ЕЖИК}
 (1 row)
 
+SELECT ts_lexize('unaccent', '˃˖˗˜');
+ ts_lexize 
+-----------
+ {>+-~}
+(1 row)
+
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
@@ -20,8 +20,13 @@
 # option is enabled, the XML file of this transliterator [2] -- given as a
 # command line argument -- will be parsed and used.
 #
+# Ideally you should use the latest release for each data set.  For
+# Latin-ASCII.xml, the latest data sets released can be browsed directly
+# via [3].  Note that this script is compatible with at least release 29.
+#
 # [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
-# [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml
+# [2] http://unicode.org/cldr/trac/export/14746/tags/release-34/common/transforms/Latin-ASCII.xml
+# [3] https://unicode.org/cldr/trac/browser/tags
 
 # BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
 # The approach is to be Python3 compatible with Python2 "backports".
@@ -140,8 +145,18 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
     transliterationTree = ET.parse(latinAsciiFilePath)
     transliterationTreeRoot = transliterationTree.getroot()
 
-    for rule in transliterationTreeRoot.findall("./transforms/transform/tRule"):
-        matches = rulePattern.search(rule.text)
+    # Fetch all the transliteration rules.  Since release 29 of Latin-ASCII.xml
+    # all the transliteration rules are located in a single tRule block with
+    # all rules separated into separate lines.
+    blockRules = transliterationTreeRoot.findall("./transforms/transform/tRule")
+    assert(len(blockRules) == 1)
+
+    # Split the block of rules into one element per line.
+    rules = blockRules[0].text.splitlines()
+
+    # And finish the processing of each individual rule.
+    for rule in rules:
+        matches = rulePattern.search(rule)
 
         # The regular expression capture four groups corresponding
         # to the characters.
diff --git a/contrib/unaccent/sql/unaccent.sql b/contrib/unaccent/sql/unaccent.sql
@@ -8,11 +8,14 @@ SET client_encoding TO 'UTF8';
 SELECT unaccent('foobar');
 SELECT unaccent('ёлка');
 SELECT unaccent('ЁЖИК');
+SELECT unaccent('˃˖˗˜');
 
 SELECT unaccent('unaccent', 'foobar');
 SELECT unaccent('unaccent', 'ёлка');
 SELECT unaccent('unaccent', 'ЁЖИК');
+SELECT unaccent('unaccent', '˃˖˗˜');
 
 SELECT ts_lexize('unaccent', 'foobar');
 SELECT ts_lexize('unaccent', 'ёлка');
 SELECT ts_lexize('unaccent', 'ЁЖИК');
+SELECT ts_lexize('unaccent', '˃˖˗˜');
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules
@@ -399,6 +399,21 @@
 ʦ	ts
 ʪ	ls
 ʫ	lz
+ʹ	'
+ʺ	"
+ʻ	'
+ʼ	'
+ʽ	'
+˂	<
+˃	>
+˄	^
+ˆ	^
+ˈ	'
+ˋ	`
+ː	:
+˖	+
+˗	-
+˜	~
 Ά	Α
 Έ	Ε
 Ή	Η

-Original file line number
+Diff line change
 ʦ	ts
 ʪ	ls
 ʫ	lz
 +ʹ	'
 +ʺ	"
 +ʻ	'
 +ʼ	'
 +ʽ	'
 +˂	<
 +˃	>
 +˄	^
 +ˆ	^
 +ˈ	'
 +ˋ	`
 +ː	:
 +˖	+
 +˗	-
 +˜	~
 Ά	Α
 Έ	Ε
 Ή	Η