summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTeodor Sigaev2015-09-04 09:51:53 +0000
committerTeodor Sigaev2015-09-04 09:51:53 +0000
commit1bbd52cb9a4aa61a7dd751f5d1f7b44650d6122a (patch)
treecb52d878702e901529ce383c60ded775c7c76435
parent4aec49899e5782247e134f94ce1c6ee926f88e1c (diff)
Make unaccent handle all diacritics known to Unicode, and expand ligatures correctly
Add Python script for building unaccent.rules from Unicode data. Don't backpatch because unaccent changes may require tsvector/index rebuild. Thomas Munro <[email protected]>
-rw-r--r--contrib/unaccent/generate_unaccent_rules.py123
-rw-r--r--contrib/unaccent/unaccent.rules358
2 files changed, 415 insertions, 66 deletions
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
new file mode 100644
index 0000000000..b838d8f630
--- /dev/null
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -0,0 +1,123 @@
+#!/usr/bin/python
+#
+# This script builds unaccent.rules on standard output when given the
+# contents of UnicodeData.txt[1] on standard input. Optionally includes
+# ligature expansion, if --expand-ligatures is given on the command line.
+#
+# The approach is to use the Unicode decomposition data to identify
+# precomposed codepoints that are equivalent to a ligature of several
+# letters, or a base letter with any number of diacritical marks.
+# There is also a small set of special cases for codepoints that we
+# traditionally support even though Unicode doesn't consider them to
+# be ligatures or letters with marks.
+#
+# [1] http://unicode.org/Public/7.0.0/ucd/UnicodeData.txt
+
+import re
+import sys
+
+def print_record(codepoint, letter):
+    """Print one rule line on standard output.
+
+    The line is the character whose number is codepoint, a tab, and then
+    letter (the replacement text), encoded as UTF-8.
+    """
+    record = unichr(codepoint) + "\t" + letter
+    print record.encode("UTF-8")
+
+class Codepoint:
+    """One parsed record: a codepoint, its category and its decomposition."""
+
+    def __init__(self, id, general_category, combining_ids):
+        # numeric value of the codepoint
+        self.id = id
+        # general category string, e.g. the values tested by is_mark()
+        self.general_category = general_category
+        # list of codepoint numbers this codepoint decomposes into
+        self.combining_ids = combining_ids
+
+def is_plain_letter(codepoint):
+    """Return true if codepoint is one of the ASCII letters a-z or A-Z."""
+    value = codepoint.id
+    return ord('a') <= value <= ord('z') or ord('A') <= value <= ord('Z')
+
+def is_mark(codepoint):
+    """Return true if codepoint's general category is one of the mark categories."""
+    mark_categories = ("Mn", "Me", "Mc")
+    return codepoint.general_category in mark_categories
+
+def is_letter_with_marks(codepoint, table):
+    """Return true for a plain letter combined with one or more marks.
+
+    table maps codepoint numbers to Codepoint objects; it is used to look up
+    each member of codepoint's decomposition.
+    """
+    # See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
+    ids = codepoint.combining_ids
+    if len(ids) <= 1:
+        # nothing follows the first element, so there can be no marks
+        return False
+    if not is_plain_letter(table[ids[0]]):
+        return False
+    # everything after the base letter has to be a mark
+    return all(is_mark(table[i]) for i in ids[1:])
+
+def is_letter(codepoint, table):
+    """Return true for a plain letter, or a plain letter carrying marks."""
+    if is_plain_letter(codepoint):
+        return True
+    return is_letter_with_marks(codepoint, table)
+
+def get_plain_letter(codepoint, table):
+    """Return the base codepoint without marks.
+
+    For a letter with marks the result is the table entry for the first
+    element of its decomposition; a plain ASCII letter is returned as is.
+    table maps codepoint numbers to Codepoint objects.
+
+    Raises ValueError if codepoint is neither of those.
+    """
+    if is_letter_with_marks(codepoint, table):
+        return table[codepoint.combining_ids[0]]
+    elif is_plain_letter(codepoint):
+        return codepoint
+    else:
+        # A string cannot be raised: Python 2.6 and later turn that into an
+        # unrelated TypeError.  Raise a real exception naming the offender.
+        raise ValueError("codepoint U+%04X is not a letter, with or without marks"
+                         % codepoint.id)
+
+def is_ligature(codepoint, table):
+    """Return true if every element of codepoint's decomposition is a letter.
+
+    Each element may itself be a plain letter or a letter with marks.
+    """
+    for component_id in codepoint.combining_ids:
+        if not is_letter(table[component_id], table):
+            return False
+    return True
+
+def get_plain_letters(codepoint, table):
+    """Return the plain letters of a ligature, in decomposition order.
+
+    The result is a list of Codepoint objects, one per element of
+    codepoint's decomposition, each reduced to its base letter.
+    """
+    assert is_ligature(codepoint, table)
+    plain_letters = []
+    for component_id in codepoint.combining_ids:
+        plain_letters.append(get_plain_letter(table[component_id], table))
+    return plain_letters
+
+def main(expand_ligatures):
+    """Read UnicodeData.txt from standard input and print unaccent rules.
+
+    Rules for letters with marks are always printed.  Rules for ligatures,
+    and the ligature special cases at the end, are printed only when
+    expand_ligatures is true.
+    """
+    # http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
+    decomposition_type_pattern = re.compile(" *<[^>]*> *")
+
+    # codepoint number -> Codepoint, used to look up decomposition elements
+    table = {}
+    # every Codepoint in input order; rules are printed in this order
+    codepoints = []
+
+    # read everything we need into memory
+    for line in sys.stdin.readlines():
+        fields = line.split(";")
+        if len(fields) <= 5:
+            # too few fields to carry a decomposition; skip the line
+            continue
+        # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
+        general_category = fields[2]
+        # replace any "<type>" tag with a space so only hexadecimal numbers remain
+        decomposition = decomposition_type_pattern.sub(' ', fields[5])
+        codepoint_id = int(fields[0], 16)
+        combining_ids = [int(s, 16) for s in decomposition.split(" ") if s != ""]
+        entry = Codepoint(codepoint_id, general_category, combining_ids)
+        table[codepoint_id] = entry
+        codepoints.append(entry)
+
+    # walk through all the codepoints looking for interesting mappings
+    for entry in codepoints:
+        if not entry.general_category.startswith('L'):
+            continue
+        if len(entry.combining_ids) <= 1:
+            continue
+        if is_letter_with_marks(entry, table):
+            base = get_plain_letter(entry, table)
+            print_record(entry.id, chr(base.id))
+        elif expand_ligatures and is_ligature(entry, table):
+            letters = get_plain_letters(entry, table)
+            print_record(entry.id,
+                         "".join(unichr(letter.id) for letter in letters))
+
+    # some special cases
+    special_cases = [
+        (0x00d8, "O"),  # LATIN CAPITAL LETTER O WITH STROKE
+        (0x00f8, "o"),  # LATIN SMALL LETTER O WITH STROKE
+        (0x0110, "D"),  # LATIN CAPITAL LETTER D WITH STROKE
+        (0x0111, "d"),  # LATIN SMALL LETTER D WITH STROKE
+        (0x0131, "i"),  # LATIN SMALL LETTER DOTLESS I
+        (0x0126, "H"),  # LATIN CAPITAL LETTER H WITH STROKE
+        (0x0127, "h"),  # LATIN SMALL LETTER H WITH STROKE
+        (0x0141, "L"),  # LATIN CAPITAL LETTER L WITH STROKE
+        (0x0142, "l"),  # LATIN SMALL LETTER L WITH STROKE
+        (0x0149, "'n"),  # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
+        (0x0166, "T"),  # LATIN CAPITAL LETTER T WITH STROKE
+        (0x0167, "t"),  # LATIN SMALL LETTER T WITH STROKE
+        (0x0401, u"\u0415"),  # CYRILLIC CAPITAL LETTER IO
+        (0x0451, u"\u0435"),  # CYRILLIC SMALL LETTER IO
+    ]
+    if expand_ligatures:
+        special_cases += [
+            (0x00c6, "AE"),  # LATIN CAPITAL LETTER AE
+            (0x00df, "ss"),  # LATIN SMALL LETTER SHARP S
+            (0x00e6, "ae"),  # LATIN SMALL LETTER AE
+            (0x0152, "OE"),  # LATIN CAPITAL LIGATURE OE
+            (0x0153, "oe"),  # LATIN SMALL LIGATURE OE
+        ]
+    for special_id, replacement in special_cases:
+        print_record(special_id, replacement)
+
+if __name__ == "__main__":
+    # Ligatures are expanded only when the sole argument is --expand-ligatures.
+    main(sys.argv[1:] == ["--expand-ligatures"])
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules
index cc2f7a6585..73c24a188b 100644
--- a/contrib/unaccent/unaccent.rules
+++ b/contrib/unaccent/unaccent.rules
@@ -4,22 +4,59 @@
à A
Ä A
Å A
-Æ A
+Ç C
+È E
+É E
+Ê E
+Ë E
+Ì I
+Í I
+Î I
+Ï I
+Ñ N
+Ò O
+Ó O
+Ô O
+Õ O
+Ö O
+Ù U
+Ú U
+Û U
+Ü U
+Ý Y
à a
á a
â a
ã a
ä a
å a
-æ a
+ç c
+è e
+é e
+ê e
+ë e
+ì i
+í i
+î i
+ï i
+ñ n
+ò o
+ó o
+ô o
+õ o
+ö o
+ù u
+ú u
+û u
+ü u
+ý y
+ÿ y
Ā A
ā a
Ă A
ă a
Ą A
ą a
-Ç C
-ç c
Ć C
ć c
Ĉ C
@@ -30,16 +67,6 @@
č c
Ď D
ď d
-Đ D
-đ d
-È E
-É E
-Ê E
-Ë E
-è e
-é e
-ê e
-ë e
Ē E
ē e
Ĕ E
@@ -60,17 +87,7 @@
ģ g
Ĥ H
ĥ h
-Ħ H
-ħ h
Ĩ I
-Ì I
-Í I
-Î I
-Ï I
-ì i
-í i
-î i
-ï i
ĩ i
Ī I
ī i
@@ -79,62 +96,36 @@
Į I
į i
İ I
-ı i
-IJ I
-ij i
+IJ IJ
+ij ij
Ĵ J
ĵ j
Ķ K
ķ k
-ĸ k
Ĺ L
ĺ l
Ļ L
ļ l
Ľ L
ľ l
-Ŀ L
-ŀ l
-Ł L
-ł l
-Ñ N
-ñ n
Ń N
ń n
Ņ N
ņ n
Ň N
ň n
-ʼn n
-Ŋ N
-ŋ n
-Ò O
-Ó O
-Ô O
-Õ O
-Ö O
-ò o
-ó o
-ô o
-õ o
-ö o
Ō O
ō o
Ŏ O
ŏ o
Ő O
ő o
-Œ E
-œ e
-Ø O
-ø o
Ŕ R
ŕ r
Ŗ R
ŗ r
Ř R
ř r
-ß S
Ś S
ś s
Ŝ S
@@ -147,16 +138,6 @@
ţ t
Ť T
ť t
-Ŧ T
-ŧ t
-Ù U
-Ú U
-Û U
-Ü U
-ù u
-ú u
-û u
-ü u
Ũ U
ũ u
Ū U
@@ -171,9 +152,6 @@
ų u
Ŵ W
ŵ w
-Ý Y
-ý y
-ÿ y
Ŷ Y
ŷ y
Ÿ Y
@@ -183,5 +161,253 @@
ż z
Ž Z
ž z
-ё е
+Ơ O
+ơ o
+Ư U
+ư u
+DŽ DZ
+Dž Dz
+dž dz
+LJ LJ
+Lj Lj
+lj lj
+NJ NJ
+Nj Nj
+nj nj
+Ǎ A
+ǎ a
+Ǐ I
+ǐ i
+Ǒ O
+ǒ o
+Ǔ U
+ǔ u
+Ǧ G
+ǧ g
+Ǩ K
+ǩ k
+Ǫ O
+ǫ o
+ǰ j
+DZ DZ
+Dz Dz
+dz dz
+Ǵ G
+ǵ g
+Ǹ N
+ǹ n
+Ȁ A
+ȁ a
+Ȃ A
+ȃ a
+Ȅ E
+ȅ e
+Ȇ E
+ȇ e
+Ȉ I
+ȉ i
+Ȋ I
+ȋ i
+Ȍ O
+ȍ o
+Ȏ O
+ȏ o
+Ȑ R
+ȑ r
+Ȓ R
+ȓ r
+Ȕ U
+ȕ u
+Ȗ U
+ȗ u
+Ș S
+ș s
+Ț T
+ț t
+Ȟ H
+ȟ h
+Ȧ A
+ȧ a
+Ȩ E
+ȩ e
+Ȯ O
+ȯ o
+Ȳ Y
+ȳ y
+Ḁ A
+ḁ a
+Ḃ B
+ḃ b
+Ḅ B
+ḅ b
+Ḇ B
+ḇ b
+Ḋ D
+ḋ d
+Ḍ D
+ḍ d
+Ḏ D
+ḏ d
+Ḑ D
+ḑ d
+Ḓ D
+ḓ d
+Ḙ E
+ḙ e
+Ḛ E
+ḛ e
+Ḟ F
+ḟ f
+Ḡ G
+ḡ g
+Ḣ H
+ḣ h
+Ḥ H
+ḥ h
+Ḧ H
+ḧ h
+Ḩ H
+ḩ h
+Ḫ H
+ḫ h
+Ḭ I
+ḭ i
+Ḱ K
+ḱ k
+Ḳ K
+ḳ k
+Ḵ K
+ḵ k
+Ḷ L
+ḷ l
+Ḻ L
+ḻ l
+Ḽ L
+ḽ l
+Ḿ M
+ḿ m
+Ṁ M
+ṁ m
+Ṃ M
+ṃ m
+Ṅ N
+ṅ n
+Ṇ N
+ṇ n
+Ṉ N
+ṉ n
+Ṋ N
+ṋ n
+Ṕ P
+ṕ p
+Ṗ P
+ṗ p
+Ṙ R
+ṙ r
+Ṛ R
+ṛ r
+Ṟ R
+ṟ r
+Ṡ S
+ṡ s
+Ṣ S
+ṣ s
+Ṫ T
+ṫ t
+Ṭ T
+ṭ t
+Ṯ T
+ṯ t
+Ṱ T
+ṱ t
+Ṳ U
+ṳ u
+Ṵ U
+ṵ u
+Ṷ U
+ṷ u
+Ṽ V
+ṽ v
+Ṿ V
+ṿ v
+Ẁ W
+ẁ w
+Ẃ W
+ẃ w
+Ẅ W
+ẅ w
+Ẇ W
+ẇ w
+Ẉ W
+ẉ w
+Ẋ X
+ẋ x
+Ẍ X
+ẍ x
+Ẏ Y
+ẏ y
+Ẑ Z
+ẑ z
+Ẓ Z
+ẓ z
+Ẕ Z
+ẕ z
+ẖ h
+ẗ t
+ẘ w
+ẙ y
+Ạ A
+ạ a
+Ả A
+ả a
+Ẹ E
+ẹ e
+Ẻ E
+ẻ e
+Ẽ E
+ẽ e
+Ỉ I
+ỉ i
+Ị I
+ị i
+Ọ O
+ọ o
+Ỏ O
+ỏ o
+Ụ U
+ụ u
+Ủ U
+ủ u
+Ỳ Y
+ỳ y
+Ỵ Y
+ỵ y
+Ỷ Y
+ỷ y
+Ỹ Y
+ỹ y
+ff ff
+fi fi
+fl fl
+ffi ffi
+ffl ffl
+st st
+Ø O
+ø o
+Đ D
+đ d
+ı i
+Ħ H
+ħ h
+Ł L
+ł l
+ʼn 'n
+Ŧ T
+ŧ t
Ё Е
+ё е
+Æ AE
+ß ss
+æ ae
+Œ OE
+œ oe