summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Thomas Munro, 2018-09-01 19:12:24 +0000
committer: Thomas Munro, 2018-09-01 19:12:24 +0000
commit: 5e8d670c313531c0dca245943fb84c94a477ddc4 (patch)
tree: 73767cbc33f94e6d14f1d7a0fe4aa42bdb707a6b
parent: ec74369931687885cfb6ce9dac55deefdb410086 (diff)
Add Greek characters to unaccent.rules.
Author: Tasos Maschalidis
Reviewed-by: Michael Paquier, Tom Lane
Discussion: https://postgr.es/m/153495048900.1368.11566580687623014380%40wrigleys.postgresql.org
Discussion: https://postgr.es/m/VI1PR01MB38537EBD529FE5EE3FE9A5FEB5370%40VI1PR01MB3853.eurprd01.prod.exchangelabs.com
-rw-r--r-- contrib/unaccent/generate_unaccent_rules.py 19
-rw-r--r-- contrib/unaccent/unaccent.rules 221
2 files changed, 236 insertions, 4 deletions
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
index 4b1b011861..859cac40fa 100644
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -29,6 +29,15 @@ import argparse
import sys
import xml.etree.ElementTree as ET
+# The ranges of Unicode characters that we consider to be "plain letters".
+# For now we are being conservative by including only Latin and Greek. This
+# could be extended in future based on feedback from people with relevant
+# language knowledge.
+PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case
+ (ord('A'), ord('Z')), # Latin upper case
+ (0x03b1, 0x03c9), # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA
+ (0x0391, 0x03a9)) # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
+
def print_record(codepoint, letter):
print (unichr(codepoint) + "\t" + letter).encode("UTF-8")
@@ -39,9 +48,11 @@ class Codepoint:
self.combining_ids = combining_ids
def is_plain_letter(codepoint):
- """Return true if codepoint represents a plain ASCII letter."""
- return (codepoint.id >= ord('a') and codepoint.id <= ord('z')) or \
- (codepoint.id >= ord('A') and codepoint.id <= ord('Z'))
+ """Return true if codepoint represents a "plain letter"."""
+ for begin, end in PLAIN_LETTER_RANGES:
+ if codepoint.id >= begin and codepoint.id <= end:
+ return True
+ return False
def is_mark(codepoint):
"""Returns true for diacritical marks (combining codepoints)."""
@@ -184,7 +195,7 @@ def main(args):
len(codepoint.combining_ids) > 1:
if is_letter_with_marks(codepoint, table):
charactersSet.add((codepoint.id,
- chr(get_plain_letter(codepoint, table).id)))
+ unichr(get_plain_letter(codepoint, table).id)))
elif args.noLigaturesExpansion is False and is_ligature(codepoint, table):
charactersSet.add((codepoint.id,
"".join(unichr(combining_codepoint.id)
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules
index 97f9ed47cf..76e4e69beb 100644
--- a/contrib/unaccent/unaccent.rules
+++ b/contrib/unaccent/unaccent.rules
@@ -399,6 +399,26 @@
ʦ ts
ʪ ls
ʫ lz
+Ά Α
+Έ Ε
+Ή Η
+Ί Ι
+Ό Ο
+Ύ Υ
+Ώ Ω
+ΐ ι
+Ϊ Ι
+Ϋ Υ
+ά α
+έ ε
+ή η
+ί ι
+ΰ υ
+ϊ ι
+ϋ υ
+ό ο
+ύ υ
+ώ ω
Ё Е
ё е
ᴀ A
@@ -709,6 +729,207 @@
ỽ v
Ỿ Y
ỿ y
+ἀ α
+ἁ α
+ἂ α
+ἃ α
+ἄ α
+ἅ α
+ἆ α
+ἇ α
+Ἀ Α
+Ἁ Α
+Ἂ Α
+Ἃ Α
+Ἄ Α
+Ἅ Α
+Ἆ Α
+Ἇ Α
+ἐ ε
+ἑ ε
+ἒ ε
+ἓ ε
+ἔ ε
+ἕ ε
+Ἐ Ε
+Ἑ Ε
+Ἒ Ε
+Ἓ Ε
+Ἔ Ε
+Ἕ Ε
+ἠ η
+ἡ η
+ἢ η
+ἣ η
+ἤ η
+ἥ η
+ἦ η
+ἧ η
+Ἠ Η
+Ἡ Η
+Ἢ Η
+Ἣ Η
+Ἤ Η
+Ἥ Η
+Ἦ Η
+Ἧ Η
+ἰ ι
+ἱ ι
+ἲ ι
+ἳ ι
+ἴ ι
+ἵ ι
+ἶ ι
+ἷ ι
+Ἰ Ι
+Ἱ Ι
+Ἲ Ι
+Ἳ Ι
+Ἴ Ι
+Ἵ Ι
+Ἶ Ι
+Ἷ Ι
+ὀ ο
+ὁ ο
+ὂ ο
+ὃ ο
+ὄ ο
+ὅ ο
+Ὀ Ο
+Ὁ Ο
+Ὂ Ο
+Ὃ Ο
+Ὄ Ο
+Ὅ Ο
+ὐ υ
+ὑ υ
+ὒ υ
+ὓ υ
+ὔ υ
+ὕ υ
+ὖ υ
+ὗ υ
+Ὑ Υ
+Ὓ Υ
+Ὕ Υ
+Ὗ Υ
+ὠ ω
+ὡ ω
+ὢ ω
+ὣ ω
+ὤ ω
+ὥ ω
+ὦ ω
+ὧ ω
+Ὠ Ω
+Ὡ Ω
+Ὢ Ω
+Ὣ Ω
+Ὤ Ω
+Ὥ Ω
+Ὦ Ω
+Ὧ Ω
+ὰ α
+ὲ ε
+ὴ η
+ὶ ι
+ὸ ο
+ὺ υ
+ὼ ω
+ᾀ α
+ᾁ α
+ᾂ α
+ᾃ α
+ᾄ α
+ᾅ α
+ᾆ α
+ᾇ α
+ᾈ Α
+ᾉ Α
+ᾊ Α
+ᾋ Α
+ᾌ Α
+ᾍ Α
+ᾎ Α
+ᾏ Α
+ᾐ η
+ᾑ η
+ᾒ η
+ᾓ η
+ᾔ η
+ᾕ η
+ᾖ η
+ᾗ η
+ᾘ Η
+ᾙ Η
+ᾚ Η
+ᾛ Η
+ᾜ Η
+ᾝ Η
+ᾞ Η
+ᾟ Η
+ᾠ ω
+ᾡ ω
+ᾢ ω
+ᾣ ω
+ᾤ ω
+ᾥ ω
+ᾦ ω
+ᾧ ω
+ᾨ Ω
+ᾩ Ω
+ᾪ Ω
+ᾫ Ω
+ᾬ Ω
+ᾭ Ω
+ᾮ Ω
+ᾯ Ω
+ᾰ α
+ᾱ α
+ᾲ α
+ᾳ α
+ᾴ α
+ᾶ α
+ᾷ α
+Ᾰ Α
+Ᾱ Α
+Ὰ Α
+ᾼ Α
+ῂ η
+ῃ η
+ῄ η
+ῆ η
+ῇ η
+Ὲ Ε
+Ὴ Η
+ῌ Η
+ῐ ι
+ῑ ι
+ῒ ι
+ῖ ι
+ῗ ι
+Ῐ Ι
+Ῑ Ι
+Ὶ Ι
+ῠ υ
+ῡ υ
+ῢ υ
+ῤ ρ
+ῥ ρ
+ῦ υ
+ῧ υ
+Ῠ Υ
+Ῡ Υ
+Ὺ Υ
+Ῥ Ρ
+ῲ ω
+ῳ ω
+ῴ ω
+ῶ ω
+ῷ ω
+Ὸ Ο
+Ὼ Ω
+ῼ Ω
‐ -
‑ -
‒ -