38
38
# For now we are being conservative by including only Latin and Greek. This
39
39
# could be extended in future based on feedback from people with relevant
40
40
# language knowledge.
41
# (first, last) code point pairs; is_plain_letter() treats both bounds as
# inclusive when testing a code point against these ranges.
PLAIN_LETTER_RANGES = (
    (ord('a'), ord('z')),  # Latin lower case
    (ord('A'), ord('Z')),  # Latin upper case
    (0x03b1, 0x03c9),      # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA
    (0x0391, 0x03a9),      # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
)
45
45
46
46
# Combining marks follow a "base" character, and result in a composite
47
47
# character. Example: "U&'A\0300'" produces "À". There are three types of
51
51
# https://en.wikipedia.org/wiki/Combining_character
52
52
# https://www.unicode.org/charts/PDF/U0300.pdf
53
53
# https://www.unicode.org/charts/PDF/U20D0.pdf
54
# (first, last) code point pairs of combining marks, annotated with the
# general category (Mn/Me) of the marks in each range.
COMBINING_MARK_RANGES = (
    (0x0300, 0x0362),  # Mn: Accents, IPA
    (0x20dd, 0x20e0),  # Me: Symbols
    (0x20e2, 0x20e4),  # Me: Screen, keycap, triangle
)
57
+
57
58
58
59
def print_record (codepoint , letter ):
59
60
if letter :
@@ -63,12 +64,14 @@ def print_record(codepoint, letter):
63
64
64
65
print (output )
65
66
67
+
66
68
class Codepoint:
    """A single code point and the properties this script looks at."""

    def __init__(self, id, general_category, combining_ids):
        # Numeric code point value; compared against the letter ranges and
        # passed to chr() when rules are emitted.
        self.id = id
        # General category string, e.g. "Mn", "Me" or "Mc" for marks.
        self.general_category = general_category
        # Ids of the code points this one is composed of; each id is used
        # as a key into the code point table.
        self.combining_ids = combining_ids

    def __repr__(self):
        """Return a debugging representation listing all three attributes."""
        return "Codepoint(id=%r, general_category=%r, combining_ids=%r)" % (
            self.id, self.general_category, self.combining_ids)
71
73
74
+
72
75
def is_mark_to_remove (codepoint ):
73
76
"""Return true if this is a combining mark to remove."""
74
77
if not is_mark (codepoint ):
@@ -79,17 +82,20 @@ def is_mark_to_remove(codepoint):
79
82
return True
80
83
return False
81
84
85
+
82
86
def is_plain_letter(codepoint):
    """Return true if codepoint represents a "plain letter".

    A plain letter is one whose id lies inside any of the (begin, end)
    pairs of PLAIN_LETTER_RANGES, with both bounds inclusive.  The result
    is always a real bool, which callers that test ``is False`` rely on.
    """
    return any(begin <= codepoint.id <= end
               for begin, end in PLAIN_LETTER_RANGES)
88
92
93
+
89
94
def is_mark(codepoint):
    """Returns true for diacritical marks (combining codepoints).

    The decision is based only on codepoint.general_category, which must
    be one of "Mn", "Me" or "Mc" for the codepoint to count as a mark.
    """
    return codepoint.general_category in ("Mn", "Me", "Mc")
92
97
98
+
93
99
def is_letter_with_marks (codepoint , table ):
94
100
"""Returns true for letters combined with one or more marks."""
95
101
# See https://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
@@ -105,16 +111,18 @@ def is_letter_with_marks(codepoint, table):
105
111
106
112
# Check if the base letter of this letter has marks.
107
113
codepoint_base = codepoint .combining_ids [0 ]
108
- if ( is_plain_letter (table [codepoint_base ]) is False and \
109
- is_letter_with_marks (table [codepoint_base ], table ) is False ) :
114
+ if is_plain_letter (table [codepoint_base ]) is False and \
115
+ is_letter_with_marks (table [codepoint_base ], table ) is False :
110
116
return False
111
117
112
118
return True
113
119
120
+
114
121
def is_letter(codepoint, table):
    """Return true for letter with or without diacritical marks.

    codepoint is checked with is_plain_letter() first; only if that fails
    is is_letter_with_marks() consulted, using table to resolve the ids in
    the codepoint's combining_ids.
    """
    return is_plain_letter(codepoint) or is_letter_with_marks(codepoint, table)
117
124
125
+
118
126
def get_plain_letter (codepoint , table ):
119
127
"""Return the base codepoint without marks. If this codepoint has more
120
128
than one combining character, do a recursive lookup on the table to
@@ -133,15 +141,18 @@ def get_plain_letter(codepoint, table):
133
141
# Should not come here
134
142
assert (False )
135
143
144
+
136
145
def is_ligature(codepoint, table):
    """Return true for letters combined with letters.

    Every id in codepoint.combining_ids is looked up in table and must
    satisfy is_letter().  Note that an empty combining_ids yields True,
    since all() of an empty sequence is true.
    """
    return all(is_letter(table[i], table) for i in codepoint.combining_ids)
139
148
149
+
140
150
def get_plain_letters(codepoint, table):
    """Return a list of plain letters from a ligature.

    Each id in codepoint.combining_ids is looked up in table and reduced
    with get_plain_letter(); the results are returned in the same order.
    The codepoint must be a ligature according to is_ligature().
    """
    assert is_ligature(codepoint, table)
    # The loop variable is deliberately not called "id", to avoid shadowing
    # the builtin.
    return [get_plain_letter(table[combining_id], table)
            for combining_id in codepoint.combining_ids]
144
154
155
+
145
156
def parse_cldr_latin_ascii_transliterator (latinAsciiFilePath ):
146
157
"""Parse the XML file and return a set of tuples (src, trg), where "src"
147
158
is the original character and "trg" the substitute."""
@@ -189,21 +200,23 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
189
200
190
201
return charactersSet
191
202
203
+
192
204
def special_cases():
    """Returns the special cases which are not handled by other methods

    The result is a new set of (source code point, replacement string)
    tuples on every call.
    """
    return {
        # Cyrillic
        (0x0401, "\u0415"),  # CYRILLIC CAPITAL LETTER IO
        (0x0451, "\u0435"),  # CYRILLIC SMALL LETTER IO

        # Symbols of "Letterlike Symbols" Unicode Block (U+2100 to U+214F)
        (0x2103, "\xb0C"),   # DEGREE CELSIUS
        (0x2109, "\xb0F"),   # DEGREE FAHRENHEIT
        (0x2117, "(P)"),     # SOUND RECORDING COPYRIGHT
    }
206
218
219
+
207
220
def main (args ):
208
221
# https://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
209
222
decomposition_type_pattern = re .compile (" *<[^>]*> *" )
@@ -238,12 +251,12 @@ def main(args):
238
251
len (codepoint .combining_ids ) > 1 :
239
252
if is_letter_with_marks (codepoint , table ):
240
253
charactersSet .add ((codepoint .id ,
241
- chr (get_plain_letter (codepoint , table ).id )))
254
+ chr (get_plain_letter (codepoint , table ).id )))
242
255
elif args .noLigaturesExpansion is False and is_ligature (codepoint , table ):
243
256
charactersSet .add ((codepoint .id ,
244
- "" .join (chr (combining_codepoint .id )
245
- for combining_codepoint \
246
- in get_plain_letters (codepoint , table ))))
257
+ "" .join (chr (combining_codepoint .id )
258
+ for combining_codepoint
259
+ in get_plain_letters (codepoint , table ))))
247
260
elif is_mark_to_remove (codepoint ):
248
261
charactersSet .add ((codepoint .id , None ))
249
262
@@ -258,6 +271,7 @@ def main(args):
258
271
for characterPair in charactersList :
259
272
print_record (characterPair [0 ], characterPair [1 ])
260
273
274
+
261
275
if __name__ == "__main__" :
262
276
parser = argparse .ArgumentParser (description = 'This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.' )
263
277
parser .add_argument ("--unicode-data-file" , help = "Path to formatted text file corresponding to UnicodeData.txt." , type = str , required = True , dest = 'unicodeDataFilePath' )
0 commit comments