|
20 | 20 | # option is enabled, the XML file of this transliterator [2] -- given as a
|
21 | 21 | # command line argument -- will be parsed and used.
|
22 | 22 | #
|
| 23 | +# Ideally you should use the latest release for each data set. For |
| 24 | +# Latin-ASCII.xml, the latest data sets released can be browsed directly |
| 25 | +# via [3]. Note that this script is compatible with at least release 29. |
| 26 | +# |
23 | 27 | # [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
|
24 |
| -# [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml |
| 28 | +# [2] http://unicode.org/cldr/trac/export/14746/tags/release-34/common/transforms/Latin-ASCII.xml |
| 29 | +# [3] https://unicode.org/cldr/trac/browser/tags |
25 | 30 |
|
26 | 31 | # BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
|
27 | 32 | # The approach is to be Python3 compatible with Python2 "backports".
|
@@ -140,8 +145,18 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
|
140 | 145 | transliterationTree = ET.parse(latinAsciiFilePath)
|
141 | 146 | transliterationTreeRoot = transliterationTree.getroot()
|
142 | 147 |
|
143 |
| - for rule in transliterationTreeRoot.findall("./transforms/transform/tRule"): |
144 |
| - matches = rulePattern.search(rule.text) |
| 148 | + # Fetch all the transliteration rules. Since release 29 of Latin-ASCII.xml |
| 149 | + # all the transliteration rules are located in a single tRule block with |
| 150 | + # all rules separated into separate lines. |
| 151 | + blockRules = transliterationTreeRoot.findall("./transforms/transform/tRule") |
| 152 | + assert(len(blockRules) == 1) |
| 153 | + |
| 154 | + # Split the block of rules into one element per line. |
| 155 | + rules = blockRules[0].text.splitlines() |
| 156 | + |
| 157 | + # And finish the processing of each individual rule. |
| 158 | + for rule in rules: |
| 159 | + matches = rulePattern.search(rule) |
145 | 160 |
|
146 | 161 | # The regular expression captures four groups corresponding
|
147 | 162 | # to the characters.
|
|
0 commit comments