author     Alvaro Herrera   2019-09-10 20:56:11 +0000
committer  Alvaro Herrera   2019-09-10 21:15:15 +0000
commit     0afc0a7841889c6221fd47430e72f4fe570833f4
tree       53f75d9eea31362478cd52a7f85c73ec58e5d696
parent     b438e7e7a1c58e0c20b5f46e73cbd713e8033c69
Fix unaccent generation script in Windows
As originally coded, the script would fail on Windows 10 under Python 3
because stdout was switched to UTF-8 only for Python 2, leaving Python 3
on the platform's default encoding. This patch makes the UTF-8 switch
apply to both versions.
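For illustration, here is a minimal sketch of that encoding fix (not taken verbatim from the patch): wrap stdout in a UTF-8 writer on both major Python versions, going through the underlying binary buffer on Python 3.

import codecs
import sys

if sys.version_info[0] <= 2:
    # Python 2: the text stream itself can be wrapped
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)
else:
    # Python 3: wrap the binary buffer; otherwise the console's default
    # encoding (e.g. cp1252 on Windows) is used and non-ASCII output fails
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)

print(u'\u00e9')  # emits UTF-8 bytes instead of raising UnicodeEncodeError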
Also add Python 2 compatibility markers so that we know what to remove
once we drop support for it. Also use a "with" clause to ensure the file
descriptor is closed promptly.
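A minimal sketch of the "with" pattern the patch adopts, assuming a local UnicodeData.txt path purely for illustration: codecs.open() behaves the same on Python 2 and 3, and the context manager closes the descriptor as soon as the block exits.

import codecs

# 'UnicodeData.txt' is only a placeholder path for this example
with codecs.open('UnicodeData.txt', mode='r', encoding='UTF-8') as f:
    for line in f:
        pass  # each record would be parsed here
# the file descriptor is already closed here, even if an exception was raised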
Author: Hugh Ranalli, Ramanarayana
Reviewed-by: Kyotaro Horiguchi
Discussion: https://postgr.es/m/CAKm4Xs7_61XMyOWmHs3n0mmkS0O4S0pvfWk=7cQ5P0gs177f7A@mail.gmail.com
Discussion: https://postgr.es/m/[email protected]
-rw-r--r--  contrib/unaccent/generate_unaccent_rules.py | 44
1 file changed, 24 insertions(+), 20 deletions(-)
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
index 58b6e7deb74..7a0a96e04f7 100644
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -32,9 +32,15 @@
 # The approach is to be Python3 compatible with Python2 "backports".
 from __future__ import print_function
 from __future__ import unicode_literals
+# END: Python 2/3 compatibility - remove when Python 2 compatibility dropped
+
+import argparse
 import codecs
+import re
 import sys
+import xml.etree.ElementTree as ET
 
+# BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
 if sys.version_info[0] <= 2:
     # Encode stdout as UTF-8, so we can just print to it
     sys.stdout = codecs.getwriter('utf8')(sys.stdout)
@@ -45,12 +51,9 @@ if sys.version_info[0] <= 2:
     # Python 2 and 3 compatible bytes call
     def bytes(source, encoding='ascii', errors='strict'):
         return source.encode(encoding=encoding, errors=errors)
+else:
 # END: Python 2/3 compatibility - remove when Python 2 compatibility dropped
-
-import re
-import argparse
-import sys
-import xml.etree.ElementTree as ET
+    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)
 
 # The ranges of Unicode characters that we consider to be "plain letters".
 # For now we are being conservative by including only Latin and Greek. This
@@ -233,21 +236,22 @@ def main(args):
     charactersSet = set()
 
     # read file UnicodeData.txt
-    unicodeDataFile = open(args.unicodeDataFilePath, 'r')
-
-    # read everything we need into memory
-    for line in unicodeDataFile:
-        fields = line.split(";")
-        if len(fields) > 5:
-            # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
-            general_category = fields[2]
-            decomposition = fields[5]
-            decomposition = re.sub(decomposition_type_pattern, ' ', decomposition)
-            id = int(fields[0], 16)
-            combining_ids = [int(s, 16) for s in decomposition.split(" ") if s != ""]
-            codepoint = Codepoint(id, general_category, combining_ids)
-            table[id] = codepoint
-            all.append(codepoint)
+    with codecs.open(
+          args.unicodeDataFilePath, mode='r', encoding='UTF-8',
+          ) as unicodeDataFile:
+        # read everything we need into memory
+        for line in unicodeDataFile:
+            fields = line.split(";")
+            if len(fields) > 5:
+                # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
+                general_category = fields[2]
+                decomposition = fields[5]
+                decomposition = re.sub(decomposition_type_pattern, ' ', decomposition)
+                id = int(fields[0], 16)
+                combining_ids = [int(s, 16) for s in decomposition.split(" ") if s != ""]
+                codepoint = Codepoint(id, general_category, combining_ids)
+                table[id] = codepoint
+                all.append(codepoint)
 
     # walk through all the codepoints looking for interesting mappings
     for codepoint in all:
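For context, a small standalone sketch of how one UnicodeData.txt record is interpreted by the loop wrapped in the "with" block above; the sample line (U+00E9) and the tag-stripping pattern are illustrative assumptions, not part of the patch.

import re

# assumption: a pattern that strips "<compat>"-style decomposition tags,
# standing in for the script's decomposition_type_pattern
decomposition_type_pattern = re.compile(r" *<[^>]*> *")

line = "00E9;LATIN SMALL LETTER E WITH ACUTE;Ll;0;L;0065 0301;;;;N;;;00C9;;00C9"
fields = line.split(";")
general_category = fields[2]                      # "Ll": lowercase letter
decomposition = re.sub(decomposition_type_pattern, ' ', fields[5])
codepoint_id = int(fields[0], 16)                 # 0x00E9 -> 233
combining_ids = [int(s, 16) for s in decomposition.split(" ") if s != ""]
print(codepoint_id, general_category, combining_ids)  # 233 Ll [101, 769]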