author     Alvaro Herrera    2019-09-10 20:56:11 +0000
committer  Alvaro Herrera    2019-09-10 21:15:15 +0000
commit     0afc0a7841889c6221fd47430e72f4fe570833f4 (patch)
tree       53f75d9eea31362478cd52a7f85c73ec58e5d696
parent     b438e7e7a1c58e0c20b5f46e73cbd713e8033c69 (diff)
Fix unaccent generation script in Windows
As originally coded, the script would fail on Windows 10 with Python 3,
because stdout was switched to UTF-8 only for Python 2. This patch makes
the UTF-8 switch apply to both versions.

Also add Python 2 compatibility markers so that we know what to remove
once we drop support for that version. Also use a "with" clause to ensure
the file descriptor is closed promptly.

Author: Hugh Ranalli, Ramanarayana
Reviewed-by: Kyotaro Horiguchi
Discussion: https://postgr.es/m/CAKm4Xs7_61XMyOWmHs3n0mmkS0O4S0pvfWk=7cQ5P0gs177f7A@mail.gmail.com
Discussion: https://postgr.es/m/[email protected]
-rw-r--r--  contrib/unaccent/generate_unaccent_rules.py  |  44
1 file changed, 24 insertions(+), 20 deletions(-)
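For context, the following is a minimal, standalone sketch of the two patterns the patch applies; it is not the script itself, and the input file name 'UnicodeData.txt' plus the sample character are used here purely for illustration (the real script takes the path from its command-line arguments).

# Sketch of the pattern used by the patch: wrap stdout in a UTF-8 writer on
# both Python 2 and Python 3, and read input through codecs.open() inside a
# "with" block so the file descriptor is closed promptly.
from __future__ import print_function
from __future__ import unicode_literals

import codecs
import sys

if sys.version_info[0] <= 2:
    # Python 2: sys.stdout is a byte stream, so wrap it directly.
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)
else:
    # Python 3: sys.stdout is a text stream; wrap its underlying binary buffer.
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)

# No UnicodeEncodeError even when the console's default encoding is not UTF-8.
print(u'\u00e9')

# Illustrative input path; the real script reads args.unicodeDataFilePath.
with codecs.open('UnicodeData.txt', mode='r', encoding='UTF-8') as f:
    for line in f:
        fields = line.split(';')
        print(fields[0])  # first semicolon-separated field (the code point)

In Python 3-only code the same effect is usually obtained with sys.stdout.reconfigure(encoding='utf-8') or the PYTHONIOENCODING environment variable, but the codecs wrapper shown above is one way to cover both interpreter versions.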
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
index 58b6e7deb74..7a0a96e04f7 100644
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -32,9 +32,15 @@
# The approach is to be Python3 compatible with Python2 "backports".
from __future__ import print_function
from __future__ import unicode_literals
+# END: Python 2/3 compatibility - remove when Python 2 compatibility dropped
+
+import argparse
import codecs
+import re
import sys
+import xml.etree.ElementTree as ET

+# BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
if sys.version_info[0] <= 2:
    # Encode stdout as UTF-8, so we can just print to it
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)
@@ -45,12 +51,9 @@ if sys.version_info[0] <= 2:
    # Python 2 and 3 compatible bytes call
    def bytes(source, encoding='ascii', errors='strict'):
        return source.encode(encoding=encoding, errors=errors)
+else:
# END: Python 2/3 compatibility - remove when Python 2 compatibility dropped
-
-import re
-import argparse
-import sys
-import xml.etree.ElementTree as ET
+    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)

# The ranges of Unicode characters that we consider to be "plain letters".
# For now we are being conservative by including only Latin and Greek. This
@@ -233,21 +236,22 @@ def main(args):
    charactersSet = set()

    # read file UnicodeData.txt
-    unicodeDataFile = open(args.unicodeDataFilePath, 'r')
-
-    # read everything we need into memory
-    for line in unicodeDataFile:
-        fields = line.split(";")
-        if len(fields) > 5:
-            # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
-            general_category = fields[2]
-            decomposition = fields[5]
-            decomposition = re.sub(decomposition_type_pattern, ' ', decomposition)
-            id = int(fields[0], 16)
-            combining_ids = [int(s, 16) for s in decomposition.split(" ") if s != ""]
-            codepoint = Codepoint(id, general_category, combining_ids)
-            table[id] = codepoint
-            all.append(codepoint)
+    with codecs.open(
+      args.unicodeDataFilePath, mode='r', encoding='UTF-8',
+      ) as unicodeDataFile:
+        # read everything we need into memory
+        for line in unicodeDataFile:
+            fields = line.split(";")
+            if len(fields) > 5:
+                # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
+                general_category = fields[2]
+                decomposition = fields[5]
+                decomposition = re.sub(decomposition_type_pattern, ' ', decomposition)
+                id = int(fields[0], 16)
+                combining_ids = [int(s, 16) for s in decomposition.split(" ") if s != ""]
+                codepoint = Codepoint(id, general_category, combining_ids)
+                table[id] = codepoint
+                all.append(codepoint)

    # walk through all the codepoints looking for interesting mappings
    for codepoint in all: