summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Marko Kreen 2011-07-05 09:48:06 +0000
committer Marko Kreen 2011-07-05 09:48:06 +0000
commit 7e52d67c4079023bd4a92acd493aa872adc22c91 (patch)
tree a22685cf10de18c1aead7696f4a1f1b04190b2c8
parent 856fa31833d666e3ab6bc211bf71554caad8ac59 (diff)
safe_utf8_decode: stop using builtin 'replace', it's broken
The default 'replace' error handler can eat several symbols at once, which means it can corrupt even regular ASCII.
-rw-r--r-- python/skytools/utf8.py 26
1 files changed, 24 insertions, 2 deletions
diff --git a/python/skytools/utf8.py b/python/skytools/utf8.py
index e25888b6..baf5b794 100644
--- a/python/skytools/utf8.py
+++ b/python/skytools/utf8.py
@@ -13,9 +13,11 @@ talking with other software that uses stricter parsers.
(False, u'X\ufffdZ')
>>> safe_utf8_decode('OK')
(True, u'OK')
+>>> safe_utf8_decode('X\xF1Y')
+(False, u'X\ufffdY')
"""
-import re
+import re, codecs
__all__ = ['safe_utf8_decode']
@@ -54,6 +56,26 @@ def sanitize_unicode(u):
u = _urc.sub(_fix_utf8, u)
return u
+def safe_replace(exc):
+ """Replace only one symbol at a time.
+
+ Builtin .decode('xxx', 'replace') replaces several symbols
+ together, which is unsafe.
+ """
+ if not isinstance(exc, UnicodeDecodeError):
+ raise exc
+ c2 = REPLACEMENT_SYMBOL
+
+ # we could assume latin1
+ if 0:
+ c1 = exc.object[exc.start]
+ c2 = unichr(ord(c1))
+
+ return c2, exc.start + 1
+
+# register, it will be globally available
+codecs.register_error("safe_replace", safe_replace)
+
def safe_utf8_decode(s):
"""Decode UTF-8 safely.
@@ -71,7 +93,7 @@ def safe_utf8_decode(s):
# expect no errors by default
u = s.decode('utf8')
except UnicodeDecodeError:
- u = s.decode('utf8', 'replace')
+ u = s.decode('utf8', 'safe_replace')
ok = False
u2 = sanitize_unicode(u)