diff options
author | Marko Kreen | 2011-07-05 09:48:06 +0000 |
---|---|---|
committer | Marko Kreen | 2011-07-05 09:48:06 +0000 |
commit | 7e52d67c4079023bd4a92acd493aa872adc22c91 (patch) | |
tree | a22685cf10de18c1aead7696f4a1f1b04190b2c8 | |
parent | 856fa31833d666e3ab6bc211bf71554caad8ac59 (diff) |
safe_utf8_decode: stop using builtin 'replace', it's broken
The default 'replace' handler can eat several symbols at once,
which means it can corrupt even regular ASCII.
-rw-r--r-- | python/skytools/utf8.py | 26 |
1 files changed, 24 insertions, 2 deletions
diff --git a/python/skytools/utf8.py b/python/skytools/utf8.py
index e25888b6..baf5b794 100644
--- a/python/skytools/utf8.py
+++ b/python/skytools/utf8.py
@@ -13,9 +13,11 @@ talking with other software that uses stricter parsers.
 (False, u'X\ufffdZ')
 >>> safe_utf8_decode('OK')
 (True, u'OK')
+>>> safe_utf8_decode('X\xF1Y')
+(False, u'X\ufffdY')
 """
 
-import re
+import re, codecs
 
 __all__ = ['safe_utf8_decode']
 
@@ -54,6 +56,26 @@ def sanitize_unicode(u):
     u = _urc.sub(_fix_utf8, u)
     return u
 
+def safe_replace(exc):
+    """Replace only one symbol at a time.
+
+    Builtin .decode('xxx', 'replace') replaces several symbols
+    together, which is unsafe.
+    """
+    if not isinstance(exc, UnicodeDecodeError):
+        raise exc
+    c2 = REPLACEMENT_SYMBOL
+
+    # we could assume latin1
+    if 0:
+        c1 = exc.object[exc.start]
+        c2 = unichr(ord(c1))
+
+    return c2, exc.start + 1
+
+# register, it will be globally available
+codecs.register_error("safe_replace", safe_replace)
+
 def safe_utf8_decode(s):
     """Decode UTF-8 safely.
 
@@ -71,7 +93,7 @@ def safe_utf8_decode(s):
         # expect no errors by default
         u = s.decode('utf8')
     except UnicodeDecodeError:
-        u = s.decode('utf8', 'replace')
+        u = s.decode('utf8', 'safe_replace')
         ok = False
 
     u2 = sanitize_unicode(u)