author | Marko Kreen | 2011-06-28 12:32:28 +0000
committer | Marko Kreen | 2011-06-29 07:10:58 +0000
commit | 336e17cca156d8ccccddb43211c475d40a60eabb (patch)
tree | eda8d2390731a9040976334ebb6ada529ccbc187
parent | 5f1e4b5ea594d32fdcd4c3d10d6d92ebc5f28b0f (diff)
Simplify utf8 sanitizer and move under skytools/
This patch moves the sanitizer under skytools.utf8 and makes
it generally usable and better testable.
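
For context, a quick sketch of how the relocated helper is used. The expected results mirror the doctests added in python/skytools/utf8.py (Python 2, matching the module's use of unicode/unichr); the variable names are illustrative only:

    from skytools.utf8 import safe_utf8_decode

    # Valid UTF-8 passes through untouched and is reported as ok.
    ok, u = safe_utf8_decode("foobar")
    assert ok and u == u'foobar'

    # A UTF-16 surrogate pair encoded as two 3-byte sequences is merged
    # into a single code point.
    ok, u = safe_utf8_decode('X\xed\xa0\x80\xed\xb0\x89Z')
    assert not ok and u == u'X\U00010009Z'

    # Lone surrogates and NUL bytes are replaced with U+FFFD.
    ok, u = safe_utf8_decode('X\0Z')
    assert not ok and u == u'X\ufffdZ'
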
-rw-r--r-- | python/londiste/handlers/dispatch.py | 101
-rw-r--r-- | python/skytools/__init__.py | 2
-rw-r--r-- | python/skytools/utf8.py | 85
3 files changed, 113 insertions, 75 deletions
diff --git a/python/londiste/handlers/dispatch.py b/python/londiste/handlers/dispatch.py
index f861af72..b2663448 100644
--- a/python/londiste/handlers/dispatch.py
+++ b/python/londiste/handlers/dispatch.py
@@ -149,6 +149,7 @@ import skytools
 from londiste.handler import BaseHandler
 from skytools import quote_ident, quote_fqident, UsageError
 from skytools.dbstruct import *
+from skytools.utf8 import safe_utf8_decode
 
 __all__ = ['Dispatcher']
 
@@ -576,96 +577,46 @@ ROW_HANDLERS = {'plain': RowHandler,
 # ENCODING VALIDATOR
 #------------------------------------------------------------------------------
 
-# stores current EncodingValidator
-FIXENC_DATA = None
-
-# find UTF16 surrogate pairs
-_sgrc = re.compile(u"""
-    [\uD800-\uDBFF] [\uDC00-\uDFFF] ?
-    | [\uDC00-\uDFFF]
-    """, re.X)
-
 class EncodingValidator:
     def __init__(self, log, encoding = 'utf-8', replacement = u'\ufffd'):
         """validates the correctness of given encoding. when data contains
         illegal symbols, replaces them with <replacement> and logs the
         incident"""
         self.log = log
-        self.encoding = encoding
-        self.replacement = replacement
         self.columns = None
         self.error_count = 0
 
-    def validate(self, data, columns):
-        """sets self to global FIXENC_DATA object and calls decode with
-        registered error handler"""
-        global FIXENC_DATA
-        FIXENC_DATA = self
-        self.columns = columns
-        self.error_count = 0
-        _unicode = data.decode(self.encoding, "fixenc_error_handler")
-        # python does not tag surrogate pairs as error, fix them explicitly
-        _unicode = _sgrc.sub(self.sgfix, _unicode)
-        # when no erros then return input data as is, else re-encode fixed data
-        if self.error_count == 0:
+    def show_error(self, col, val):
+        self.log.error('Invalid UTF8 in column <%s>: %s', col, repr(val))
+
+    def validate_copy(self, data, columns):
+        """Validate tab-separated fields"""
+
+        ok, _unicode = safe_utf8_decode(data)
+        if ok:
             return data
-        else:
-            return _unicode.encode(self.encoding)
-
-    def sgfix(self, m):
-        """Fix UTF16 surrogate pair"""
-        self.error_count += 1
-        val = m.group()
-        if len(val) == 2:
-            self.log.warning('combining utf16 surrogate pair')
-            c1 = ord(val[0])
-            c2 = ord(val[1])
-            c = 0x10000 + ((c1 & 0x3FF) << 10) + (c2 & 0x3FF)
-            return unichr(c)
-        else:
-            self.log.warning('replacing utf16 surrogate code')
-            return self.replacement
+
+        # log error
+        vals = data.split('\t')
+        for i, v in enumerate(vals):
+            ok, tmp = safe_utf8_decode(v)
+            if not ok:
+                self.show_error(columns[i], v)
+
+        # return safe data
+        return _unicode.encode('utf8')
 
     def validate_dict(self, data):
         """validates data in dict"""
-        for _key, _val in data.items():
-            if _val:
-                _fixed = self.validate(_val, [_key])
-                if self.error_count != 0:
-                    data[_key] = _fixed
+        for k, v in data.items():
+            if v:
+                ok, u = safe_utf8_decode(v)
+                if not ok:
+                    self.show_error(k, v)
+                    data[k] = u.encode('utf8')
         return data
 
-
-def fixenc_error_handler(exc):
-    """when error occurs in decoding, replaces char causing it, logs errors
-    together with column name containing invalid data"""
-    global FIXENC_DATA
-    if not FIXENC_DATA:
-        raise exc
-    # process only UnicodeDecodeError
-    if not isinstance(exc, UnicodeDecodeError):
-        raise exc
-    # find starting position of line with error and log warning
-    _line_start = exc.object.rfind('\n', 0, exc.start) + 1
-    try:
-        _col = FIXENC_DATA.columns[exc.object.count('\t', _line_start, exc.start)]
-    except Exception, e:
-        FIXENC_DATA.log.warning('Error when detecting column: %s' % e)
-        _col = '<unknown>'
-    _msg = "replacing invalid %s sequence %r in column %s"%\
-           (FIXENC_DATA.encoding, exc.object[exc.start:exc.end], _col)
-    FIXENC_DATA.log.warning(_msg)
-    # increase error count
-    FIXENC_DATA.error_count += 1
-    # return replacement char and position to continue from
-    # NB! doesn't replace multiple symbols, so it's harder to break file
-    # structure like replace \t or \n
-    return FIXENC_DATA.replacement, exc.start + 1
-
-
-codecs.register_error("fixenc_error_handler", fixenc_error_handler)
-
-
 #------------------------------------------------------------------------------
 # DISPATCHER
 #------------------------------------------------------------------------------
@@ -925,7 +876,7 @@ class Dispatcher(BaseHandler):
 
         if self.encoding_validator:
             def _write_hook(obj, data):
-                return self.encoding_validator.validate(data, _src_cols)
+                return self.encoding_validator.validate_copy(data, _src_cols)
 
         return skytools.full_copy(tablename, src_curs, dst_curs,
                                   _src_cols, condition,
                                   self.table_name, _dst_cols,
diff --git a/python/skytools/__init__.py b/python/skytools/__init__.py
index ba10f47b..837a57e2 100644
--- a/python/skytools/__init__.py
+++ b/python/skytools/__init__.py
@@ -104,6 +104,8 @@ _symbols = {
     'run_lookup': 'skytools.querybuilder:run_lookup',
     'run_query': 'skytools.querybuilder:run_query',
     'run_query_row': 'skytools.querybuilder:run_query_row',
+    # skytools.utf8
+    'safe_utf8_decode': 'skytools.utf8:safe_utf8_decode',
 }
 
 __all__ = _symbols.keys()
diff --git a/python/skytools/utf8.py b/python/skytools/utf8.py
new file mode 100644
index 00000000..e25888b6
--- /dev/null
+++ b/python/skytools/utf8.py
@@ -0,0 +1,85 @@
+r"""UTF-8 sanitizer.
+
+Python's UTF-8 parser is quite relaxed, this creates problems when
+talking with other software that uses stricter parsers.
+
+>>> safe_utf8_decode("foobar")
+(True, u'foobar')
+>>> safe_utf8_decode('X\xed\xa0\x80Y\xed\xb0\x89Z')
+(False, u'X\ufffdY\ufffdZ')
+>>> safe_utf8_decode('X\xed\xa0\x80\xed\xb0\x89Z')
+(False, u'X\U00010009Z')
+>>> safe_utf8_decode('X\0Z')
+(False, u'X\ufffdZ')
+>>> safe_utf8_decode('OK')
+(True, u'OK')
+"""
+
+import re
+
+__all__ = ['safe_utf8_decode']
+
+# by default, use same symbol as 'replace'
+REPLACEMENT_SYMBOL = unichr(0xFFFD)
+
+def _fix_utf8(m):
+    """Merge UTF16 surrogates, replace others"""
+    u = m.group()
+    if len(u) == 2:
+        # merge into single symbol
+        c1 = ord(u[0])
+        c2 = ord(u[1])
+        c = 0x10000 + ((c1 & 0x3FF) << 10) + (c2 & 0x3FF)
+        return unichr(c)
+    else:
+        # use replacement symbol
+        return REPLACEMENT_SYMBOL
+
+_urc = None
+
+def sanitize_unicode(u):
+    """Fix invalid symbols in unicode string."""
+    global _urc
+
+    assert isinstance(u, unicode)
+
+    # regex for finding invalid chars, works on unicode string
+    if not _urc:
+        rx = u"[\uD800-\uDBFF] [\uDC00-\uDFFF]? | [\0\uDC00-\uDFFF]"
+        _urc = re.compile(rx, re.X)
+
+    # now find and fix UTF16 surrogates
+    m = _urc.search(u)
+    if m:
+        u = _urc.sub(_fix_utf8, u)
+    return u
+
+def safe_utf8_decode(s):
+    """Decode UTF-8 safely.
+
+    Acts like str.decode('utf8', 'replace') but also fixes
+    UTF16 surrogates and NUL bytes, which Python's default
+    decoder does not do.
+
+    @param s: utf8-encoded byte string
+    @return: tuple of (was_valid_utf8, unicode_string)
+    """
+
+    # decode with error detection
+    ok = True
+    try:
+        # expect no errors by default
+        u = s.decode('utf8')
+    except UnicodeDecodeError:
+        u = s.decode('utf8', 'replace')
+        ok = False
+
+    u2 = sanitize_unicode(u)
+    if u is not u2:
+        ok = False
+    return (ok, u2)
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
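
For illustration, a minimal sketch of how the reworked EncodingValidator might be driven. The class, constructor, and method signatures come from the diff above; the logger name and sample data are made-up assumptions:

    import logging
    from londiste.handlers.dispatch import EncodingValidator

    log = logging.getLogger('dispatch')   # hypothetical logger, for illustration
    validator = EncodingValidator(log)

    # COPY path: one tab-separated row is checked as a whole; if it contains
    # invalid UTF-8, the offending columns are logged via show_error() and
    # sanitized, re-encoded bytes are returned.
    row = 'alice\t\xed\xa0\x80broken'     # made-up sample row
    safe_row = validator.validate_copy(row, ['username', 'note'])

    # Event path: a dict of column -> value is fixed in place and returned.
    ev = validator.validate_dict({'username': 'bob', 'note': 'X\0Z'})
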