author | Marko Kreen | 2011-06-28 12:32:28 +0000
committer | Marko Kreen | 2011-06-29 07:10:58 +0000
commit | 336e17cca156d8ccccddb43211c475d40a60eabb (patch)
tree | eda8d2390731a9040976334ebb6ada529ccbc187
parent | 5f1e4b5ea594d32fdcd4c3d10d6d92ebc5f28b0f (diff)
Simplify utf8 sanitizer and move under skytools/
This patch moves the sanitizer under skytools.utf8 and makes
it generally usable and better testable.
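
For context, a quick sketch of how the relocated helper is used. The expected results mirror the doctests added in python/skytools/utf8.py (Python 2, matching the module's use of unicode/unichr); the variable names are illustrative only:

    from skytools.utf8 import safe_utf8_decode

    # Valid UTF-8 passes through untouched and is reported as ok.
    ok, u = safe_utf8_decode("foobar")
    assert ok and u == u'foobar'

    # A UTF-16 surrogate pair encoded as two 3-byte sequences is merged
    # into a single code point.
    ok, u = safe_utf8_decode('X\xed\xa0\x80\xed\xb0\x89Z')
    assert not ok and u == u'X\U00010009Z'

    # Lone surrogates and NUL bytes are replaced with U+FFFD.
    ok, u = safe_utf8_decode('X\0Z')
    assert not ok and u == u'X\ufffdZ'
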
-rw-r--r-- | python/londiste/handlers/dispatch.py | 101
-rw-r--r-- | python/skytools/__init__.py | 2
-rw-r--r-- | python/skytools/utf8.py | 85
3 files changed, 113 insertions, 75 deletions
diff --git a/python/londiste/handlers/dispatch.py b/python/londiste/handlers/dispatch.py
index f861af72..b2663448 100644
--- a/python/londiste/handlers/dispatch.py
+++ b/python/londiste/handlers/dispatch.py
@@ -149,6 +149,7 @@ import skytools
 from londiste.handler import BaseHandler
 from skytools import quote_ident, quote_fqident, UsageError
 from skytools.dbstruct import *
+from skytools.utf8 import safe_utf8_decode
 
 __all__ = ['Dispatcher']
 
@@ -576,96 +577,46 @@ ROW_HANDLERS = {'plain': RowHandler,
 # ENCODING VALIDATOR
 #------------------------------------------------------------------------------
 
-# stores current EncodingValidator
-FIXENC_DATA = None
-
-# find UTF16 surrogate pairs
-_sgrc = re.compile(u"""
-    [\uD800-\uDBFF] [\uDC00-\uDFFF] ?
-    | [\uDC00-\uDFFF]
-    """, re.X)
-
 class EncodingValidator:
     def __init__(self, log, encoding = 'utf-8', replacement = u'\ufffd'):
         """validates the correctness of given encoding. when data contains
         illegal symbols, replaces them with <replacement> and logs the
         incident"""
         self.log = log
-        self.encoding = encoding
-        self.replacement = replacement
         self.columns = None
         self.error_count = 0
 
-    def validate(self, data, columns):
-        """sets self to global FIXENC_DATA object and calls decode with
-        registered error handler"""
-        global FIXENC_DATA
-        FIXENC_DATA = self
-        self.columns = columns
-        self.error_count = 0
-        _unicode = data.decode(self.encoding, "fixenc_error_handler")
-        # python does not tag surrogate pairs as error, fix them explicitly
-        _unicode = _sgrc.sub(self.sgfix, _unicode)
-        # when no erros then return input data as is, else re-encode fixed data
-        if self.error_count == 0:
+    def show_error(self, col, val):
+        self.log.error('Invalid UTF8 in column <%s>: %s', col, repr(val))
+
+    def validate_copy(self, data, columns):
+        """Validate tab-separated fields"""
+
+        ok, _unicode = safe_utf8_decode(data)
+        if ok:
             return data
-        else:
-            return _unicode.encode(self.encoding)
-
-    def sgfix(self, m):
-        """Fix UTF16 surrogate pair"""
-        self.error_count += 1
-        val = m.group()
-        if len(val) == 2:
-            self.log.warning('combining utf16 surrogate pair')
-            c1 = ord(val[0])
-            c2 = ord(val[1])
-            c = 0x10000 + ((c1 & 0x3FF) << 10) + (c2 & 0x3FF)
-            return unichr(c)
-        else:
-            self.log.warning('replacing utf16 surrogate code')
-            return self.replacement
+
+        # log error
+        vals = data.split('\t')
+        for i, v in enumerate(vals):
+            ok, tmp = safe_utf8_decode(v)
+            if not ok:
+                self.show_error(columns[i], v)
+
+        # return safe data
+        return _unicode.encode('utf8')
 
     def validate_dict(self, data):
         """validates data in dict"""
-        for _key, _val in data.items():
-            if _val:
-                _fixed = self.validate(_val, [_key])
-                if self.error_count != 0:
-                    data[_key] = _fixed
+        for k, v in data.items():
+            if v:
+                ok, u = safe_utf8_decode(v)
+                if not ok:
+                    self.show_error(k, v)
+                    data[k] = u.encode('utf8')
         return data
 
-
-def fixenc_error_handler(exc):
-    """when error occurs in decoding, replaces char causing it, logs errors
-    together with column name containing invalid data"""
-    global FIXENC_DATA
-    if not FIXENC_DATA:
-        raise exc
-    # process only UnicodeDecodeError
-    if not isinstance(exc, UnicodeDecodeError):
-        raise exc
-    # find starting position of line with error and log warning
-    _line_start = exc.object.rfind('\n', 0, exc.start) + 1
-    try:
-        _col = FIXENC_DATA.columns[exc.object.count('\t', _line_start, exc.start)]
-    except Exception, e:
-        FIXENC_DATA.log.warning('Error when detecting column: %s' % e)
-        _col = '<unknown>'
-    _msg = "replacing invalid %s sequence %r in column %s"%\
-           (FIXENC_DATA.encoding, exc.object[exc.start:exc.end], _col)
-    FIXENC_DATA.log.warning(_msg)
-    # increase error count
-    FIXENC_DATA.error_count += 1
-    # return replacement char and position to continue from
-    # NB! doesn't replace multiple symbols, so it's harder to break file
-    # structure like replace \t or \n
-    return FIXENC_DATA.replacement, exc.start + 1
-
-
-codecs.register_error("fixenc_error_handler", fixenc_error_handler)
-
-
 #------------------------------------------------------------------------------
 # DISPATCHER
 #------------------------------------------------------------------------------
@@ -925,7 +876,7 @@ class Dispatcher(BaseHandler):
 
         if self.encoding_validator:
             def _write_hook(obj, data):
-                return self.encoding_validator.validate(data, _src_cols)
+                return self.encoding_validator.validate_copy(data, _src_cols)
 
         return skytools.full_copy(tablename, src_curs, dst_curs,
                                   _src_cols, condition,
                                   self.table_name, _dst_cols,
diff --git a/python/skytools/__init__.py b/python/skytools/__init__.py
index ba10f47b..837a57e2 100644
--- a/python/skytools/__init__.py
+++ b/python/skytools/__init__.py
@@ -104,6 +104,8 @@ _symbols = {
     'run_lookup': 'skytools.querybuilder:run_lookup',
     'run_query': 'skytools.querybuilder:run_query',
     'run_query_row': 'skytools.querybuilder:run_query_row',
+    # skytools.utf8
+    'safe_utf8_decode': 'skytools.utf8:safe_utf8_decode',
 }
 
 __all__ = _symbols.keys()
diff --git a/python/skytools/utf8.py b/python/skytools/utf8.py
new file mode 100644
index 00000000..e25888b6
--- /dev/null
+++ b/python/skytools/utf8.py
@@ -0,0 +1,85 @@
+r"""UTF-8 sanitizer.
+
+Python's UTF-8 parser is quite relaxed, this creates problems when
+talking with other software that uses stricter parsers.
+
+>>> safe_utf8_decode("foobar")
+(True, u'foobar')
+>>> safe_utf8_decode('X\xed\xa0\x80Y\xed\xb0\x89Z')
+(False, u'X\ufffdY\ufffdZ')
+>>> safe_utf8_decode('X\xed\xa0\x80\xed\xb0\x89Z')
+(False, u'X\U00010009Z')
+>>> safe_utf8_decode('X\0Z')
+(False, u'X\ufffdZ')
+>>> safe_utf8_decode('OK')
+(True, u'OK')
+"""
+
+import re
+
+__all__ = ['safe_utf8_decode']
+
+# by default, use same symbol as 'replace'
+REPLACEMENT_SYMBOL = unichr(0xFFFD)
+
+def _fix_utf8(m):
+    """Merge UTF16 surrogates, replace others"""
+    u = m.group()
+    if len(u) == 2:
+        # merge into single symbol
+        c1 = ord(u[0])
+        c2 = ord(u[1])
+        c = 0x10000 + ((c1 & 0x3FF) << 10) + (c2 & 0x3FF)
+        return unichr(c)
+    else:
+        # use replacement symbol
+        return REPLACEMENT_SYMBOL
+
+_urc = None
+
+def sanitize_unicode(u):
+    """Fix invalid symbols in unicode string."""
+    global _urc
+
+    assert isinstance(u, unicode)
+
+    # regex for finding invalid chars, works on unicode string
+    if not _urc:
+        rx = u"[\uD800-\uDBFF] [\uDC00-\uDFFF]? | [\0\uDC00-\uDFFF]"
+        _urc = re.compile(rx, re.X)
+
+    # now find and fix UTF16 surrogates
+    m = _urc.search(u)
+    if m:
+        u = _urc.sub(_fix_utf8, u)
+    return u
+
+def safe_utf8_decode(s):
+    """Decode UTF-8 safely.
+
+    Acts like str.decode('utf8', 'replace') but also fixes
+    UTF16 surrogates and NUL bytes, which Python's default
+    decoder does not do.
+
+    @param s: utf8-encoded byte string
+    @return: tuple of (was_valid_utf8, unicode_string)
+    """
+
+    # decode with error detection
+    ok = True
+    try:
+        # expect no errors by default
+        u = s.decode('utf8')
+    except UnicodeDecodeError:
+        u = s.decode('utf8', 'replace')
+        ok = False
+
+    u2 = sanitize_unicode(u)
+    if u is not u2:
+        ok = False
+    return (ok, u2)
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
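
For illustration, a minimal sketch of how the reworked EncodingValidator might be driven. The class, constructor, and method signatures come from the diff above; the logger name and sample data are made-up assumptions:

    import logging
    from londiste.handlers.dispatch import EncodingValidator

    log = logging.getLogger('dispatch')   # hypothetical logger, for illustration
    validator = EncodingValidator(log)

    # COPY path: one tab-separated row is checked as a whole; if it contains
    # invalid UTF-8, the offending columns are logged via show_error() and
    # sanitized, re-encoded bytes are returned.
    row = 'alice\t\xed\xa0\x80broken'     # made-up sample row
    safe_row = validator.validate_copy(row, ['username', 'note'])

    # Event path: a dict of column -> value is fixed in place and returned.
    ev = validator.validate_dict({'username': 'bob', 'note': 'X\0Z'})
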