author     Marko Kreen    2011-06-28 12:32:28 +0000
committer  Marko Kreen    2011-06-29 07:10:58 +0000
commit     336e17cca156d8ccccddb43211c475d40a60eabb (patch)
tree       eda8d2390731a9040976334ebb6ada529ccbc187
parent     5f1e4b5ea594d32fdcd4c3d10d6d92ebc5f28b0f (diff)
Simplify utf8 sanitizer and move under skytools/
This patch moves the sanitizer under skytools.utf8 and makes it generally usable, as well as easier to test.
-rw-r--r--   python/londiste/handlers/dispatch.py   101
-rw-r--r--   python/skytools/__init__.py              2
-rw-r--r--   python/skytools/utf8.py                 85
3 files changed, 113 insertions, 75 deletions
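
Before the diff itself, a minimal usage sketch of the relocated helper, assuming the patch below is applied (Python 2, as in the rest of the tree; safe_utf8_decode is exposed through the lazy symbol table in skytools/__init__.py):

    # minimal sketch, not part of the patch: calling the relocated sanitizer
    import skytools

    ok, u = skytools.safe_utf8_decode('X\xed\xa0\x80Y')   # lone UTF16 surrogate inside
    if not ok:
        # input was not clean UTF-8; u holds the repaired unicode string
        data = u.encode('utf8')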
diff --git a/python/londiste/handlers/dispatch.py b/python/londiste/handlers/dispatch.py
index f861af72..b2663448 100644
--- a/python/londiste/handlers/dispatch.py
+++ b/python/londiste/handlers/dispatch.py
@@ -149,6 +149,7 @@ import skytools
from londiste.handler import BaseHandler
from skytools import quote_ident, quote_fqident, UsageError
from skytools.dbstruct import *
+from skytools.utf8 import safe_utf8_decode
__all__ = ['Dispatcher']
@@ -576,96 +577,46 @@ ROW_HANDLERS = {'plain': RowHandler,
# ENCODING VALIDATOR
#------------------------------------------------------------------------------
-# stores current EncodingValidator
-FIXENC_DATA = None
-
-# find UTF16 surrogate pairs
-_sgrc = re.compile(u"""
- [\uD800-\uDBFF] [\uDC00-\uDFFF] ?
- | [\uDC00-\uDFFF]
- """, re.X)
-
class EncodingValidator:
def __init__(self, log, encoding = 'utf-8', replacement = u'\ufffd'):
"""validates the correctness of given encoding. when data contains
illegal symbols, replaces them with <replacement> and logs the
incident"""
self.log = log
- self.encoding = encoding
- self.replacement = replacement
self.columns = None
self.error_count = 0
- def validate(self, data, columns):
- """sets self to global FIXENC_DATA object and calls decode with
- registered error handler"""
- global FIXENC_DATA
- FIXENC_DATA = self
- self.columns = columns
- self.error_count = 0
- _unicode = data.decode(self.encoding, "fixenc_error_handler")
- # python does not tag surrogate pairs as error, fix them explicitly
- _unicode = _sgrc.sub(self.sgfix, _unicode)
- # when no erros then return input data as is, else re-encode fixed data
- if self.error_count == 0:
+ def show_error(self, col, val):
+ self.log.error('Invalid UTF8 in column <%s>: %s', col, repr(val))
+
+ def validate_copy(self, data, columns):
+ """Validate tab-separated fields"""
+
+ ok, _unicode = safe_utf8_decode(data)
+ if ok:
return data
- else:
- return _unicode.encode(self.encoding)
-
- def sgfix(self, m):
- """Fix UTF16 surrogate pair"""
- self.error_count += 1
- val = m.group()
- if len(val) == 2:
- self.log.warning('combining utf16 surrogate pair')
- c1 = ord(val[0])
- c2 = ord(val[1])
- c = 0x10000 + ((c1 & 0x3FF) << 10) + (c2 & 0x3FF)
- return unichr(c)
- else:
- self.log.warning('replacing utf16 surrogate code')
- return self.replacement
+
+ # log error
+ vals = data.split('\t')
+ for i, v in enumerate(vals):
+ ok, tmp = safe_utf8_decode(v)
+ if not ok:
+ self.show_error(columns[i], v)
+
+ # return safe data
+ return _unicode.encode('utf8')
def validate_dict(self, data):
"""validates data in dict"""
- for _key, _val in data.items():
- if _val:
- _fixed = self.validate(_val, [_key])
- if self.error_count != 0:
- data[_key] = _fixed
+ for k, v in data.items():
+ if v:
+ ok, u = safe_utf8_decode(v)
+ if not ok:
+ self.show_error(k, v)
+ data[k] = u.encode('utf8')
return data
-def fixenc_error_handler(exc):
- """when error occurs in decoding, replaces char causing it, logs errors
- together with column name containing invalid data"""
- global FIXENC_DATA
- if not FIXENC_DATA:
- raise exc
- # process only UnicodeDecodeError
- if not isinstance(exc, UnicodeDecodeError):
- raise exc
- # find starting position of line with error and log warning
- _line_start = exc.object.rfind('\n', 0, exc.start) + 1
- try:
- _col = FIXENC_DATA.columns[exc.object.count('\t', _line_start, exc.start)]
- except Exception, e:
- FIXENC_DATA.log.warning('Error when detecting column: %s' % e)
- _col = '<unknown>'
- _msg = "replacing invalid %s sequence %r in column %s"%\
- (FIXENC_DATA.encoding, exc.object[exc.start:exc.end], _col)
- FIXENC_DATA.log.warning(_msg)
- # increase error count
- FIXENC_DATA.error_count += 1
- # return replacement char and position to continue from
- # NB! doesn't replace multiple symbols, so it's harder to break file
- # structure like replace \t or \n
- return FIXENC_DATA.replacement, exc.start + 1
-
-
-codecs.register_error("fixenc_error_handler", fixenc_error_handler)
-
-
#------------------------------------------------------------------------------
# DISPATCHER
#------------------------------------------------------------------------------
@@ -925,7 +876,7 @@ class Dispatcher(BaseHandler):
if self.encoding_validator:
def _write_hook(obj, data):
- return self.encoding_validator.validate(data, _src_cols)
+ return self.encoding_validator.validate_copy(data, _src_cols)
return skytools.full_copy(tablename, src_curs, dst_curs, _src_cols,
condition, self.table_name, _dst_cols,
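
For reference, a rough sketch of what the new validate_copy hook does with a single COPY line (hypothetical column names and logger, not part of the patch):

    # rough sketch, assumes the dispatch.py changes above; names are made up
    import logging
    from londiste.handlers.dispatch import EncodingValidator

    log = logging.getLogger('dispatch')
    v = EncodingValidator(log)
    line = 'foo\t\xed\xa0\x80bar'                 # 2 tab-separated fields, 2nd one broken
    fixed = v.validate_copy(line, ['id', 'name'])
    # logs "Invalid UTF8 in column <name>: ..." and returns re-encoded, repaired bytes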
diff --git a/python/skytools/__init__.py b/python/skytools/__init__.py
index ba10f47b..837a57e2 100644
--- a/python/skytools/__init__.py
+++ b/python/skytools/__init__.py
@@ -104,6 +104,8 @@ _symbols = {
'run_lookup': 'skytools.querybuilder:run_lookup',
'run_query': 'skytools.querybuilder:run_query',
'run_query_row': 'skytools.querybuilder:run_query_row',
+ # skytools.utf8
+ 'safe_utf8_decode': 'skytools.utf8:safe_utf8_decode',
}
__all__ = _symbols.keys()
diff --git a/python/skytools/utf8.py b/python/skytools/utf8.py
new file mode 100644
index 00000000..e25888b6
--- /dev/null
+++ b/python/skytools/utf8.py
@@ -0,0 +1,85 @@
+r"""UTF-8 sanitizer.
+
+Python's UTF-8 parser is quite relaxed, this creates problems when
+talking with other software that uses stricter parsers.
+
+>>> safe_utf8_decode("foobar")
+(True, u'foobar')
+>>> safe_utf8_decode('X\xed\xa0\x80Y\xed\xb0\x89Z')
+(False, u'X\ufffdY\ufffdZ')
+>>> safe_utf8_decode('X\xed\xa0\x80\xed\xb0\x89Z')
+(False, u'X\U00010009Z')
+>>> safe_utf8_decode('X\0Z')
+(False, u'X\ufffdZ')
+>>> safe_utf8_decode('OK')
+(True, u'OK')
+"""
+
+import re
+
+__all__ = ['safe_utf8_decode']
+
+# by default, use same symbol as 'replace'
+REPLACEMENT_SYMBOL = unichr(0xFFFD)
+
+def _fix_utf8(m):
+ """Merge UTF16 surrogates, replace others"""
+ u = m.group()
+ if len(u) == 2:
+ # merge into single symbol
+ c1 = ord(u[0])
+ c2 = ord(u[1])
+ c = 0x10000 + ((c1 & 0x3FF) << 10) + (c2 & 0x3FF)
+ return unichr(c)
+ else:
+ # use replacement symbol
+ return REPLACEMENT_SYMBOL
+
+_urc = None
+
+def sanitize_unicode(u):
+ """Fix invalid symbols in unicode string."""
+ global _urc
+
+ assert isinstance(u, unicode)
+
+ # regex for finding invalid chars, works on unicode string
+ if not _urc:
+ rx = u"[\uD800-\uDBFF] [\uDC00-\uDFFF]? | [\0\uDC00-\uDFFF]"
+ _urc = re.compile(rx, re.X)
+
+ # now find and fix UTF16 surrogates
+ m = _urc.search(u)
+ if m:
+ u = _urc.sub(_fix_utf8, u)
+ return u
+
+def safe_utf8_decode(s):
+ """Decode UTF-8 safely.
+
+ Acts like str.decode('utf8', 'replace') but also fixes
+ UTF16 surrogates and NUL bytes, which Python's default
+ decoder does not do.
+
+ @param s: utf8-encoded byte string
+ @return: tuple of (was_valid_utf8, unicode_string)
+ """
+
+ # decode with error detection
+ ok = True
+ try:
+ # expect no errors by default
+ u = s.decode('utf8')
+ except UnicodeDecodeError:
+ u = s.decode('utf8', 'replace')
+ ok = False
+
+ u2 = sanitize_unicode(u)
+ if u is not u2:
+ ok = False
+ return (ok, u2)
+
+if __name__ == '__main__':
+ import doctest
+ doctest.testmod()
+
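
And a quick sanity check of the surrogate-merge arithmetic in _fix_utf8, using the values from the doctest above (a sketch, not part of the patch):

    # surrogate pair decoded from '\xed\xa0\x80\xed\xb0\x89': U+D800, U+DC09
    c1, c2 = 0xD800, 0xDC09
    c = 0x10000 + ((c1 & 0x3FF) << 10) + (c2 & 0x3FF)
    assert c == 0x10009                           # matches u'\U00010009' in the doctest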