diff options
author | Egon Valdmees | 2011-05-05 12:37:54 +0000 |
---|---|---|
committer | Marko Kreen | 2011-05-11 09:39:40 +0000 |
commit | ea4ee34522e75908f11f513d8ed040c6580b97fb (patch) | |
tree | 528ce7d743ff588b0151cee48b10ed59d56ab66f | |
parent | 8abaf61c48f72e573d076a585d334d059f71fa14 (diff) |
OCM-2279: londiste3 should translate everything to utf8, in a lossy way
where appropriate
* added argument 'encoding' to dispatcher handler
* tests for invalid utf8 sequences
* support for renamed table copy in dispatcher handler

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | python/londiste/bublin.py | 7 |
| -rw-r--r-- | python/londiste/handler.py | 11 |
| -rw-r--r-- | python/londiste/handlers/dispatch.py | 154 |
| -rw-r--r-- | python/londiste/table_copy.py | 4 |
| -rw-r--r-- | python/skytools/sqltools.py | 28 |
| -rwxr-xr-x | tests/handler/init.sh | 13 |
| -rwxr-xr-x | tests/handler/regen.sh | 12 |
| -rwxr-xr-x | tests/noqueue_merge/regen.sh | 13 |

8 files changed, 188 insertions, 54 deletions
diff --git a/python/londiste/bublin.py b/python/londiste/bublin.py index 05ffc80e..78e2ae5c 100644 --- a/python/londiste/bublin.py +++ b/python/londiste/bublin.py @@ -48,16 +48,17 @@ class Bublin(BaseHandler): return BaseHandler.process_event(self, ev, sql_queue_func, arg) - def prepare_copy(self, expr_list, dst_curs): + def real_copy(self, tablename, src_curs, dst_curs, column_list, cond_list): """Copy only slots needed locally.""" self.load_bubbles(dst_curs) slist = self.bubbles_local_slots.keys() fn = 'hashtext(%s)' % skytools.quote_ident(self.key) w = "(((%s) & %d) in (%s))" % (fn, self.bubbles_max_slot, slist) - expr_list.append(w) + cond_list.append(w) - BaseHandler.prepare_copy(self, expr_list, dst_curs) + return BaseHandler.real_copy(self, tablename, src_curs, dst_curs, + column_list, cond_list) def load_bubbles(self, curs): """Load slot info from database.""" diff --git a/python/londiste/handler.py b/python/londiste/handler.py index 7477675f..a0c72b07 100644 --- a/python/londiste/handler.py +++ b/python/londiste/handler.py @@ -99,12 +99,13 @@ class BaseHandler: """Called when batch finishes.""" pass - def prepare_copy(self, expr_list, dst_curs): - """Can change COPY behaviour. - - Returns new expr. 
+ def real_copy(self, tablename, src_curs, dst_curs, column_list, cond_list): + """do actual table copy and return tuple with number of bytes and rows + copyed """ - pass + condition = ' and '.join(cond_list) + return skytools.full_copy(tablename, src_curs, dst_curs, column_list, + condition) class TableHandler(BaseHandler): """Default Londiste handler, inserts events into tables with plain SQL.""" diff --git a/python/londiste/handlers/dispatch.py b/python/londiste/handlers/dispatch.py index cdb6086c..72704ac2 100644 --- a/python/londiste/handlers/dispatch.py +++ b/python/londiste/handlers/dispatch.py @@ -34,6 +34,7 @@ * bulk_yearly_batch * bulk_yearly_field * bulk_yearly_time +* bulk_direct - functionally identical to bulk == HANDLER ARGUMENTS == @@ -130,6 +131,10 @@ post_part: sql statement(s) to execute after creating partition table. Usable variables are the same as in part_template +encoding: + name of destination encoding. handler replaces all invalid encoding symbols + and logs them as warnings + NB! londiste3 does not currently support table renaming and field mapping when creating or coping initial data to destination table. --expect-sync and --skip-truncate should be used and --create switch is to be avoided. @@ -138,6 +143,7 @@ creating or coping initial data to destination table. 
--expect-sync and import sys import datetime import new +import codecs import skytools from londiste.handler import BaseHandler from skytools import quote_ident, quote_fqident, UsageError @@ -172,9 +178,10 @@ PART_FUNC_CALL = 'select %s(%s)' % (PART_FUNC, ', '.join('%%(%s)s' % arg for arg in PART_FUNC_ARGS)) -#---------------------------------------- + +#------------------------------------------------------------------------------ # LOADERS -#---------------------------------------- +#------------------------------------------------------------------------------ class BaseLoader: @@ -449,7 +456,7 @@ class BulkLoader(BaseBulkTempLoader): else: # fscking problems with long-lived temp tables self.drop(curs) - + def create_temp(self, curs): """ check if temp table exists. Returns False if using existing temp table and True if creating new @@ -463,7 +470,7 @@ class BulkLoader(BaseBulkTempLoader): return True def bulk_insert(self, curs, data, table = None): - """Copy data to table. If table not provided, use temp table. + """Copy data to table. If table not provided, use temp table. When re-using existing temp table, it is always truncated first and analyzed after copy. 
""" @@ -472,10 +479,10 @@ class BulkLoader(BaseBulkTempLoader): _use_temp = table is None # if table not specified use temp if _use_temp: - table = self.temp + table = self.temp # truncate when re-using existing table if not self.create_temp(curs): - self.truncate(curs) + self.truncate(curs) self.log.debug("bulk: COPY %d rows into %s" % (len(data), table)) skytools.magic_insert(curs, table, data, self.fields, quoted_table = True) @@ -505,9 +512,10 @@ class BulkLoader(BaseBulkTempLoader): LOADERS = {'direct': DirectLoader, 'bulk': BulkLoader} -#---------------------------------------- + +#------------------------------------------------------------------------------ # ROW HANDLERS -#---------------------------------------- +#------------------------------------------------------------------------------ class RowHandler: @@ -562,16 +570,61 @@ ROW_HANDLERS = {'plain': RowHandler, 'keep_latest': KeepLatestRowHandler} -#---------------------------------------- -# DISPATCHER -#---------------------------------------- -class AttrDict(dict): - """Dict with values accessible with attributes""" - def __getattr__(self, name): - return self[name] - def __setattr__(self, name, value): - self[name] = value +#------------------------------------------------------------------------------ +# ENCODING VALIDATOR +#------------------------------------------------------------------------------ + + +class EncodingValidator: + def __init__(self, log, encoding = 'utf-8', replacement = u'\ufffd'): + self.log = log + self.encoding = encoding + self.replacement = replacement + self.columns = None + self.error_count = 0 + codecs.register_error("error_handler", self._error_handler) + + def _error_handler(self, exc): + # process only UnicodeDecodeError + if not isinstance(exc, UnicodeDecodeError): + raise exc + # find starting position of line with error and log warning + _line_start = exc.object.rfind('\n', 0, exc.start) + 1 + _col = self.columns[exc.object.count('\t', _line_start, exc.start)] 
+ _msg = "replacing invalid %s sequence %r in column %s"%\ + (self.encoding, exc.object[exc.start:exc.end], _col) + self.log.warning(_msg) + # increase error count + self.error_count += 1 + # return replacement char and position to continue from + # NB! doesn't replace multiple symbols, so it's harder to break file + # structure like replace \t or \n + return self.replacement, exc.start + 1 + + def validate(self, data, columns): + self.columns = columns + self.error_count = 0 + _unicode = data.decode(self.encoding, "error_handler") + # when no erros then return input data as is, else re-encode fixed data + if self.error_count == 0: + return data + else: + return _unicode.encode(self.encoding) + + def validate_dict(self, data): + _cols, _vals = zip(*data.items()) + _fixed = self.validate('\t'.join(_vals), _cols) + if self.error_count == 0: + return data + else: + return dict(zip(_cols, _fixed.split('\t'))) + + + +#------------------------------------------------------------------------------ +# DISPATCHER +#------------------------------------------------------------------------------ class Dispatcher(BaseHandler): @@ -596,10 +649,15 @@ class Dispatcher(BaseHandler): self.conf = self.get_config() hdlr_cls = ROW_HANDLERS[self.conf.row_mode] self.row_handler = hdlr_cls(self.log) + if self.conf.encoding: + self.encoding_validator = EncodingValidator(self.log, + self.conf.encoding) + else: + self.encoding_validator = None def get_config(self): """Processes args dict""" - conf = AttrDict() + conf = skytools.dbdict() # set table mode conf.table_mode = self.get_arg('table_mode', TABLE_MODES) if conf.table_mode == 'part': @@ -641,6 +699,8 @@ class Dispatcher(BaseHandler): conf.field_map[tmp[0]] = tmp[0] else: conf.field_map[tmp[0]] = tmp[1] + # encoding validator + conf.encoding = self.args.get('encoding') return conf def get_arg(self, name, value_list, default = None): @@ -718,6 +778,8 @@ class Dispatcher(BaseHandler): if dst not in self.row_handler.table_map: 
self.row_handler.add_table(dst, LOADERS[self.conf.load_mode], self.pkeys, self.conf) + if self.encoding_validator: + data = self.encoding_validator.validate_dict(data) self.row_handler.process(dst, op, data) #BaseHandler.process_event(self, ev, sql_queue_func, arg) @@ -800,11 +862,47 @@ class Dispatcher(BaseHandler): exec_with_vals(self.conf.post_part) self.log.info("Created table: %s" % dst) + def real_copy(self, tablename, src_curs, dst_curs, column_list, cond_list): + """do actual table copy and return tuple with number of bytes and rows + copyed + """ + _src_cols = _dst_cols = column_list + _write_hook = None + condition = ' and '.join(cond_list) + + if self.conf.skip_fields: + _src_cols = [col for col in column_list + if col not in self.conf.skip_fields] + _dst_cols = _src_cols + if self.conf.field_map: + _src_cols = [col for col in _src_cols if col in self.conf.field_map] + _dst_cols = [self.conf.field_map[col] for col in _src_cols] + + if self.encoding_validator: + def _write_hook(obj, data): + return self.encoding_validator.validate(data, _src_cols) + + return skytools.full_copy(tablename, src_curs, dst_curs, _src_cols, + condition, self.table_name, _dst_cols, + write_hook = _write_hook) + + + +#------------------------------------------------------------------------------ # register handler class +#------------------------------------------------------------------------------ + + __londiste_handlers__ = [Dispatcher] + + +#------------------------------------------------------------------------------ # helper function for creating dispachers with different default values +#------------------------------------------------------------------------------ + + def handler(name): def wrapper(func): def _init_override(self, table_name, args, log): @@ -818,12 +916,20 @@ def handler(name): return func return wrapper -def dupd(*p): + +def update(*p): """ Update dicts given in params with its precessor param dict in reverse order """ return reduce(lambda x, y: 
x.update(y) or x, (p[i] for i in range(len(p)-1,-1,-1)), {}) + + +#------------------------------------------------------------------------------ +# build set of handlers with different default values for easier use +#------------------------------------------------------------------------------ + + LOAD = { '': { 'load_mode': 'direct' }, 'bulk': { 'load_mode': 'bulk' } } @@ -841,17 +947,19 @@ BASE = { 'table_mode': 'part', 'row_mode': 'keep_latest', } -# build set of handlers with different default values for easier use for load, load_dict in LOAD.items(): for period, period_dict in PERIOD.items(): for mode, mode_dict in MODE.items(): # define creator func to keep default dicts in separate context def create_handler(): handler_name = '_'.join(p for p in (load, period, mode) if p) - default = dupd(mode_dict, period_dict, load_dict, BASE) + default = update(mode_dict, period_dict, load_dict, BASE) @handler(handler_name) def handler_func(args): - return dupd(args, default) + return update(args, default) create_handler() -# TODO: bulk & ignore handlers + +@handler('bulk_direct') +def bulk_direct_handler(args): + return update(args, {'load_mode': 'bulk', 'table_mode': 'direct'}) diff --git a/python/londiste/table_copy.py b/python/londiste/table_copy.py index 5e10c889..447047e9 100644 --- a/python/londiste/table_copy.py +++ b/python/londiste/table_copy.py @@ -205,9 +205,7 @@ class CopyTable(Replicator): cond = tbl_stat.table_attrs.get('copy_condition') if cond: cond_list.append(cond) - p.prepare_copy(cond_list, dstcurs) - w_cond = ' and '.join(cond_list) - stats = skytools.full_copy(tablename, srccurs, dstcurs, col_list, w_cond) + stats = p.real_copy(tablename, srccurs, dstcurs, col_list, cond_list) if stats: self.log.info("%s: copy finished: %d bytes, %d rows" % ( tablename, stats[0], stats[1])) diff --git a/python/skytools/sqltools.py b/python/skytools/sqltools.py index b8a179be..97043231 100644 --- a/python/skytools/sqltools.py +++ b/python/skytools/sqltools.py @@ 
-331,18 +331,28 @@ def magic_insert(curs, tablename, data, fields = None, use_insert = 0, quoted_ta class CopyPipe(object): "Splits one big COPY to chunks." - def __init__(self, dstcurs, tablename = None, limit = 512*1024, cancel_func=None, sql_from = None): + def __init__(self, dstcurs, tablename = None, limit = 512*1024, + sql_from = None): self.tablename = tablename self.sql_from = sql_from self.dstcurs = dstcurs self.buf = StringIO() self.limit = limit - self.cancel_func = None + #hook for new data, hook func should return new data + #def write_hook(obj, data): + # return data + self.write_hook = None + #hook for flush, hook func result is discarded + # def flush_hook(obj): + # return None + self.flush_hook = None self.total_rows = 0 self.total_bytes = 0 def write(self, data): "New data from psycopg" + if self.write_hook: + data = self.write_hook(self, data) self.total_bytes += len(data) self.total_rows += data.count("\n") @@ -363,8 +373,8 @@ class CopyPipe(object): def flush(self): "Send data out." 
- if self.cancel_func: - self.cancel_func() + if self.flush_hook: + self.flush_hook(self) if self.buf.tell() <= 0: return @@ -377,8 +387,10 @@ class CopyPipe(object): self.buf.seek(0) self.buf.truncate() + def full_copy(tablename, src_curs, dst_curs, column_list = [], condition = None, - dst_tablename = None, dst_column_list = None): + dst_tablename = None, dst_column_list = None, + write_hook = None, flush_hook = None): """COPY table from one db to another.""" # default dst table and dst columns to source ones @@ -413,12 +425,16 @@ def full_copy(tablename, src_curs, dst_curs, column_list = [], condition = None, sql_to = "COPY %s TO stdout" % src sql_from = "COPY %s FROM stdin" % dst buf = CopyPipe(dst_curs, sql_from = sql_from) + buf.write_hook = write_hook + buf.flush_hook = flush_hook src_curs.copy_expert(sql_to, buf) else: if condition: # regular psycopg copy_to generates invalid sql for subselect copy raise Exception('copy_expert() is needed for conditional copy') buf = CopyPipe(dst_curs, dst) + buf.write_hook = write_hook + buf.flush_hook = flush_hook src_curs.copy_to(buf, src) buf.flush() @@ -601,7 +617,7 @@ def mk_delete_sql(row, tbl, pkey_list, field_map = None): col = skytools.quote_ident(new_k) val = skytools.quote_literal(row[k]) whe_list.append("%s = %s" % (col, val)) - whe_str = " and ".join(whe_list) + whe_str = " and ".join(whe_list) return "delete from only %s where %s;" % (skytools.quote_fqident(tbl), whe_str) if __name__ == '__main__': diff --git a/tests/handler/init.sh b/tests/handler/init.sh index a0eb185c..1ac0b9d4 100755 --- a/tests/handler/init.sh +++ b/tests/handler/init.sh @@ -2,14 +2,13 @@ . 
../env.sh -lst="hsrc hdst" - -for db in $lst; do +for db in hsrc hdst; do echo dropdb $db dropdb $db done -for db in $lst; do - echo createdb $db - createdb $db -done +echo createdb hsrc +createdb hsrc --encoding=sql_ascii --template=template0 + +echo createdb hdst +createdb hdst --encoding=utf-8 --template=template0 diff --git a/tests/handler/regen.sh b/tests/handler/regen.sh index a5ccb705..182c27c3 100755 --- a/tests/handler/regen.sh +++ b/tests/handler/regen.sh @@ -72,16 +72,19 @@ done msg "Create table on root node and fill couple of rows" run_sql hsrc "create table mytable (id int4 primary key, data text, tstamp timestamptz default now())" -for n in 1 2 3 4; do +for n in 1 2 3; do run_sql hsrc "insert into mytable values ($n, 'row$n')" done +msg "Insert row with encoding error" +run_sql hsrc "insert into mytable values(4, E'row\xab4')" + msg "Register table on root node" -run londiste3 $v conf/londiste_hsrc.ini add-table mytable --handler="bulk(method=$meth)" +run londiste3 $v conf/londiste_hsrc.ini add-table mytable msg "Register table on other node with creation" for db in hdst; do - run londiste3 $v conf/londiste_$db.ini add-table mytable --create-only=pkey --handler="bulk(method=$meth)" + run londiste3 $v conf/londiste_$db.ini add-table mytable --create --handler=bulk_direct --handler-arg="method=$meth" --handler-arg="encoding=utf8" done msg "Wait until table is in sync" @@ -106,6 +109,9 @@ run_sql hsrc "delete from mytable where id = 7" run_sql hsrc "delete from mytable where id = 1" run_sql hsrc "update mytable set data = 'row2x' where id = 2" +# row with error +run_sql hsrc "insert into mytable values(8, E'row8\xaf')" + run sleep 5 msg "Check status" diff --git a/tests/noqueue_merge/regen.sh b/tests/noqueue_merge/regen.sh index d67bc43c..93c47320 100755 --- a/tests/noqueue_merge/regen.sh +++ b/tests/noqueue_merge/regen.sh @@ -123,13 +123,18 @@ for n in 1 2 3 4; do run_sql part$n "insert into mydata values ($n, 'part$n')" done +msg "Sleep a bit" +run 
sleep 10 + msg "Create table and register it in full nodes" for db in $full_list; do job=l3_part1_q_${db} - run londiste3 $v conf/$job.ini add-table mydata --create - for src in $part_list; do - run londiste3 $v conf/l3_${src}_q_${db}.ini add-table mydata - done + run_sql $db "select * from londiste.table_info order by queue_name" + run londiste3 $v conf/$job.ini add-table mydata --create --merge-all + run_sql $db "select * from londiste.table_info order by queue_name" + #for src in $part_list; do + # run londiste3 $v conf/l3_${src}_q_${db}.ini add-table mydata + #done done msg "Sleep a bit" |