""" Bulk loading into OLAP database. To use set in londiste.ini: handler_modules = londiste.handlers.bulk then add table with: londiste3 add-table xx --handler="bulk" or: londiste3 add-table xx --handler="bulk(method=X)" Methods: 0 (correct) - inserts as COPY into table, update as COPY into temp table and single UPDATE from there delete as COPY into temp table and single DELETE from there 1 (delete) - as 'correct', but do update as DELETE + COPY 2 (merged) - as 'delete', but merge insert rows with update rows Default is 0. """ import skytools from londiste.handler import BaseHandler, RowCache from skytools import quote_ident, quote_fqident __all__ = ['BulkLoader'] # BulkLoader load method METH_CORRECT = 0 METH_DELETE = 1 METH_MERGED = 2 DEFAULT_METHOD = METH_CORRECT # BulkLoader hacks AVOID_BIZGRES_BUG = 0 USE_LONGLIVED_TEMP_TABLES = True USE_REAL_TABLE = False class BulkEvent(object): """Helper class for BulkLoader to store relevant data.""" __slots__ = ('op', 'data', 'pk_data') def __init__(self, op, data, pk_data): self.op = op self.data = data self.pk_data = pk_data class BulkLoader(BaseHandler): """Bulk loading into OLAP database. Instead of statement-per-event, load all data with one big COPY, UPDATE or DELETE statement. Parameters: method=TYPE - method to use for copying [0..2] (default: 0) Methods: 0 (correct) - inserts as COPY into table, update as COPY into temp table and single UPDATE from there delete as COPY into temp table and single DELETE from there 1 (delete) - as 'correct', but do update as DELETE + COPY 2 (merged) - as 'delete', but merge insert rows with update rows """ handler_name = 'bulk' fake_seq = 0 def __init__(self, table_name, args, dest_table): """Init per-batch table data cache.""" BaseHandler.__init__(self, table_name, args, dest_table) self.pkey_list = None self.dist_fields = None self.col_list = None self.pkey_ev_map = {} self.method = int(args.get('method', DEFAULT_METHOD)) if not self.method in (0,1,2): raise Exception('unknown method: %s' % self.method) self.log.debug('bulk_init(%r), method=%d', args, self.method) def reset(self): self.pkey_ev_map = {} BaseHandler.reset(self) def finish_batch(self, batch_info, dst_curs): self.bulk_flush(dst_curs) def process_event(self, ev, sql_queue_func, arg): if len(ev.ev_type) < 2 or ev.ev_type[1] != ':': raise Exception('Unsupported event type: %s/extra1=%s/data=%s' % ( ev.ev_type, ev.ev_extra1, ev.ev_data)) op = ev.ev_type[0] if op not in 'IUD': raise Exception('Unknown event type: '+ev.ev_type) # pkey_list = ev.ev_type[2:].split(',') data = skytools.db_urldecode(ev.ev_data) # get pkey value if self.pkey_list is None: #self.pkey_list = pkey_list self.pkey_list = ev.ev_type[2:].split(',') if len(self.pkey_list) > 0: pk_data = tuple(data[k] for k in self.pkey_list) elif op == 'I': # fake pkey, just to get them spread out pk_data = self.fake_seq self.fake_seq += 1 else: raise Exception('non-pk tables not supported: %s' % self.table_name) # get full column list, detect added columns if not self.col_list: self.col_list = data.keys() elif self.col_list != data.keys(): # ^ supposedly python guarantees same order in keys() self.col_list = data.keys() # keep all versions of row data ev = BulkEvent(op, data, pk_data) if ev.pk_data in self.pkey_ev_map: self.pkey_ev_map[ev.pk_data].append(ev) else: self.pkey_ev_map[ev.pk_data] = [ev] def prepare_data(self): """Got all data, prepare for insertion.""" del_list = [] ins_list = [] upd_list = [] for ev_list in self.pkey_ev_map.itervalues(): # rewrite list of I/U/D events to # 
    def prepare_data(self):
        """Got all data, prepare for insertion."""
        del_list = []
        ins_list = []
        upd_list = []
        for ev_list in self.pkey_ev_map.itervalues():
            # rewrite list of I/U/D events to
            # optional DELETE and optional INSERT/COPY command
            exists_before = -1
            exists_after = 1
            for ev in ev_list:
                if ev.op == "I":
                    if exists_before < 0:
                        exists_before = 0
                    exists_after = 1
                elif ev.op == "U":
                    if exists_before < 0:
                        exists_before = 1
                    # exists_after stays 1, no need to set it again
                elif ev.op == "D":
                    if exists_before < 0:
                        exists_before = 1
                    exists_after = 0
                else:
                    raise Exception('unknown event type: %s' % ev.op)

            # skip short-lived rows
            if exists_before == 0 and exists_after == 0:
                continue

            # take last event
            ev = ev_list[-1]

            # generate needed commands
            if exists_before and exists_after:
                upd_list.append(ev.data)
            elif exists_before:
                del_list.append(ev.data)
            elif exists_after:
                ins_list.append(ev.data)
        return ins_list, upd_list, del_list

    def bulk_flush(self, curs):
        ins_list, upd_list, del_list = self.prepare_data()

        # reorder cols, put pks first
        col_list = self.pkey_list[:]
        for k in self.col_list:
            if k not in self.pkey_list:
                col_list.append(k)

        real_update_count = len(upd_list)

        self.log.debug("bulk_flush: %s (I/U/D = %d/%d/%d)", self.table_name,
                       len(ins_list), len(upd_list), len(del_list))

        # merged method: handle insert rows together with update rows
        if self.method == METH_MERGED:
            upd_list += ins_list
            ins_list = []

        # fetch distribution fields
        if self.dist_fields is None:
            self.dist_fields = self.find_dist_fields(curs)

        key_fields = self.pkey_list[:]
        for fld in self.dist_fields:
            if fld not in key_fields:
                key_fields.append(fld)
        self.log.debug("PKey fields: %s Dist fields: %s",
                       ",".join(self.pkey_list), ",".join(self.dist_fields))

        # create temp table
        temp, qtemp = self.create_temp_table(curs)
        tbl = self.dest_table
        qtbl = self.fq_dest_table

        # where expr must have pkey and dist fields
        klist = []
        for pk in key_fields:
            exp = "%s.%s = %s.%s" % (qtbl, quote_ident(pk), qtemp, quote_ident(pk))
            klist.append(exp)
        whe_expr = " and ".join(klist)

        # create del sql
        del_sql = "delete from only %s using %s where %s" % (qtbl, qtemp, whe_expr)

        # create update sql
        slist = []
        for col in col_list:
            if col not in key_fields:
                exp = "%s = %s.%s" % (quote_ident(col), qtemp, quote_ident(col))
                slist.append(exp)
        upd_sql = "update only %s set %s from %s where %s" % (
            qtbl, ", ".join(slist), qtemp, whe_expr)

        # avoid updates on pk-only table
        if not slist:
            upd_list = []

        # insert sql
        colstr = ",".join([quote_ident(c) for c in col_list])
        ins_sql = "insert into %s (%s) select %s from %s" % (
            qtbl, colstr, colstr, qtemp)
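
        # For illustration only (hypothetical table "public.sales" with pkey
        # "id", one data column "qty", no distribution fields, default
        # temp-table settings), the statements built above would look
        # roughly like:
        #
        #   delete from only public.sales using public_sales_loadertmp
        #       where public.sales.id = public_sales_loadertmp.id
        #   update only public.sales set qty = public_sales_loadertmp.qty
        #       from public_sales_loadertmp
        #       where public.sales.id = public_sales_loadertmp.id
        #   insert into public.sales (id,qty)
        #       select id,qty from public_sales_loadertmp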
        temp_used = False

        # process deleted rows
        if len(del_list) > 0:
            self.log.debug("bulk: Deleting %d rows from %s", len(del_list), tbl)
            # delete old rows from temp table
            q = "truncate %s" % qtemp
            self.log.debug('bulk: %s', q)
            curs.execute(q)
            # copy rows
            self.log.debug("bulk: COPY %d rows into %s", len(del_list), temp)
            skytools.magic_insert(curs, qtemp, del_list, col_list, quoted_table=1)
            # delete rows
            self.log.debug('bulk: %s', del_sql)
            curs.execute(del_sql)
            self.log.debug("bulk: %s - %d", curs.statusmessage, curs.rowcount)
            if len(del_list) != curs.rowcount:
                self.log.warning("Delete mismatch: expected=%s deleted=%d",
                                 len(del_list), curs.rowcount)
            temp_used = True

        # process updated rows
        if len(upd_list) > 0:
            self.log.debug("bulk: Updating %d rows in %s", len(upd_list), tbl)
            # delete old rows from temp table
            q = "truncate %s" % qtemp
            self.log.debug('bulk: %s', q)
            curs.execute(q)
            # copy rows
            self.log.debug("bulk: COPY %d rows into %s", len(upd_list), temp)
            skytools.magic_insert(curs, qtemp, upd_list, col_list, quoted_table=1)
            temp_used = True
            if self.method == METH_CORRECT:
                # update main table
                self.log.debug('bulk: %s', upd_sql)
                curs.execute(upd_sql)
                self.log.debug("bulk: %s - %d", curs.statusmessage, curs.rowcount)
                # check count
                if len(upd_list) != curs.rowcount:
                    self.log.warning("Update mismatch: expected=%s updated=%d",
                                     len(upd_list), curs.rowcount)
            else:
                # delete from main table
                self.log.debug('bulk: %s', del_sql)
                curs.execute(del_sql)
                self.log.debug('bulk: %s', curs.statusmessage)
                # check count
                if real_update_count != curs.rowcount:
                    self.log.warning("bulk: Update mismatch: expected=%s deleted=%d",
                                     real_update_count, curs.rowcount)
                # insert into main table
                if AVOID_BIZGRES_BUG:
                    # copy again, into main table
                    self.log.debug("bulk: COPY %d rows into %s", len(upd_list), tbl)
                    skytools.magic_insert(curs, qtbl, upd_list, col_list, quoted_table=1)
                else:
                    # better way, but does not work due to bizgres bug
                    self.log.debug('bulk: %s', ins_sql)
                    curs.execute(ins_sql)
                    self.log.debug('bulk: %s', curs.statusmessage)

        # process new rows
        if len(ins_list) > 0:
            self.log.debug("bulk: Inserting %d rows into %s", len(ins_list), tbl)
            self.log.debug("bulk: COPY %d rows into %s", len(ins_list), tbl)
            skytools.magic_insert(curs, qtbl, ins_list, col_list, quoted_table=1)

        # clean up temp table
        if temp_used:
            if USE_LONGLIVED_TEMP_TABLES or USE_REAL_TABLE:
                q = "truncate %s" % qtemp
            else:
                # avoid problems with long-lived temp tables
                q = "drop table %s" % qtemp
            self.log.debug('bulk: %s', q)
            curs.execute(q)

        self.reset()

    def create_temp_table(self, curs):
        if USE_REAL_TABLE:
            tempname = self.dest_table + "_loadertmpx"
        else:
            # create temp table for loading
            tempname = self.dest_table.replace('.', '_') + "_loadertmp"

        # check if exists
        if USE_REAL_TABLE:
            if skytools.exists_table(curs, tempname):
                self.log.debug("bulk: Using existing real table %s", tempname)
                return tempname, quote_fqident(tempname)

            # create non-temp table
            q = "create table %s (like %s)" % (
                quote_fqident(tempname), quote_fqident(self.dest_table))
            self.log.debug("bulk: Creating real table: %s", q)
            curs.execute(q)
            return tempname, quote_fqident(tempname)
        elif USE_LONGLIVED_TEMP_TABLES:
            if skytools.exists_temp_table(curs, tempname):
                self.log.debug("bulk: Using existing temp table %s", tempname)
                return tempname, quote_ident(tempname)

        # bizgres crashes on delete rows
        # removed: arg = "on commit delete rows"
        arg = "on commit preserve rows"
        # create temp table for loading
        q = "create temp table %s (like %s) %s" % (
            quote_ident(tempname), quote_fqident(self.dest_table), arg)
        self.log.debug("bulk: Creating temp table: %s", q)
        curs.execute(q)
        return tempname, quote_ident(tempname)

    def find_dist_fields(self, curs):
        if not skytools.exists_table(curs, "pg_catalog.gp_distribution_policy"):
            return []
        schema, name = skytools.fq_name_parts(self.dest_table)
        q = "select a.attname"\
            "  from pg_class t, pg_namespace n, pg_attribute a,"\
            "       gp_distribution_policy p"\
            " where n.oid = t.relnamespace"\
            "   and p.localoid = t.oid"\
            "   and a.attrelid = t.oid"\
            "   and a.attnum = any(p.attrnums)"\
            "   and n.nspname = %s and t.relname = %s"
        curs.execute(q, [schema, name])
        res = []
        for row in curs.fetchall():
            res.append(row[0])
        return res


# register handler class
__londiste_handlers__ = [BulkLoader]
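
# Note: find_dist_fields() above is only meaningful on Greenplum/Bizgres-style
# servers that provide pg_catalog.gp_distribution_policy; on plain PostgreSQL
# it returns an empty list, so the UPDATE/DELETE join condition is built from
# the primary key columns alone.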