author     Egon Valdmees    2011-08-05 10:48:47 +0000
committer  Marko Kreen      2011-09-16 10:34:46 +0000
commit     c2f7264869cc2bc5ffdde272ea20f441b7d884e2 (patch)
tree       bcb3f918644e507da0a55ee18c6a0586badc755e
parent     9bc1a8a9615263a77e6b733a439dd9f8e40b48c8 (diff)
parallel copy process limit
added --max-parallel-copy londiste argument to specify the maximum number of parallel copy processes
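
For context, a minimal usage sketch of the new option (the config file and table names are placeholders); the value ends up stored as the table attribute max_parallel_copy:

    # limit this table to at most 4 concurrent copy processes (hypothetical names)
    londiste.py replica.ini add-table public.big_table --max-parallel-copy=4
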
-rwxr-xr-x  python/londiste.py                                     2
-rw-r--r--  python/londiste/playback.py                           41
-rw-r--r--  python/londiste/setup.py                              23
-rw-r--r--  python/londiste/table_copy.py                         16
-rw-r--r--  sql/londiste/functions/londiste.get_table_list.sql    26
-rw-r--r--  sql/londiste/functions/londiste.local_add_table.sql    9
-rw-r--r--  sql/londiste/structure/tables.sql                      2
-rw-r--r--  upgrade/src/londiste.table_info.sql                    2
8 files changed, 84 insertions(+), 37 deletions(-)
diff --git a/python/londiste.py b/python/londiste.py
index 778b9f78..0b938dbe 100755
--- a/python/londiste.py
+++ b/python/londiste.py
@@ -132,6 +132,8 @@ class Londiste(skytools.DBScript):
help="merge tables from all source queues", default=False)
g.add_option("--no-merge", action="store_true",
help="don't merge tables from source queues", default=False)
+ g.add_option("--max-parallel-copy", type = "int",
+ help="max number of parallel copy processes")
p.add_option_group(g)
diff --git a/python/londiste/playback.py b/python/londiste/playback.py
index d1d3dc27..03afe3e4 100644
--- a/python/londiste/playback.py
+++ b/python/londiste/playback.py
@@ -25,6 +25,8 @@ SYNC_OK = 0 # continue with batch
SYNC_LOOP = 1 # sleep, try again
SYNC_EXIT = 2 # nothing to do, exit skript
+MAX_PARALLEL_COPY = 8 # default number of allowed max parallel copy processes
+
class Counter(object):
"""Counts table statuses."""
@@ -51,6 +53,7 @@ class Counter(object):
elif t.state == TABLE_OK:
self.ok += 1
+
def get_copy_count(self):
return self.copy + self.catching_up + self.wanna_sync + self.do_sync
@@ -74,6 +77,10 @@ class TableState(object):
self.plugin = None
# except this
self.changed = 0
+ # position in parallel copy work order
+ self.copy_pos = 0
+ # max number of parallel copy processesses allowed
+ self.max_parallel_copy = MAX_PARALLEL_COPY
def forget(self):
"""Reset all info."""
@@ -87,6 +94,8 @@ class TableState(object):
self.table_attrs = {}
self.changed = 1
self.plugin = None
+ self.copy_pos = 0
+ self.max_parallel_copy = MAX_PARALLEL_COPY
def change_snapshot(self, str_snapshot, tag_changed = 1):
"""Set snapshot."""
@@ -175,10 +184,18 @@ class TableState(object):
if row['merge_state'] == "?":
self.changed = 1
+ self.copy_pos = int(row.get('copy_pos','0'))
+ self.max_parallel_copy = int(self.table_attrs.get('max_parallel_copy',
+ self.max_parallel_copy))
+
hstr = self.table_attrs.get('handlers', '') # compat
hstr = self.table_attrs.get('handler', hstr)
self.plugin = build_handler(self.name, hstr, self.log)
+ def max_parallel_copies_reached(self):
+ return self.max_parallel_copy and\
+ self.copy_pos >= self.max_parallel_copy
+
def interesting(self, ev, tick_id, copy_thread):
"""Check if table wants this event."""
@@ -210,7 +227,7 @@ class TableState(object):
def gc_snapshot(self, copy_thread, prev_tick, cur_tick, no_lag):
"""Remove attached snapshot if possible.
-
+
If the event processing is in current moment. the snapshot
is not needed beyond next batch.
@@ -318,7 +335,7 @@ class Replicator(CascadedWorker):
self.code_check_done = 1
self.sync_database_encodings(src_db, dst_db)
-
+
self.cur_tick = self.batch_info['tick_id']
self.prev_tick = self.batch_info['prev_tick_id']
@@ -367,7 +384,7 @@ or (ev_extra1 in (%s)))
def sync_tables(self, src_db, dst_db):
"""Table sync loop.
-
+
Calls appropriate handles, which is expected to
return one of SYNC_* constants."""
@@ -395,7 +412,7 @@ or (ev_extra1 in (%s)))
dst_db.commit()
self.load_table_state(dst_db.cursor())
dst_db.commit()
-
+
dsync_backup = None
def sync_from_main_thread(self, cnt, src_db, dst_db):
"Main thread sync logic."
@@ -403,7 +420,7 @@ or (ev_extra1 in (%s)))
# This operates on all table, any amount can be in any state
ret = SYNC_OK
-
+
if cnt.do_sync:
# wait for copy thread to catch up
ret = SYNC_LOOP
@@ -470,7 +487,7 @@ or (ev_extra1 in (%s)))
# there cannot be interesting events in current batch
# but maybe there's several tables, lets do them in one go
ret = SYNC_LOOP
-
+
return ret
@@ -506,7 +523,7 @@ or (ev_extra1 in (%s)))
elif t.state == TABLE_CATCHING_UP:
# partition merging
- if t.copy_role == 'wait-replay':
+ if t.copy_role in ('wait-replay', 'lead'):
return SYNC_LOOP
# is there more work?
@@ -566,14 +583,14 @@ or (ev_extra1 in (%s)))
if not t or not t.interesting(ev, self.cur_tick, self.copy_thread):
self.stat_increase('ignored_events')
return
-
+
try:
p = self.used_plugins[ev.extra1]
except KeyError:
p = t.get_plugin()
self.used_plugins[ev.extra1] = p
p.prepare_batch(self.batch_info, dst_curs)
-
+
p.process_event(ev, self.apply_sql, dst_curs)
def handle_truncate_event(self, ev, dst_curs):
@@ -675,7 +692,7 @@ or (ev_extra1 in (%s)))
def load_table_state(self, curs):
"""Load table state from database.
-
+
Todo: if all tables are OK, there is no need
to load state on every batch.
"""
@@ -823,7 +840,7 @@ or (ev_extra1 in (%s)))
q2 = "select londiste.restore_table_fkey(%(from_table)s, %(fkey_name)s)"
dst_curs.execute(q2, row)
dst_db.commit()
-
+
def drop_fkeys(self, dst_db, table_name):
"""Drop all foreign keys to and from this table.
@@ -839,7 +856,7 @@ or (ev_extra1 in (%s)))
q2 = "select londiste.drop_table_fkey(%(from_table)s, %(fkey_name)s)"
dst_curs.execute(q2, row)
dst_db.commit()
-
+
def process_root_node(self, dst_db):
"""On root node send seq changes to queue."""
diff --git a/python/londiste/setup.py b/python/londiste/setup.py
index 1bfd7039..6ec8ea60 100644
--- a/python/londiste/setup.py
+++ b/python/londiste/setup.py
@@ -69,6 +69,9 @@ class LondisteSetup(CascadeAdmin):
help="merge tables from all source queues", default=False)
p.add_option("--no-merge", action="store_true",
help="don't merge tables from source queues", default=False)
+ p.add_option("--max-parallel-copy", type = "int",
+ help="max number of parallel copy processes")
+
return p
def extra_init(self, node_type, node_db, provider_db):
@@ -183,19 +186,25 @@ class LondisteSetup(CascadeAdmin):
if self.options.expect_sync:
tgargs.append('expect_sync')
- # actual table registration
- q = "select * from londiste.local_add_table(%s, %s, %s)"
- self.exec_cmd(dst_curs, q, [self.set_name, tbl, tgargs])
-
if not self.options.expect_sync:
if self.options.skip_truncate:
attrs['skip_truncate'] = 1
if self.options.copy_condition:
attrs['copy_condition'] = self.options.copy_condition
+
+ if self.options.max_parallel_copy:
+ attrs['max_parallel_copy'] = self.options.max_parallel_copy
+
+ args = [self.set_name, tbl, tgargs]
+
if attrs:
- enc_attrs = skytools.db_urlencode(attrs)
- q = "select * from londiste.local_set_table_attrs(%s, %s, %s)"
- self.exec_cmd(dst_curs, q, [self.set_name, tbl, enc_attrs])
+ args.append(skytools.db_urlencode(attrs))
+
+ q = "select * from londiste.local_add_table(%s)" %\
+ ','.join(['%s']*len(args))
+
+ # actual table registration
+ self.exec_cmd(dst_curs, q, args)
dst_db.commit()
def handler_needs_table(self):
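
The attribute reaches the database as a urlencoded string; a sketch of the encoding step, assuming the skytools module used above (queue, table and cursor handling are left out, only the encoding is shown):

    import skytools

    # --max-parallel-copy=4 becomes a table attribute ...
    attrs = {'max_parallel_copy': 4}
    enc_attrs = skytools.db_urlencode(attrs)     # roughly 'max_parallel_copy=4'

    # ... which is passed as the new optional 4th argument:
    #   select * from londiste.local_add_table(queue, table, trigger_args, enc_attrs)
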
diff --git a/python/londiste/table_copy.py b/python/londiste/table_copy.py
index 7422b533..257dbe60 100644
--- a/python/londiste/table_copy.py
+++ b/python/londiste/table_copy.py
@@ -56,8 +56,14 @@ class CopyTable(Replicator):
src_curs = src_db.cursor()
dst_curs = dst_db.cursor()
- while tbl_stat.copy_role == 'wait-copy':
- self.log.info('waiting for first partition to initialize copy')
+ while 1:
+ if tbl_stat.copy_role == 'wait-copy':
+ self.log.info('waiting for first partition to initialize copy')
+ elif tbl_stat.max_parallel_copies_reached():
+ self.log.info('number of max parallel copies (%s) reached' %\
+ tbl_stat.max_parallel_copy)
+ else:
+ break
time.sleep(10)
tbl_stat = self.reload_table_stat(dst_curs, tbl_stat.name)
dst_db.commit()
@@ -70,7 +76,7 @@ class CopyTable(Replicator):
pt = pmap[tbl_stat.name]
if pt.state == TABLE_OK:
break
-
+
self.log.warning("table %s not in sync yet on provider, waiting" % tbl_stat.name)
time.sleep(10)
@@ -98,7 +104,7 @@ class CopyTable(Replicator):
# find dst struct
src_struct = TableStruct(src_curs, tbl_stat.name)
dst_struct = TableStruct(dst_curs, tbl_stat.name)
-
+
# take common columns, warn on missing ones
dlist = dst_struct.get_column_list()
slist = src_struct.get_column_list()
@@ -154,6 +160,8 @@ class CopyTable(Replicator):
dst_struct.create(dst_curs, objs, log = self.log)
elif cmode == 2:
dst_db.commit()
+ self.change_table_state(dst_db, tbl_stat, TABLE_CATCHING_UP)
+ # start waiting for other copy processes to finish
while tbl_stat.copy_role == 'lead':
self.log.info('waiting for other partitions to finish copy')
time.sleep(10)
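
The resulting throttle loop behaves roughly like this simplified model (reload_state is a placeholder for reload_table_stat(); the 10-second poll interval follows the patch):

    import time

    def wait_for_copy_slot(reload_state, poll_interval = 10):
        while 1:
            tbl = reload_state()
            if tbl.copy_role == 'wait-copy':
                pass            # first partition has not initialized the copy yet
            elif tbl.max_parallel_copies_reached():
                pass            # max number of parallel copy processes reached
            else:
                return tbl      # slot free, start copying
            time.sleep(poll_interval)
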
diff --git a/sql/londiste/functions/londiste.get_table_list.sql b/sql/londiste/functions/londiste.get_table_list.sql
index 2f526eeb..d101f861 100644
--- a/sql/londiste/functions/londiste.get_table_list.sql
+++ b/sql/londiste/functions/londiste.get_table_list.sql
@@ -7,8 +7,9 @@ create or replace function londiste.get_table_list(
out custom_snapshot text,
out table_attrs text,
out dropped_ddl text,
- out copy_role text)
-returns setof record as $$
+ out copy_role text,
+ out copy_pos int)
+returns setof record as $$
-- ----------------------------------------------------------------------
-- Function: londiste.get_table_list(1)
--
@@ -25,6 +26,7 @@ returns setof record as $$
-- table_attrs - urlencoded dict of table attributes
-- dropped_ddl - partition combining: temp place to put DDL
-- copy_role - partition combining: how to handle copy
+-- copy_pos - position in parallel copy working order
--
-- copy_role = lead:
-- on copy start, drop indexes and store in dropped_ddl
@@ -42,12 +44,15 @@ declare
n_parts int4;
n_done int4;
var_table_name text;
+ n_combined_queue text;
begin
- for var_table_name, local, merge_state, custom_snapshot, table_attrs, dropped_ddl, q_part1, n_parts, n_done in
+ for var_table_name, local, merge_state, custom_snapshot, table_attrs, dropped_ddl, q_part1, n_parts, n_done, n_combined_queue, copy_pos in
select t.table_name, t.local, t.merge_state, t.custom_snapshot, t.table_attrs, t.dropped_ddl,
- min(t2.queue_name) as _queue1,
- count(t2.table_name) as _total,
- count(nullif(t2.merge_state, 'in-copy')) as _done
+ min(case when t2.local then t2.queue_name else null end) as _queue1,
+ count(case when t2.local then t2.table_name else null end) as _total,
+ count(case when t2.local then nullif(t2.merge_state, 'in-copy') else null end) as _done,
+ min(n.combined_queue) as _combined_queue,
+ count(nullif(t2.queue_name < i_queue_name and t.merge_state = 'in-copy' and t2.merge_state = 'in-copy', false)) as _copy_pos
from londiste.table_info t
join pgq_node.node_info n on (n.queue_name = t.queue_name)
left join pgq_node.node_info n2 on (n2.combined_queue = n.combined_queue or
@@ -61,12 +66,13 @@ begin
-- if the table is in middle of copy from multiple partitions,
-- the copy processes need coordination
copy_role := null;
+
if q_part1 is not null then
if i_queue_name = q_part1 then
-- lead
- if merge_state = 'in-copy' then
+ if merge_state in ('in-copy', 'catching-up') then
-- show copy_role only if need to wait for others
- if n_done < n_parts - 1 then
+ if n_done < n_parts then
copy_role := 'lead';
end if;
end if;
@@ -93,8 +99,8 @@ begin
end if;
table_name:=var_table_name;
return next;
- end loop;
+ end loop;
return;
-end;
+end;
$$ language plpgsql strict stable;
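
A small illustration of the _copy_pos ordering computed above: a partition's position is the number of sibling partition queues with a smaller queue name that are also still 'in-copy' (queue names below are made up):

    # hypothetical partition queues, all currently 'in-copy'
    in_copy_queues = ['part_q1', 'part_q2', 'part_q3', 'part_q4']

    def copy_pos(queue_name, in_copy_queues):
        # mirrors the _copy_pos expression in the SQL above
        return sum(1 for q in in_copy_queues if q < queue_name)

    assert copy_pos('part_q1', in_copy_queues) == 0   # may start copying right away
    assert copy_pos('part_q3', in_copy_queues) == 2   # has to wait if the limit is 2
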
diff --git a/sql/londiste/functions/londiste.local_add_table.sql b/sql/londiste/functions/londiste.local_add_table.sql
index 3bfcfebf..78ffad6f 100644
--- a/sql/londiste/functions/londiste.local_add_table.sql
+++ b/sql/londiste/functions/londiste.local_add_table.sql
@@ -2,6 +2,7 @@ create or replace function londiste.local_add_table(
in i_queue_name text,
in i_table_name text,
in i_trg_args text[],
+ in i_table_attrs text default null,
out ret_code int4,
out ret_note text)
as $$
@@ -211,7 +212,8 @@ begin
update londiste.table_info
set local = true,
- merge_state = new_state
+ merge_state = new_state,
+ table_attrs = coalesce(i_table_attrs, table_attrs)
where queue_name = i_queue_name and table_name = fq_table_name;
if not found then
raise exception 'lost table: %', fq_table_name;
@@ -252,7 +254,8 @@ begin
update londiste.table_info
set local = true,
- merge_state = new_state
+ merge_state = new_state,
+ table_attrs = coalesce(i_table_attrs, table_attrs)
where queue_name = _queue_name and table_name = fq_table_name;
if not found then
raise exception 'lost table: %', fq_table_name;
@@ -266,7 +269,7 @@ begin
fq_table_name,
'skip_truncate=1');
end if;
-
+
-------- TRIGGER LOGIC
-- new trigger
diff --git a/sql/londiste/structure/tables.sql b/sql/londiste/structure/tables.sql
index 020acaa6..ddfd880a 100644
--- a/sql/londiste/structure/tables.sql
+++ b/sql/londiste/structure/tables.sql
@@ -99,7 +99,7 @@ create table londiste.table_info (
foreign key (queue_name)
references pgq_node.node_info (queue_name)
on delete cascade,
- check (dropped_ddl is null or merge_state = 'in-copy')
+ check (dropped_ddl is null or merge_state in ('in-copy', 'catching-up'))
);
diff --git a/upgrade/src/londiste.table_info.sql b/upgrade/src/londiste.table_info.sql
new file mode 100644
index 00000000..b199bfd6
--- /dev/null
+++ b/upgrade/src/londiste.table_info.sql
@@ -0,0 +1,2 @@
+ALTER TABLE londiste.table_info DROP CONSTRAINT table_info_check;
+ALTER TABLE londiste.table_info ADD CHECK (dropped_ddl is null or merge_state in ('in-copy', 'catching-up'));