summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarko Kreen2008-04-17 09:22:06 +0000
committerMarko Kreen2008-04-17 09:22:06 +0000
commit3067f919963b5be1ade551454ea92fbb6914b27e (patch)
tree4e1072a167106250bff65acd707c261cfdbada76
parentb49b5fca2624b47413149581646cfd9a90b68cfe (diff)
more londiste work
- parallel copy - don't error out in 'copy' when columns differ - it's an add-table problem - test big cascade with lots of tables
-rw-r--r--python/conf/londiste.ini13
-rwxr-xr-xpython/londiste.py6
-rw-r--r--python/londiste/playback.py142
-rw-r--r--python/londiste/table_copy.py74
-rw-r--r--python/pgq/setadmin.py13
-rwxr-xr-xtests/londiste/checkerr.sh4
-rwxr-xr-xtests/londiste/gendb.sh69
-rwxr-xr-xtests/londiste/makenode.sh99
-rwxr-xr-xtests/londiste/stop.sh24
9 files changed, 275 insertions, 169 deletions
diff --git a/python/conf/londiste.ini b/python/conf/londiste.ini
index a1506a32..7be8390e 100644
--- a/python/conf/londiste.ini
+++ b/python/conf/londiste.ini
@@ -1,16 +1,15 @@
[londiste]
-job_name = test_to_subcriber
+job_name = somedb_worker
-provider_db = dbname=provider port=6000 host=127.0.0.1
-subscriber_db = dbname=subscriber port=6000 host=127.0.0.1
+node_db = dbname=somedb host=127.0.0.1
-# it will be used as sql ident so no dots/spaces
-pgq_queue_name = londiste.replika
+set_name = some_set
logfile = ~/log/%(job_name)s.log
pidfile = ~/pid/%(job_name)s.pid
-# both events and ticks will be copied there
-#mirror_queue = replika_mirror
+
+# how many tables can be copied in parallel
+#parallel_copies = 1
diff --git a/python/londiste.py b/python/londiste.py
index e2ac37df..b4678e04 100755
--- a/python/londiste.py
+++ b/python/londiste.py
@@ -121,10 +121,14 @@ class Londiste(skytools.DBScript):
g.add_option("--skip-truncate", action="store_true", dest="skip_truncate",
help = "add: keep old data", default=False)
g.add_option("--provider",
- help = "init: upstream node temp connect string", default=None)
+ help = "init: upstream node temp connect string")
+ g.add_option("--create", action = 'callback', callback = self.opt_create_cb, type='string',
+ help = "add: create table/seq if not exist")
p.add_option_group(g)
return p
+ def opt_create_cb(self, option, opt_str, value, parser):
+ print opt_str, '=', value
if __name__ == '__main__':
script = Londiste(sys.argv[1:])
diff --git a/python/londiste/playback.py b/python/londiste/playback.py
index fdf90c2d..a9599691 100644
--- a/python/londiste/playback.py
+++ b/python/londiste/playback.py
@@ -46,9 +46,9 @@ class Counter(object):
self.do_sync += 1
elif t.state == TABLE_OK:
self.ok += 1
- # only one table is allowed to have in-progress copy
- if self.copy + self.catching_up + self.wanna_sync + self.do_sync > 1:
- raise Exception('Bad table state')
+
+ def get_copy_count(self):
+ return self.copy + self.catching_up + self.wanna_sync + self.do_sync
class TableState(object):
"""Keeps state about one table."""
@@ -263,6 +263,10 @@ class Replicator(pgq.SetConsumer):
self.copy_thread = 0
self.seq_cache = SeqCache()
+ self.parallel_copies = self.cf.getint('parallel_copies', 1)
+ if self.parallel_copies < 1:
+            raise Exception('Bad value for parallel_copies: %d' % self.parallel_copies)
+
def process_set_batch(self, src_db, dst_db, ev_list):
"All work for a batch. Entry point from SetConsumer."
@@ -350,56 +354,62 @@ class Replicator(pgq.SetConsumer):
def sync_from_main_thread(self, cnt, src_db, dst_db):
"Main thread sync logic."
- #
- # decide what to do - order is imortant
- #
+        # This operates on all tables, any amount can be in any state
+
+ ret = SYNC_OK
+
if cnt.do_sync:
# wait for copy thread to catch up
- return SYNC_LOOP
- elif cnt.wanna_sync:
+ ret = SYNC_LOOP
+
+ for t in self.get_tables_in_state(TABLE_WANNA_SYNC):
# copy thread wants sync, if not behind, do it
- t = self.get_table_by_state(TABLE_WANNA_SYNC)
if self.cur_tick >= t.sync_tick_id:
self.change_table_state(dst_db, t, TABLE_DO_SYNC, self.cur_tick)
- return SYNC_LOOP
- else:
- return SYNC_OK
- elif cnt.catching_up:
- # active copy, dont worry
- return SYNC_OK
- elif cnt.copy:
- # active copy, dont worry
- return SYNC_OK
- elif cnt.missing:
- # seems there is no active copy thread, launch new
- t = self.get_table_by_state(TABLE_MISSING)
-
- # drop all foreign keys to and from this table
- self.drop_fkeys(dst_db, t.name)
-
- # change state after fkeys are dropped thus allowing
- # failure inbetween
- self.change_table_state(dst_db, t, TABLE_IN_COPY)
-
- # the copy _may_ happen immidiately
- self.launch_copy(t)
-
- # there cannot be interesting events in current batch
- # but maybe there's several tables, lets do them in one go
- return SYNC_LOOP
- else:
- # seems everything is in sync
- return SYNC_OK
+ ret = SYNC_LOOP
+
+ npossible = self.parallel_copies - cnt.get_copy_count()
+ if cnt.missing and npossible > 0:
+ pmap = self.get_state_map(src_db.cursor())
+ src_db.commit()
+ for t in self.get_tables_in_state(TABLE_MISSING):
+ if t.name not in pmap:
+                self.log.warning("Table %s not available on provider" % t.name)
+ continue
+ pt = pmap[t.name]
+            if pt.state != TABLE_OK: # or pt.custom_snapshot: # FIXME: does snapshot matter?
+ self.log.info("Table %s not OK on provider, waiting" % t.name)
+ continue
+
+ # dont allow more copies than configured
+ if npossible == 0:
+ break
+ npossible -= 1
+
+ # drop all foreign keys to and from this table
+ self.drop_fkeys(dst_db, t.name)
+
+ # change state after fkeys are dropped thus allowing
+ # failure inbetween
+ self.change_table_state(dst_db, t, TABLE_IN_COPY)
+
+            # the copy _may_ happen immediately
+ self.launch_copy(t)
+
+ # there cannot be interesting events in current batch
+ # but maybe there's several tables, lets do them in one go
+ ret = SYNC_LOOP
+
+ return ret
def sync_from_copy_thread(self, cnt, src_db, dst_db):
"Copy thread sync logic."
- #
- # decide what to do - order is important
- #
- if cnt.do_sync:
+ # This operates on single table
+ t = self.table_map[self.copy_table_name]
+
+ if t.state == TABLE_DO_SYNC:
# main thread is waiting, catch up, then handle over
- t = self.get_table_by_state(TABLE_DO_SYNC)
if self.cur_tick == t.sync_tick_id:
self.change_table_state(dst_db, t, TABLE_OK)
return SYNC_EXIT
@@ -409,21 +419,19 @@ class Replicator(pgq.SetConsumer):
self.log.error("copy_sync: cur_tick=%d sync_tick=%d" % (
self.cur_tick, t.sync_tick_id))
raise Exception('Invalid table state')
- elif cnt.wanna_sync:
+ elif t.state == TABLE_WANNA_SYNC:
# wait for main thread to react
return SYNC_LOOP
- elif cnt.catching_up:
+ elif t.state == TABLE_CATCHING_UP:
# is there more work?
if self.work_state:
return SYNC_OK
# seems we have catched up
- t = self.get_table_by_state(TABLE_CATCHING_UP)
self.change_table_state(dst_db, t, TABLE_WANNA_SYNC, self.cur_tick)
return SYNC_LOOP
- elif cnt.copy:
+ elif t.state == TABLE_IN_COPY:
# table is not copied yet, do it
- t = self.get_table_by_state(TABLE_IN_COPY)
self.do_copy(t, src_db, dst_db)
# forget previous value
@@ -515,6 +523,20 @@ class Replicator(pgq.SetConsumer):
self.table_list = new_list
self.table_map = new_map
+ def get_state_map(self, curs):
+ """Get dict of table states."""
+
+ q = "select table_name, custom_snapshot, merge_state, skip_truncate"\
+ " from londiste.node_get_table_list(%s)"
+ curs.execute(q, [self.set_name])
+
+ new_map = {}
+ for row in curs.fetchall():
+ t = TableState(row['table_name'], self.log)
+ t.loaded_state(row['merge_state'], row['custom_snapshot'], row['skip_truncate'])
+ new_map[t.name] = t
+ return new_map
+
def save_table_state(self, curs):
"""Store changed table state in database."""
@@ -539,13 +561,12 @@ class Replicator(pgq.SetConsumer):
self.log.info("Table %s status changed to '%s'" % (
tbl.name, tbl.render_state()))
- def get_table_by_state(self, state):
- "get first table with specific state"
+ def get_tables_in_state(self, state):
+ "get all tables with specific state"
for t in self.table_list:
if t.state == state:
- return t
- raise Exception('No table was found with state: %d' % state)
+ yield t
def get_table_by_name(self, name):
if name.find('.') < 0:
@@ -558,22 +579,21 @@ class Replicator(pgq.SetConsumer):
self.log.info("Launching copy process")
script = sys.argv[0]
conf = self.cf.filename
+ cmd = [script, conf, 'copy', tbl_stat.name, '-d', '-q']
if self.options.verbose:
- cmd = "%s -d -v %s copy"
- else:
- cmd = "%s -d %s copy"
- cmd = cmd % (script, conf)
+ cmd.append('-v')
# let existing copy finish and clean its pidfile,
- # otherwise new copy will exit immidiately
- copy_pidfile = self.pidfile + ".copy"
+        # otherwise new copy will exit immediately.
+ # FIXME: should not happen on per-table pidfile ???
+ copy_pidfile = "%s.copy.%s" % (self.pidfile, tbl_stat.name)
while os.path.isfile(copy_pidfile):
- self.log.info("Waiting for existing copy to exit")
+ self.log.warning("Waiting for existing copy to exit")
time.sleep(2)
self.log.debug("Launch args: "+repr(cmd))
- res = os.system(cmd)
- self.log.debug("Launch result: "+repr(res))
+ pid = os.spawnvp(os.P_NOWAIT, script, cmd)
+ self.log.debug("Launch result: "+repr(pid))
def sync_database_encodings(self, src_db, dst_db):
"""Make sure client_encoding is same on both side."""
diff --git a/python/londiste/table_copy.py b/python/londiste/table_copy.py
index c712d8bd..b78379d4 100644
--- a/python/londiste/table_copy.py
+++ b/python/londiste/table_copy.py
@@ -16,18 +16,35 @@ class CopyTable(Replicator):
def __init__(self, args, copy_thread = 1):
Replicator.__init__(self, args)
- if copy_thread:
- self.pidfile += ".copy"
- self.consumer_name += "_copy"
- self.copy_thread = 1
- self.main_worker = False
+ if not copy_thread:
+ raise Exception("Combined copy not supported")
+
+ if len(self.args):
+ print "londiste copy requires table name"
+ self.copy_table_name = self.args[2]
+
+ self.pidfile += ".copy.%s" % self.copy_table_name
+ self.consumer_name += "_copy_%s" % self.copy_table_name
+ self.copy_thread = 1
+ self.main_worker = False
def do_copy(self, tbl_stat, src_db, dst_db):
- # it should not matter to pgq
- src_db.commit()
dst_db.commit()
+ while 1:
+ pmap = self.get_state_map(src_db.cursor())
+ src_db.commit()
+ if tbl_stat.name not in pmap:
+                raise Exception("table %s not available on provider" % tbl_stat.name)
+ pt = pmap[tbl_stat.name]
+ if pt.state == TABLE_OK:
+ break
+
+ self.log.warning("table %s not in sync yet on provider, waiting" % tbl_stat.name)
+ time.sleep(10)
+
+
# change to SERIALIZABLE isolation level
src_db.set_isolation_level(skytools.I_SERIALIZABLE)
src_db.commit()
@@ -40,22 +57,43 @@ class CopyTable(Replicator):
self.log.info("Starting full copy of %s" % tbl_stat.name)
+ # just in case, drop all fkeys (in case "replay" was skipped)
+ # !! this may commit, so must be done before anything else !!
+ self.drop_fkeys(dst_db, tbl_stat.name)
+
+ # drop own triggers
+ q_node_trg = "select * from londiste.node_disable_triggers(%s, %s)"
+ dst_curs.execute(q_node_trg, [self.set_name, tbl_stat.name])
+
+ # drop rest of the triggers
+ q_triggers = "select londiste.drop_all_table_triggers(%s)"
+ dst_curs.execute(q_triggers, [tbl_stat.name])
+
# find dst struct
src_struct = TableStruct(src_curs, tbl_stat.name)
dst_struct = TableStruct(dst_curs, tbl_stat.name)
- # check if columns match
+ # take common columns, warn on missing ones
dlist = dst_struct.get_column_list()
- for c in src_struct.get_column_list():
+ slist = src_struct.get_column_list()
+ common_cols = []
+ for c in slist:
if c not in dlist:
- raise Exception('Column %s does not exist on dest side' % c)
+ self.log.warning("Table %s column %s does not exist on subscriber"
+ % (tbl_stat.name, c))
+ else:
+ common_cols.append(c)
+ for c in dlist:
+ if c not in slist:
+ self.log.warning("Table %s column %s does not exist on provider"
+ % (tbl_stat.name, c))
# drop unnecessary stuff
objs = T_CONSTRAINT | T_INDEX | T_RULE
dst_struct.drop(dst_curs, objs, log = self.log)
# do truncate & copy
- self.real_copy(src_curs, dst_curs, tbl_stat)
+ self.real_copy(src_curs, dst_curs, tbl_stat, common_cols)
# get snapshot
src_curs.execute("select txid_current_snapshot()")
@@ -66,6 +104,10 @@ class CopyTable(Replicator):
src_db.set_isolation_level(1)
src_db.commit()
+ # restore own triggers
+ q_node_trg = "select * from londiste.node_refresh_triggers(%s, %s)"
+ dst_curs.execute(q_node_trg, [self.set_name, tbl_stat.name])
+
# create previously dropped objects
dst_struct.create(dst_curs, objs, log = self.log)
dst_db.commit()
@@ -79,14 +121,7 @@ class CopyTable(Replicator):
self.save_table_state(dst_curs)
dst_db.commit()
- # if copy done, request immidiate tick from pgqadm,
- # to make state juggling faster. on mostly idle db-s
- # each step may take tickers idle_timeout secs, which is pain.
- q = "select pgq.force_tick(%s)"
- src_curs.execute(q, [self.src_queue.queue_name])
- src_db.commit()
-
- def real_copy(self, srccurs, dstcurs, tbl_stat):
+ def real_copy(self, srccurs, dstcurs, tbl_stat, col_list):
"Main copy logic."
tablename = tbl_stat.name
@@ -99,7 +134,6 @@ class CopyTable(Replicator):
# do copy
self.log.info("%s: start copy" % tablename)
- col_list = skytools.get_table_columns(srccurs, tablename)
stats = skytools.full_copy(tablename, srccurs, dstcurs, col_list)
if stats:
self.log.info("%s: copy finished: %d bytes, %d rows" % (
diff --git a/python/pgq/setadmin.py b/python/pgq/setadmin.py
index d1b10e33..47f70de8 100644
--- a/python/pgq/setadmin.py
+++ b/python/pgq/setadmin.py
@@ -119,7 +119,7 @@ class SetAdmin(skytools.DBScript):
provider_db.commit()
row = curs.fetchone()
if not row:
- raise Exceotion("provider node not found")
+ raise Exception("provider node not found")
provider_name = row['node_name']
# register member on root
@@ -166,6 +166,12 @@ class SetAdmin(skytools.DBScript):
while 1:
db = self.get_database('root_db', connstr = loc)
+
+ if 1:
+ curs = db.cursor()
+ curs.execute("select current_database()")
+ n = curs.fetchone()[0]
+ self.log.debug("real dbname=%s" % n)
# query current status
res = self.exec_query(db, "select * from pgq_set.get_node_info(%s)", [self.set_name])
info = res[0]
@@ -174,12 +180,15 @@ class SetAdmin(skytools.DBScript):
self.log.info("Root node not initialized?")
sys.exit(1)
+ self.log.debug("db='%s' -- type='%s' provider='%s'" % (loc, type, info['provider_location']))
# configured db may not be root anymore, walk upwards then
if type in ('root', 'combined-root'):
db.commit()
return db
- self.close_connection()
+ self.close_database('root_db')
+ if loc == info['provider_location']:
+ raise Exception("find_root_db: got loop: %s" % loc)
loc = info['provider_location']
if loc is None:
self.log.info("Sub node provider not initialized?")
diff --git a/tests/londiste/checkerr.sh b/tests/londiste/checkerr.sh
new file mode 100755
index 00000000..ce26c71a
--- /dev/null
+++ b/tests/londiste/checkerr.sh
@@ -0,0 +1,4 @@
+#! /bin/sh
+
+grep -E 'WARN|ERR|CRIT' sys/log.*
+
diff --git a/tests/londiste/gendb.sh b/tests/londiste/gendb.sh
index 25950d8d..38678bbf 100755
--- a/tests/londiste/gendb.sh
+++ b/tests/londiste/gendb.sh
@@ -2,70 +2,17 @@
. ../env.sh
-contrib=/usr/share/postgresql/8.1/contrib
-contrib=/opt/apps/pgsql-dev/share/contrib
-contrib=/opt/pgsql/share/contrib
-
-set -e
-
-
-mkdir -p sys
./stop.sh
-sleep 1
-
-rm -rf file_logs sys
-mkdir -p sys
+rm -f sys/log.*
-db=db_root
-echo "creating database: $db"
-dropdb $db && sleep 1 || true
-sleep 1
-createdb $db
-londiste.py conf/w_root.ini init-root n_root "dbname=$db"
-pgqadm.py conf/ticker_root.ini install
-psql -q $db -f data.sql
-londiste.py conf/w_root.ini add data1
-londiste.py conf/w_root.ini add data1
-londiste.py conf/w_root.ini remove data1
-londiste.py conf/w_root.ini remove data1
-londiste.py conf/w_root.ini add data1
-londiste.py conf/w_root.ini tables
-
-db=db_branch
-echo "creating database: $db"
-dropdb $db && sleep 1 || true
-createdb $db
-pgqadm.py conf/ticker_branch.ini install
-londiste.py conf/w_branch.ini init-branch n_branch "dbname=$db" --provider="dbname=db_root"
-psql -q $db -f data.sql
-londiste.py conf/w_branch.ini add data1
-londiste.py conf/w_branch.ini add data1
-londiste.py conf/w_branch.ini remove data1
-londiste.py conf/w_branch.ini remove data1
-londiste.py conf/w_branch.ini add data1
-londiste.py conf/w_branch.ini tables
-
-exit 0
-
-db=subscriber
-echo "creating database: $db"
-dropdb $db
-sleep 1
-createdb $db
-pgqadm.py conf/linkticker.ini install
-psql -q $db -f data.sql
+set -e
-db=file_subscriber
-echo "creating database: $db"
-dropdb $db
-sleep 1
-createdb $db
-createlang plpgsql $db
-createlang plpythonu $db
-psql -q $db -f data.sql
-echo "done, testing"
+./makenode.sh test_set root root
-#pgqmgr.py -d conf/ticker.ini ticker
-#./run-tests.sh
+last=root
+for n in `seq 1 10`; do
+ ./makenode.sh test_set node$n branch $last
+ last=node$n
+done
diff --git a/tests/londiste/makenode.sh b/tests/londiste/makenode.sh
new file mode 100755
index 00000000..caf11dab
--- /dev/null
+++ b/tests/londiste/makenode.sh
@@ -0,0 +1,99 @@
+#! /bin/sh
+
+set -e
+
+msg () {
+ echo " *" "$@"
+}
+
+run () {
+ echo "\$ $*"
+ "$@"
+}
+
+
+# usage: makenode <set_name> <base_name> <type> <provider_base_name>
+set_name="$1"
+base_name="$2"
+node_type="$3"
+provider_base_name="$4"
+
+db="db_$base_name"
+connstr="dbname=$db host=127.0.0.1"
+node_name="n_$base_name"
+ticker_conf="sys/ticker_$base_name.ini"
+londiste_conf="sys/worker_$base_name.ini"
+
+for pf in sys/pid.ticker_$base_name \
+ sys/pid.worker_$base_name \
+ sys/pid.worker_$base_name.*
+do
+ test -f $pf || continue
+ msg "Killing $pf"
+ kill `cat $pf`
+ sleep 1
+done
+
+msg "Creating $ticker_conf"
+cat > "$ticker_conf" <<EOF
+[pgqadm]
+job_name = ticker_$base_name
+db = $connstr
+maint_delay_min = 1
+loop_delay = 0.5
+logfile = sys/log.%(job_name)s
+pidfile = sys/pid.%(job_name)s
+use_skylog = 0
+connection_lifetime = 10
+queue_refresh_period = 10
+EOF
+
+msg "Creating $londiste_conf"
+cat > "$londiste_conf" <<EOF
+[londiste]
+job_name = worker_$base_name
+set_name = $set_name
+node_db = $connstr
+pidfile = sys/pid.%(job_name)s
+logfile = sys/log.%(job_name)s
+loop_delay = 1
+connection_lifetime = 10
+parallel_copies = 4
+EOF
+
+
+msg "Dropping & Creating $db"
+dropdb $db 2>&1 | grep -v 'not exist' || true
+createdb $db
+
+msg "Installing pgq"
+pgqadm.py $ticker_conf install
+msg "Launching ticker"
+pgqadm.py $ticker_conf ticker -d
+
+msg "Initializing node"
+run londiste.py $londiste_conf "init-$node_type" "$node_name" "$connstr" -v \
+ --provider="dbname=db_$provider_base_name host=127.0.0.1"
+
+msg "Launching Londiste"
+londiste.py $londiste_conf worker -d -v
+
+for n in `seq 1 16`; do
+ tbl="manytable$n"
+ msg "Creating $tbl on n_$base_name"
+ { psql -q $db 2>&1 | grep -v NOTICE || true ; }<<EOF
+create table $tbl (
+ id serial primary key,
+ txt text not null
+);
+insert into $tbl (txt)
+select '$tbl-$base_name'
+ from generate_series(1, 5 + $n);
+EOF
+
+ msg "Adding $tbl to n_$base_name"
+ londiste.py $londiste_conf add $tbl
+
+done
+
+
diff --git a/tests/londiste/stop.sh b/tests/londiste/stop.sh
index f808281d..2bf9220a 100755
--- a/tests/londiste/stop.sh
+++ b/tests/londiste/stop.sh
@@ -1,21 +1,11 @@
#! /bin/sh
-. ../env.sh
-./testing.py -s conf/tester.ini
-londiste.py -s conf/w_leaf.ini
-londiste.py -s conf/w_branch.ini
-londiste.py -s conf/w_root.ini
-
-sleep 1
-
-pgqadm.py -s conf/ticker_root.ini
-pgqadm.py -s conf/ticker_branch.ini
-pgqadm.py -s conf/ticker_leaf.ini
-
-sleep 1
-
-for f in sys/pid.*; do
- test -f "$f" || continue
- kill `cat $f`
+got=0
+for pf in sys/pid.*; do
+ test -f "$pf" || continue
+ echo " * Killing $pf"
+ kill `cat $pf`
+ got=1
done
+test $got = 0 || sleep 1