import math import os import os.path import re import time from numpy import mean, median, std from multiprocessing import cpu_count from utils.logging import log from utils.misc import available_ram, run_cmd class PgBench(object): 'a simple wrapper around pgbench, running TPC-B-like workload by default' # TODO allow running custom scripts, not just the default # read-write/read-only tests # TODO allow running 'prepared' mode def __init__(self, bin_path, dbname, runs=3, duration=60, csv=False, results_dir=None): ''' bin_path - path to PostgreSQL binaries (dropdb, createdb, psql commands) dbname - name of the database to use duration - duration of each execution runs - number of runs (for each client count) out_dir - output directory ''' self._bin = bin_path self._csv = csv self._dbname = dbname self._duration = duration self._outdir = results_dir self._runs = runs self._env = os.environ self._env['PATH'] = ':'.join([bin_path, self._env['PATH']]) self._results = {} @staticmethod def _configure(cpu_count, ram_mbs): 'derive the configurations to benchmark from CPU count and RAM size' config = [] # TODO allow overriding this from a global config # scales: 10 (small), 50% of RAM, 200% of RAM # for s in [10, ram_mbs/15/2, ram_mbs*2/15]: for s in [10]: config.append({'scale': int(math.ceil(s / 10) * 10), 'clients': [1, cpu_count, 2 * cpu_count]}) return config def _init(self, scale): """ recreate the database (drop + create) and populate it with given scale """ # initialize results for this dataset scale self._results['results'] = { 'init': None, 'runs': [], 'warmup': None, } log("recreating '%s' database" % (self._dbname,)) run_cmd(['dropdb', '--if-exists', self._dbname], env=self._env) run_cmd(['createdb', self._dbname], env=self._env) log("initializing pgbench '%s' with scale %s" % (self._dbname, scale)) r = run_cmd(['pgbench', '-i', '-s', str(scale), self._dbname], env=self._env, cwd=self._outdir) # remember the init duration self._results['results']['init'] = r[2] @staticmethod def _parse_results(data): 'extract results (including parameters) from the pgbench output' scale = -1 r = re.search('scaling factor: ([0-9]+)', data) if r: scale = r.group(1) mode = -1 r = re.search('query mode: (.+)', data) if r: mode = r.group(1) clients = -1 r = re.search('number of clients: ([0-9]+)', data) if r: clients = r.group(1) threads = -1 r = re.search('number of threads: ([0-9]+)', data) if r: threads = r.group(1) duration = -1 r = re.search('duration: ([0-9]+) s', data) if r: duration = r.group(1) latency = -1 r = re.search('latency average: ([0-9\.]+) ms', data) if r: latency = r.group(1) tps = -1 r = re.search('tps = ([0-9]+\.[0-9]+) \(excluding connections ' 'establishing\)', data) if r: tps = r.group(1) return {'scale': scale, 'mode': mode, 'clients': clients, 'threads': threads, 'duration': duration, 'latency': latency, 'tps': tps} def check_config(self): 'check pgbench configuration (existence of binaries etc.)' issues = [] if not os.path.isdir(self._bin): issues.append("bin_dir='%s' does not exist" % (self._bin,)) elif not os.path.exists('%s/pgbench' % (self._bin,)): issues.append("pgbench not found in bin_dir='%s'" % (self._bin,)) elif not os.path.exists('%s/createdb' % (self._bin,)): issues.append("createdb not found in bin_dir='%s'" % (self._bin,)) elif not os.path.exists('%s/dropdb' % (self._bin,)): issues.append("dropdb not found in bin_dir='%s'" % (self._bin,)) elif not os.path.exists('%s/psql' % (self._bin,)): issues.append("psql not found in bin_dir='%s'" % (self._bin,)) if type(self._duration) is not int: issues.append("duration (%s) needs to be an integer" % self._duration) elif not self._duration >= 1: issues.append("duration (%s) needs to be >= 1" % (self._duration,)) if type(self._runs) is not int: issues.append("runs (%s) needs to be an integer" % self._duration) elif not self._runs >= 1: issues.append("runs (%s) needs to be >= 1" % (self._runs,)) return issues def _run(self, run, scale, duration, nclients=1, njobs=1, read_only=False, aggregate=True, csv_queue=None): 'run pgbench on the database (either a warmup or actual benchmark run)' # Create a separate directory for each pgbench run if read_only: rtag = "ro" else: rtag = "rw" rdir = "%s/pgbench-%s-%d-%d-%s" % (self._outdir, rtag, scale, nclients, str(run)) os.mkdir(rdir) args = ['pgbench', '-c', str(nclients), '-j', str(njobs), '-T', str(duration)] # aggregate on per second resolution if aggregate: args.extend(['-l', '--aggregate-interval', '1']) if read_only: args.extend(['-S']) args.extend([self._dbname]) # do an explicit checkpoint before each run run_cmd(['psql', self._dbname, '-c', 'checkpoint'], env=self._env) log("pgbench: clients=%d, jobs=%d, aggregate=%s, read-only=%s, " "duration=%d" % (nclients, njobs, aggregate, read_only, duration)) start = time.time() r = run_cmd(args, env=self._env, cwd=rdir) end = time.time() r = PgBench._parse_results(r[1]) r.update({'read-only': read_only}) r.update({'start': start, 'end': end}) if csv_queue is not None: csv_queue.put([start, end, r['scale'], nclients, njobs, mode, duration, latency, tps]) return r def run_tests(self, csv_queue): """ execute the whole benchmark, including initialization, warmup and benchmark runs """ # derive configuration for the CPU count / RAM size configs = PgBench._configure(cpu_count(), available_ram()) results = {'ro': {}, 'rw': {}} j = 0 for config in configs: scale = config['scale'] if scale not in results['ro']: results['ro'][scale] = {} if scale not in results['rw']: results['rw'][scale] = {} # init for the dataset scale and warmup self._init(scale) warmup = self._run('w%d' % j, scale, self._duration, cpu_count(), cpu_count()) j += 1 # read-only & read-write for ro in [True, False]: if ro: tag = 'ro' else: tag = 'rw' for i in range(self._runs): log("pgbench : %s run=%d" % (tag, i)) for clients in config['clients']: if clients not in results[tag][scale]: results[tag][scale][clients] = {} results[tag][scale][clients]['results'] = [] r = self._run(i, scale, self._duration, clients, clients, ro, True, csv_queue) r.update({'run': i}) results[tag][scale][clients]['results'].append(r) tps = [] for result in results[tag][scale][clients]['results']: tps.append(float(result['tps'])) results[tag][scale][clients]['metric'] = mean(tps) results[tag][scale][clients]['median'] = median(tps) results[tag][scale][clients]['std'] = std(tps) self._results['pgbench'] = results return self._results