client/collectors/postgres.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171

import csv
import multiprocessing
import os
import psycopg2
import psycopg2.extras
import time

from multiprocessing import Process, Queue
from utils.logging import log


class PostgresCollector(object):
    """
    collects basic PostgreSQL-level statistics (bgwriter, databases, tables,
    indexes)
    """

    def __init__(self, dbname):
        self._dbname = dbname

    def start(self):
        self._in_queue = Queue()
        self._out_queue = Queue()
        self._worker = Process(target=run_collector,
                               args=(self._in_queue, self._out_queue,
                                     self._dbname))
        self._worker.start()

    def stop(self):

        # signal the worker process to stop by writing a value into the queue
        self._in_queue.put(True)

        log("stopping the PostgreSQL statistics collector")

        # Wait for collector to place result into the output queue. This needs
        # to happen before calling join() otherwise it causes a deadlock.
        log("waiting for collector result in a queue")
        self._result = self._out_queue.get()

        # And wait for the worker to terminate. This should be pretty fast as
        # the collector places result into the queue right before terminating.
        log("waiting for collector process to terminate")
        self._worker.join()

        self._worker = None
        self._in_queue = None
        self._out_queue = None

    def result(self):
        return self._result


def run_collector(in_queue, out_queue, dbname, interval=1.0):
    """
    collector code for a separate process, communicating through a pair of
    queues
    """

    bgwriter_log = None
    tables_log = None
    indexes_log = None
    database_log = None

    # get current timestamp
    ts = time.time()

    while True:

        # wait until the next tick
        ts += interval

        # if we're behind, skip forward
        if ts < time.time():
            continue

        # sleep (but only for the remaining time, to prevent drift)
        time.sleep(ts - time.time())

        # if we've received message in the input queue (not empty), terminate
        if not in_queue.empty():
            log("PostgreSQL collector received request to terminate")
            break

        # open connection to the benchmark database (if can't open, continue)
        # notice this is intentionally after the wait, so we'll wait before
        # next connection attempt
        try:
            conn = psycopg2.connect('host=localhost dbname=%s' % (dbname,))
            cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        except Exception as ex:
            continue

        # background writer stats
        cur.execute('SELECT EXTRACT(EPOCH FROM now()) AS ts, * '
                    'FROM pg_stat_bgwriter')

        # on the first iteration, construct the CSV files
        if bgwriter_log is None:
            fields = [desc[0] for desc in cur.description]
            bgwriter_log = csv.DictWriter(open('bgwriter.csv', 'w'), fields)
            bgwriter_log.writeheader()

        bgwriter_log.writerows(cur.fetchall())

        # TODO we can assume statistics for most objects (tables, indexes)
        # won't change every second, so we can optimize the amount of data by
        # detecting changes and only keeping the two rows next to it

        # table statistics
        cur.execute('SELECT EXTRACT(EPOCH FROM now()) AS ts, * '
                    'FROM pg_stat_all_tables JOIN pg_statio_all_tables '
                    'USING (relid, schemaname, relname)')

        # on the first iteration, construct the CSV files
        if tables_log is None:
            fields = [desc[0] for desc in cur.description]
            tables_log = csv.DictWriter(open('tables.csv', 'w'), fields)
            tables_log.writeheader()

        tables_log.writerows(cur.fetchall())

        # index statistics
        cur.execute('SELECT EXTRACT(EPOCH FROM now()) AS ts, * '
                    'FROM pg_stat_all_indexes JOIN pg_statio_all_indexes '
                    'USING (relid, indexrelid, schemaname, relname, '
                    'indexrelname)')

        # on the first iteration, construct the CSV files
        if indexes_log is None:
            fields = [desc[0] for desc in cur.description]
            indexes_log = csv.DictWriter(open('indexes.csv', 'w'), fields)
            indexes_log.writeheader()

        indexes_log.writerows(cur.fetchall())

        # database statistics
        cur.execute('SELECT EXTRACT(EPOCH FROM now()) AS ts, * '
                    'FROM pg_stat_database')

        # on the first iteration, construct the CSV files
        if database_log is None:
            fields = [desc[0] for desc in cur.description]
            database_log = csv.DictWriter(open('database.csv', 'w'), fields)
            database_log.writeheader()

        database_log.writerows(cur.fetchall())

        conn.close()

    log("PostgreSQL collector generates CSV results")

    # close the CSV writers
    bgwriter_log = None
    tables_log = None
    indexes_log = None
    database_log = None

    result = {}

    for file in ['bgwriter', 'tables', 'indexes', 'database']:
        if os.path.isfile(''.join([file, '.csv'])):
            with open(''.join([file, '.csv']), 'r') as f:
                result.update({file : f.read()})

                # remove the files
                os.remove(''.join([file, '.csv']))

    out_queue.put(result)

    log("PostgreSQL collector put results into output queue and terminates")