From 6dc97e125b56ca5ff459c900a9bb34807b8f613e Mon Sep 17 00:00:00 2001 From: "dementii.priadko" <45518657+DEMNERD@users.noreply.github.com> Date: Mon, 30 Jun 2025 18:58:58 +0300 Subject: [PATCH] Added more metrics and Average query runtime and Sessions dashboards --- config/grafana/dashboards/dash1.json | 455 +- .../provisioning/datasources/datasources.yml | 11 +- config/pgwatch-prometheus/metrics.yml | 4541 +++++++++++++++++ config/pgwatch-prometheus/sources.yml | 19 +- config/prometheus/prometheus.yml | 8 +- old-metrics.yml | 0 6 files changed, 5020 insertions(+), 14 deletions(-) create mode 100644 old-metrics.yml diff --git a/config/grafana/dashboards/dash1.json b/config/grafana/dashboards/dash1.json index ac64f5d..ba66e0a 100644 --- a/config/grafana/dashboards/dash1.json +++ b/config/grafana/dashboards/dash1.json @@ -17,10 +17,114 @@ }, "editable": true, "fiscalYearStartMonth": 0, - "graphTooltip": 0, + "graphTooltip": 2, "id": 1, "links": [], "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 8, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 1, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + 
"y": 0 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(pgwatch_stat_statements_total_time[$agg_interval])) / sum(rate(pgwatch_stat_statements_calls[$agg_interval]))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "Average query runtime", + "type": "timeseries" + }, { "datasource": { "type": "datasource", @@ -96,7 +200,7 @@ "gridPos": { "h": 8, "w": 12, - "x": 0, + "x": 12, "y": 0 }, "id": 1, @@ -172,10 +276,6 @@ "options": {} }, { - "filter": { - "id": "byRefId", - "options": "/^(?:seriesToRows-B-B-B-B-B-B-B-B-B)$/" - }, "id": "extractFields", "options": { "delimiter": ",", @@ -212,6 +312,309 @@ } ], "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "bars", + "fillOpacity": 16, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 1, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + 
"mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byFrameRefID", + "options": "D" + }, + "properties": [ + { + "id": "custom.drawStyle", + "value": "line" + }, + { + "id": "custom.stacking", + "value": { + "group": "A", + "mode": "none" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byFrameRefID", + "options": "E" + }, + "properties": [ + { + "id": "custom.drawStyle", + "value": "line" + }, + { + "id": "custom.stacking", + "value": { + "group": "A", + "mode": "none" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "color", + "value": { + "fixedColor": "#4e7299", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Active" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Idle" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#afafaf", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Idle-in-transaction" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Waiting" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + 
"disableTextWrap": false, + "editorMode": "builder", + "expr": "pgwatch_backends_idleintransaction", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Idle-in-transaction", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "pgwatch_backends_idle", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Idle", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "pgwatch_backends_active", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Active", + "range": true, + "refId": "C", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "pgwatch_settings_max_connections", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Max connections", + "range": true, + "refId": "D", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "pgwatch_backends_total", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Total", + "range": true, + "refId": "E", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "pgwatch_backends_waiting ", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": 
"Waiting", + "range": true, + "refId": "F", + "useBackend": false + } + ], + "title": "Sessions", + "type": "timeseries" } ], "preload": false, @@ -219,7 +622,43 @@ "schemaVersion": 41, "tags": [], "templating": { - "list": [] + "list": [ + { + "auto": true, + "auto_count": 30, + "auto_min": "10s", + "current": { + "text": "10s", + "value": "10s" + }, + "name": "agg_interval", + "options": [ + { + "selected": true, + "text": "10s", + "value": "10s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + } + ], + "query": "10s,1m,5m,1h", + "refresh": 2, + "type": "interval" + } + ] }, "time": { "from": "now-5m", @@ -229,5 +668,5 @@ "timezone": "browser", "title": "PoC", "uid": "00eb62a7-4b80-43cd-a890-45336979aa18", - "version": 2 + "version": 16 } \ No newline at end of file diff --git a/config/grafana/provisioning/datasources/datasources.yml b/config/grafana/provisioning/datasources/datasources.yml index 699bbbc..91ca878 100644 --- a/config/grafana/provisioning/datasources/datasources.yml +++ b/config/grafana/provisioning/datasources/datasources.yml @@ -4,6 +4,7 @@ datasources: - name: PGWatch-PostgreSQL type: postgres access: proxy + uid: P031DD592934B2F1F url: sink-postgres:5432 database: measurements user: pgwatch @@ -12,10 +13,16 @@ datasources: jsonData: sslmode: disable postgresVersion: 1500 - isDefault: true + isDefault: false - name: PGWatch-Prometheus type: prometheus access: proxy + uid: P7A0D6631BB10B34F url: http://sink-prometheus:9090 - isDefault: false \ No newline at end of file + isDefault: true + jsonData: + scrapeInterval: '5s' + queryTimeout: '5s' + timeInterval: '5s' + httpMethod: 'POST' \ No newline at end of file diff --git a/config/pgwatch-prometheus/metrics.yml b/config/pgwatch-prometheus/metrics.yml index 492724b..b8fbd61 100644 --- a/config/pgwatch-prometheus/metrics.yml +++ 
b/config/pgwatch-prometheus/metrics.yml @@ -1,4 +1,27 @@ # Simple PGWatch Metrics for Prometheus - just queryid and calls + +# The following structure is expected for metrics and preset definitions: +# metrics: +# metric_name: +# init_sql: |- +# CREATE EXTENSION IF NOT EXISTS some_extension; +# CREATE OR REPLACE FUNCTION get_some_stat(OUT some_stat int) +# ... +# sqls: +# 11: | +# select /* pgwatch_generated */ +# (extract(epoch from now()) * 1e9)::int8 as epoch_ns, +# ... +# 14: | +# select /* pgwatch_generated */ +# (extract(epoch from now()) * 1e9)::int8 as epoch_ns, +# ... +# gauges: +# - '*' +# is_instance_level: true +# node_status: primary +# statement_timeout_seconds: 300 +# metric_storage_name: db_stats metrics: pg_stat_statements_calls: description: "Simple queryid and calls metric" @@ -19,3 +42,4521 @@ metrics: metric_storage_name: pgss_calls node_status: primary statement_timeout_seconds: 5 + + archiver: + description: > + This metric retrieves key statistics from the PostgreSQL `pg_stat_archiver` view providing insights into the status of WAL file archiving. + It returns the total number of successfully archived files and failed archiving attempts. Additionally, it identifies if the most recent + attempt resulted in a failure and calculates how many seconds have passed since the last failure. The metric only considers data if WAL + archiving is enabled in the system, helping administrators monitor and diagnose issues related to the archiving process. 
+ sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + archived_count, + failed_count, + case when coalesce(last_failed_time, '1970-01-01'::timestamptz) > coalesce(last_archived_time, '1970-01-01'::timestamptz) then 1 else 0 end as is_failing_int, + extract(epoch from now() - last_failed_time)::int8 as seconds_since_last_failure + from + pg_stat_archiver + where + current_setting('archive_mode') in ('on', 'always') + gauges: + - is_failing_int + - seconds_since_last_failure + is_instance_level: true + archiver_pending_count: + description: > + This metric retrieves the count of WAL files waiting to be archived by checking the pg_wal/archive_status directory + for files with .ready extension. It helps monitor the archiving backlog and potential issues with WAL archiving. + sqls: + 10: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + count(*) as archiver_pending_count + from + (select pg_ls_dir('pg_wal/archive_status')) a + where + pg_ls_dir ~ '[0-9A-F]{24}.ready' + 9.4: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + count(*) as archiver_pending_count + from + (select pg_ls_dir('pg_xlog/archive_status')) a + where + pg_ls_dir ~ '[0-9A-F]{24}.ready' + gauges: + - archiver_pending_count + is_instance_level: true + backends: + description: > + This metric gathers detailed information from the PostgreSQL pg_stat_activity view, providing an overview of the database's current session + and activity state. It tracks the total number of client backends, active sessions, idle sessions, sessions waiting on locks, and background workers. + The metric also calculates statistics on blocked sessions, most extended waiting times, average and longest session durations, transaction times, + and query durations. Additionally, it monitors autovacuum worker activity and provides the age of the oldest transaction (measured by xmin). 
+ This metric helps administrators monitor session states, detect bottlenecks, and ensure the system is within its connection limits, + providing visibility into database performance and contention. + sqls: + 11: | + with sa_snapshot as ( + select * from pg_stat_activity + where pid != pg_backend_pid() + and datname = current_database() + ) + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + (select count(*) from sa_snapshot where backend_type = 'client backend') as total, + (select count(*) from pg_stat_activity where pid != pg_backend_pid()) as instance_total, + current_setting('max_connections')::int as max_connections, + (select count(*) from sa_snapshot where backend_type = 'background worker') as background_workers, + (select count(*) from sa_snapshot where state = 'active' and backend_type = 'client backend') as active, + (select count(*) from sa_snapshot where state = 'idle' and backend_type = 'client backend') as idle, + (select count(*) from sa_snapshot where state = 'idle in transaction' and backend_type = 'client backend') as idleintransaction, + (select count(*) from sa_snapshot where wait_event_type in ('LWLock', 'Lock', 'BufferPin') and backend_type = 'client backend') as waiting, + (select coalesce(sum(case when coalesce(array_length(pg_blocking_pids(pid), 1), 0) >= 1 then 1 else 0 end), 0) from sa_snapshot where backend_type = 'client backend' and state = 'active') as blocked, + (select ceil(extract(epoch from max(now() - query_start)))::int from sa_snapshot where wait_event_type in ('LWLock', 'Lock', 'BufferPin') and backend_type = 'client backend') as longest_waiting_seconds, + (select round(avg(abs(extract(epoch from now() - query_start)))::numeric, 3)::float from sa_snapshot where wait_event_type in ('LWLock', 'Lock', 'BufferPin') and backend_type = 'client backend') as avg_waiting_seconds, + (select ceil(extract(epoch from (now() - backend_start)))::int from sa_snapshot where backend_type = 'client 
backend' order by backend_start limit 1) as longest_session_seconds, + (select round(avg(abs(extract(epoch from now() - backend_start)))::numeric, 3)::float from sa_snapshot where backend_type = 'client backend') as avg_session_seconds, + (select ceil(extract(epoch from (now() - xact_start)))::int from sa_snapshot where xact_start is not null and backend_type = 'client backend' order by xact_start limit 1) as longest_tx_seconds, + (select round(avg(abs(extract(epoch from now() - xact_start)))::numeric, 3)::float from sa_snapshot where xact_start is not null and backend_type = 'client backend') as avg_tx_seconds, + (select ceil(extract(epoch from (now() - xact_start)))::int from sa_snapshot where backend_type = 'autovacuum worker' order by xact_start limit 1) as longest_autovacuum_seconds, + (select ceil(extract(epoch from max(now() - query_start)))::int from sa_snapshot where state = 'active' and backend_type = 'client backend') as longest_query_seconds, + (select round(avg(abs(extract(epoch from now() - query_start)))::numeric, 3)::float from sa_snapshot where state = 'active' and backend_type = 'client backend') as avg_query_seconds, + (select max(age(backend_xmin))::int8 from sa_snapshot) as max_xmin_age_tx, + (select count(*) from sa_snapshot where state = 'active' and backend_type = 'autovacuum worker') as av_workers + gauges: + - '*' + backup_age_pgbackrest: + description: > + This metric retrieves the age of the last successful pgBackRest backup in seconds. It uses the `pgbackrest --output=json info` command to fetch + the backup information and calculates the age based on the current time and the timestamp of the last backup. The metric returns a retcode of 0 + on success, along with the age in seconds and a message indicating the status. + Expects pgBackRest is correctly configured on monitored DB and "jq" tool is installed on the DB server. 
+ sqls: + 11: | + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + retcode, + backup_age_seconds, + message + from + get_backup_age_pgbackrest() + init_sql: |- + CREATE EXTENSION IF NOT EXISTS plpython3u; + + CREATE OR REPLACE FUNCTION get_backup_age_pgbackrest(OUT retcode int, OUT backup_age_seconds int, OUT message text) AS + $$ + import time + import json + import subprocess + + PGBACKREST_TIMEOUT = 30 + + def error(message, returncode=1): + return returncode, 1000000, 'Not OK. '+message + + pgbackrest_cmd=["pgbackrest", "--output=json", "info"] + + try: + p = subprocess.Popen(pgbackrest_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8') + stdout, stderr = p.communicate(timeout=PGBACKREST_TIMEOUT) + except OSError as e: + return error('Failed to execute pgbackrest: {}'.format(e)) + except subprocess.TimeoutExpired: + p.terminate() + try: + p.wait(0.5) + except subprocess.TimeoutExpired: + p.kill() + return error('pgbackrest failed to respond in {} seconds'.format(PGBACKREST_TIMEOUT)) + + if p.returncode != 0: + return error('Failed on "pgbackrest info" call', returncode=p.returncode) + + try: + data = json.loads(stdout) + backup_age_seconds = int(time.time()) - data[0]['backup'][-1]['timestamp']['stop'] + return 0, backup_age_seconds, 'OK. Last backup age in seconds: {}'.format(backup_age_seconds) + except (json.JSONDecodeError, KeyError) : + return error('Failed to parse pgbackrest output') + $$ LANGUAGE plpython3u VOLATILE; + + ALTER FUNCTION get_backup_age_pgbackrest() SET statement_timeout TO '30s'; + + GRANT EXECUTE ON FUNCTION get_backup_age_pgbackrest() TO pgwatch; + + COMMENT ON FUNCTION get_backup_age_pgbackrest() is 'created for pgwatch'; + is_instance_level: true + backup_age_walg: + description: > + Retrieves the age of the last successful WAL-G backup in seconds. 
It uses the `wal-g backup-list --json` command to fetch + the backup information and calculates the age based on the current time and the timestamp of the last backup. + The metric returns a retcode of 0 on success, along with the age in seconds and a message indicating the status. + Expects .wal-g.json is correctly configured with all necessary credentials and "jq" tool is installed on the DB server. + sqls: + 11: | + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + retcode, + backup_age_seconds, + message + from + get_backup_age_walg() + init_sql: |- + CREATE EXTENSION IF NOT EXISTS plpython3u; + + CREATE OR REPLACE FUNCTION get_backup_age_walg(OUT retcode int, OUT backup_age_seconds int, OUT message text) AS + $$ + import subprocess + retcode=1 + backup_age_seconds=1000000 + message='' + + # get latest wal-g backup timestamp + walg_last_backup_cmd="""wal-g backup-list --json | jq -r '.[0].time'""" + p = subprocess.run(walg_last_backup_cmd, stdout=subprocess.PIPE, encoding='utf-8', shell=True) + if p.returncode != 0: + # plpy.notice("p.stdout: " + str(p.stderr) + str(p.stderr)) + return p.returncode, backup_age_seconds, 'Not OK. Failed on wal-g backup-list call' + + # plpy.notice("last_tz: " + last_tz) + last_tz=p.stdout.rstrip('\n\r') + + # get seconds since last backup from WAL-G timestamp in format '2020-01-22T17:50:51Z' + try: + plan = plpy.prepare("SELECT extract(epoch from now() - $1::timestamptz)::int AS backup_age_seconds;", ["text"]) + rv = plpy.execute(plan, [last_tz]) + except Exception as e: + return retcode, backup_age_seconds, 'Not OK. Failed to convert WAL-G backup timestamp to seconds' + else: + backup_age_seconds = rv[0]["backup_age_seconds"] + return 0, backup_age_seconds, 'OK. 
Last backup age in seconds: %s' % backup_age_seconds + + $$ LANGUAGE plpython3u VOLATILE; + + /* contacting S3 could be laggy depending on location */ + ALTER FUNCTION get_backup_age_walg() SET statement_timeout TO '30s'; + + GRANT EXECUTE ON FUNCTION get_backup_age_walg() TO pgwatch; + + COMMENT ON FUNCTION get_backup_age_walg() is 'created for pgwatch'; + is_instance_level: true + bgwriter: + description: > + Retrieves key statistics from the PostgreSQL `pg_stat_bgwriter` view, providing insights into the background writer's performance. + It returns the number of timed and requested checkpoints, checkpoint write and sync times, buffer statistics, and the last reset time. + This metric helps administrators monitor the background writer's activity and its impact on database performance. + sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + checkpoints_timed, + checkpoints_req, + checkpoint_write_time, + checkpoint_sync_time, + buffers_checkpoint, + buffers_clean, + maxwritten_clean, + buffers_backend, + buffers_backend_fsync, + buffers_alloc + from + pg_stat_bgwriter + 17: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + buffers_clean, + maxwritten_clean, + buffers_alloc, + (extract(epoch from now() - stats_reset))::int as last_reset_s + from + pg_stat_bgwriter + node_status: primary + is_instance_level: true + buffercache_by_db: + description: > + Retrieves buffer cache statistics grouped by database, providing insights into the size of buffers used by each database. + It calculates the total size of buffers in bytes for each database. + This metric helps administrators monitor buffer usage across different databases in the PostgreSQL instance. 
+ sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + datname as tag_database, + count(*) * (current_setting('block_size')::int8) as size_b + FROM + pg_buffercache AS b, + pg_database AS d + WHERE + d.oid = b.reldatabase + GROUP BY + datname + gauges: + - '*' + is_instance_level: true + buffercache_by_type: + description: > + Retrieves buffer cache statistics grouped by relation type, providing insights into the size of buffers used + by different relation kinds. It calculates the total size of buffers in bytes for each relation kind + (e.g., Table, Index, Toast, Materialized view). This metric helps administrators monitor buffer usage across + different relation types in the PostgreSQL instance. + sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + CASE + WHEN relkind = 'r' THEN 'Table' -- TODO all relkinds covered? + WHEN relkind = 'i' THEN 'Index' + WHEN relkind = 't' THEN 'Toast' + WHEN relkind = 'm' THEN 'Materialized view' + ELSE 'Other' + END as tag_relkind, + count(*) * (current_setting('block_size')::int8) size_b + FROM + pg_buffercache AS b, + pg_class AS d + WHERE + d.oid = b.relfilenode + GROUP BY + relkind + gauges: + - '*' + is_instance_level: true + change_events: + description: > + The "change_events" built-in metric tracks DDL & config changes. Internally, it uses some other * + _hashes metrics that are not meant to be used independently. Such metrics should not be removed. + sqls: + 11: "" + checkpointer: + description: > + Retrieves key statistics from the PostgreSQL `pg_stat_checkpointer` view, providing insights into the checkpointer's performance. + It returns the number of timed and requested checkpoints, restart points, write and sync times, and buffer statistics. + This metric helps administrators monitor the checkpointer's activity and its impact on database performance. 
+ sqls: + 11: "; -- covered by bgwriter" + 17: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + num_timed, + num_requested, + restartpoints_timed, + restartpoints_req, + restartpoints_done, + write_time, + sync_time, + buffers_written, + (extract(epoch from now() - stats_reset))::int as last_reset_s + from + pg_stat_checkpointer + configuration_hashes: + description: > + Retrieves configuration settings from the PostgreSQL `pg_settings` view, providing insights into the current configuration of the database. + This metric helps administrators monitor changes applied to the database configuration. + sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + name as tag_setting, + coalesce(reset_val, '') as value + from + pg_settings + where + name <> 'connection_ID' + cpu_load: + description: > + Retrieves the system load average for the last 1, 5, and 15 minutes using a custom PL/Python function. + This metric provides insights into the CPU load on the PostgreSQL server, helping administrators monitor system performance. + The function uses the `os.getloadavg()` method to fetch the load averages. 
+ sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + round(load_1min::numeric, 2)::float as load_1min, + round(load_5min::numeric, 2)::float as load_5min, + round(load_15min::numeric, 2)::float as load_15min + from + get_load_average(); + init_sql: |- + CREATE EXTENSION IF NOT EXISTS plpython3u; + CREATE OR REPLACE FUNCTION get_load_average(OUT load_1min float, OUT load_5min float, OUT load_15min float) AS + $$ + from os import getloadavg + la = getloadavg() + return [la[0], la[1], la[2]] + $$ LANGUAGE plpython3u VOLATILE; + GRANT EXECUTE ON FUNCTION get_load_average() TO pgwatch; + COMMENT ON FUNCTION get_load_average() is 'created for pgwatch'; + gauges: + - '*' + is_instance_level: true + database_conflicts: + description: > + Retrieves conflict statistics from the PostgreSQL `pg_stat_database_conflicts` view, providing insights into conflicts that have occurred + in the current database. It returns the number of conflicts related to tablespace, lock, snapshot, buffer pin, and deadlock. + This metric helps administrators monitor and diagnose issues related to database conflicts. + sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + confl_tablespace, + confl_lock, + confl_snapshot, + confl_bufferpin, + confl_deadlock + FROM + pg_stat_database_conflicts + WHERE + datname = current_database() + node_status: standby + datfrozenxid: + description: > + This metric tracks transaction ID and multixact ID ages to monitor wraparound risk. It retrieves the age + of the oldest datfrozenxid and datminmxid from pg_database for the current database, helping administrators + monitor and prevent transaction ID wraparound which can cause database shutdowns. 
+ sqls: + 9.3: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + age(datfrozenxid) as datfrozenxid_age, + mxid_age(datminmxid) as datminmxid_age + from + pg_database + where + datname = current_database() + gauges: + - datfrozenxid_age + - datminmxid_age + db_size: + description: > + Retrieves the size of the current database and the size of the `pg_catalog` schema, providing insights into the storage usage of the database. + It returns the size in bytes for both the current database and the catalog schema. + This metric helps administrators monitor database size and storage consumption. + sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + pg_database_size(current_database()) as size_b, + (select sum(pg_total_relation_size(c.oid))::int8 + from pg_class c join pg_namespace n on n.oid = c.relnamespace + where nspname = 'pg_catalog' and relkind = 'r' + ) as catalog_size_b + gauges: + - '*' + statement_timeout_seconds: 300 + db_size_approx: + description: > + Retrieves an approximate size of the current database and the size of the `pg_catalog` schema, providing insights into the storage usage of the database. + It returns the size in bytes for both the current database and the catalog schema. + This metric helps administrators monitor database size and storage consumption. 
+ sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + current_setting('block_size')::int8 * ( + select sum(relpages) from pg_class c + join pg_namespace n on n.oid = c.relnamespace + where c.relpersistence != 't' + ) as size_b, + current_setting('block_size')::int8 * ( + select sum(c.relpages + coalesce(ct.relpages, 0) + coalesce(cti.relpages, 0)) + from pg_class c + join pg_namespace n on n.oid = c.relnamespace + left join pg_class ct on ct.oid = c.reltoastrelid + left join pg_index ti on ti.indrelid = ct.oid + left join pg_class cti on cti.oid = ti.indexrelid + where nspname = 'pg_catalog' + and (c.relkind = 'r' + or c.relkind = 'i' and not c.relname ~ '^pg_toast') + ) as catalog_size_b + gauges: + - '*' + metric_storage_name: db_size + db_stats: + description: > + Retrieves key statistics from the PostgreSQL `pg_stat_database` view, providing insights into the current database's performance. + It returns the number of backends, transaction commits and rollbacks, buffer reads and hits, tuple statistics, conflicts, temporary files and bytes, + deadlocks, block read and write times, postmaster uptime, backup duration, recovery status, system identifier, and invalid indexes. + This metric helps administrators monitor database activity and performance. 
+ sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + numbackends, + xact_commit, + xact_rollback, + blks_read, + blks_hit, + tup_returned, + tup_fetched, + tup_inserted, + tup_updated, + tup_deleted, + conflicts, + temp_files, + temp_bytes, + deadlocks, + blk_read_time, + blk_write_time, + extract(epoch from (now() - pg_postmaster_start_time()))::int8 as postmaster_uptime_s, + extract(epoch from (now() - pg_backup_start_time()))::int8 as backup_duration_s, + case when pg_is_in_recovery() then 1 else 0 end as in_recovery_int, + system_identifier::text as tag_sys_id, + (select count(*) from pg_index i + where not indisvalid + and not exists ( /* leave out ones that are being actively rebuilt */ + select * from pg_locks l + join pg_stat_activity a using (pid) + where l.relation = i.indexrelid + and a.state = 'active' + and a.query ~* 'concurrently' + )) as invalid_indexes + from + pg_stat_database, pg_control_system() + where + datname = current_database() + 12: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + numbackends, + xact_commit, + xact_rollback, + blks_read, + blks_hit, + tup_returned, + tup_fetched, + tup_inserted, + tup_updated, + tup_deleted, + conflicts, + temp_files, + temp_bytes, + deadlocks, + blk_read_time, + blk_write_time, + extract(epoch from (now() - pg_postmaster_start_time()))::int8 as postmaster_uptime_s, + extract(epoch from (now() - pg_backup_start_time()))::int8 as backup_duration_s, + checksum_failures, + extract(epoch from (now() - checksum_last_failure))::int8 as checksum_last_failure_s, + case when pg_is_in_recovery() then 1 else 0 end as in_recovery_int, + system_identifier::text as tag_sys_id, + (select count(*) from pg_index i + where not indisvalid + and not exists ( /* leave out ones that are being actively rebuilt */ + select * from pg_locks l + join pg_stat_activity a using (pid) + where l.relation = i.indexrelid + and a.state = 
'active' + and a.query ~* 'concurrently' + )) as invalid_indexes + from + pg_stat_database, pg_control_system() + where + datname = current_database() + 14: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + numbackends, + xact_commit, + xact_rollback, + blks_read, + blks_hit, + tup_returned, + tup_fetched, + tup_inserted, + tup_updated, + tup_deleted, + conflicts, + temp_files, + temp_bytes, + deadlocks, + blk_read_time, + blk_write_time, + extract(epoch from (now() - pg_postmaster_start_time()))::int8 as postmaster_uptime_s, + extract(epoch from (now() - pg_backup_start_time()))::int8 as backup_duration_s, + checksum_failures, + extract(epoch from (now() - checksum_last_failure))::int8 as checksum_last_failure_s, + case when pg_is_in_recovery() then 1 else 0 end as in_recovery_int, + system_identifier::text as tag_sys_id, + session_time::int8, + active_time::int8, + idle_in_transaction_time::int8, + sessions, + sessions_abandoned, + sessions_fatal, + sessions_killed, + (select count(*) from pg_index i + where not indisvalid + and not exists ( /* leave out ones that are being actively rebuilt */ + select * from pg_locks l + join pg_stat_activity a using (pid) + where l.relation = i.indexrelid + and a.state = 'active' + and a.query ~* 'concurrently' + )) as invalid_indexes + from + pg_stat_database, pg_control_system() + where + datname = current_database() + 15: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + numbackends, + xact_commit, + xact_rollback, + blks_read, + blks_hit, + tup_returned, + tup_fetched, + tup_inserted, + tup_updated, + tup_deleted, + conflicts, + temp_files, + temp_bytes, + deadlocks, + blk_read_time, + blk_write_time, + extract(epoch from (now() - pg_postmaster_start_time()))::int8 as postmaster_uptime_s, + checksum_failures, + extract(epoch from (now() - checksum_last_failure))::int8 as checksum_last_failure_s, + case when pg_is_in_recovery() then 1 else 
0 end as in_recovery_int, + system_identifier::text as tag_sys_id, + session_time::int8, + active_time::int8, + idle_in_transaction_time::int8, + sessions, + sessions_abandoned, + sessions_fatal, + sessions_killed, + (select count(*) from pg_index i + where not indisvalid + and not exists ( /* leave out ones that are being actively rebuilt */ + select * from pg_locks l + join pg_stat_activity a using (pid) + where l.relation = i.indexrelid + and a.state = 'active' + and a.query ~* 'concurrently' + )) as invalid_indexes + from + pg_stat_database, pg_control_system() + where + datname = current_database() + gauges: + - numbackends + - postmaster_uptime_s + - backup_duration_s + - checksum_last_failure_s + db_stats_aurora: + description: > + Retrieves key statistics from the PostgreSQL `pg_stat_database` view for Amazon Aurora PostgreSQL, providing insights into the current database's performance. + It returns the number of backends, transaction commits and rollbacks, buffer reads and hits, tuple statistics, conflicts, temporary files and bytes, + deadlocks, block read and write times, postmaster uptime, recovery status, system identifier, and invalid indexes. + This metric helps administrators monitor database activity and performance in an Aurora PostgreSQL environment. 
+ sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + numbackends, + xact_commit, + xact_rollback, + blks_read, + blks_hit, + tup_returned, + tup_fetched, + tup_inserted, + tup_updated, + tup_deleted, + conflicts, + temp_files, + temp_bytes, + deadlocks, + blk_read_time, + blk_write_time, + extract(epoch from (now() - pg_postmaster_start_time()))::int8 as postmaster_uptime_s, + case when pg_is_in_recovery() then 1 else 0 end as in_recovery_int, + system_identifier::text as tag_sys_id + from + pg_stat_database, pg_control_system() + where + datname = current_database() + gauges: + - numbackends + - postmaster_uptime_s + - backup_duration_s + - checksum_last_failure_s + metric_storage_name: db_stats + index_hashes: + description: > + Retrieves the hash of index definitions in the PostgreSQL database, providing a way to track changes in index definitions over time. + This metric helps administrators monitor index changes and ensure consistency in index definitions. + sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + quote_ident(nspname)||'.'||quote_ident(c.relname) as tag_index, + quote_ident(nspname)||'.'||quote_ident(r.relname) as "table", + i.indisvalid::text as is_valid, + coalesce(md5(pg_get_indexdef(i.indexrelid)), random()::text) as md5 + from + pg_index i + join + pg_class c on c.oid = i.indexrelid + join + pg_class r on r.oid = i.indrelid + join + pg_namespace n on n.oid = c.relnamespace + where + c.relnamespace not in (select oid from pg_namespace where nspname like any(array[E'pg\\_%', 'information_schema'])) + index_stats: + description: > + Retrieves detailed statistics about indexes in the PostgreSQL database, including index size, scan counts, tuple read and fetch counts, + block read and hit counts, and index validity. It also identifies the largest, most scanned, and unused indexes. 
+ This metric helps administrators monitor index performance and identify potential issues with unused or invalid indexes. + sqls: + 11: |- + /* does not return all index stats but biggest, top scanned and biggest unused ones */ + WITH q_locked_rels AS ( + select relation from pg_locks where mode = 'AccessExclusiveLock' + ), + q_index_details AS ( + select + sui.schemaname, + sui.indexrelname, + sui.relname, + sui.indexrelid, + coalesce(pg_relation_size(sui.indexrelid), 0) as index_size_b, + sui.idx_scan, + sui.idx_tup_read, + sui.idx_tup_fetch, + io.idx_blks_read, + io.idx_blks_hit, + i.indisvalid, + i.indisprimary, + i.indisunique, + i.indisexclusion + from + pg_stat_user_indexes sui + join pg_statio_user_indexes io on io.indexrelid = sui.indexrelid + join pg_index i on i.indexrelid = sui.indexrelid + where not sui.schemaname like any (array [E'pg\\_temp%', E'\\_timescaledb%']) + and not exists (select * from q_locked_rels where relation = sui.relid or relation = sui.indexrelid) + ), + q_top_indexes AS ( + /* biggest */ + select * + from ( + select indexrelid + from q_index_details + where idx_scan > 1 + order by index_size_b desc + limit 200 + ) x + union + /* most block traffic */ + select * + from ( + select indexrelid + from q_index_details + order by coalesce(idx_blks_read, 0) + coalesce(idx_blks_hit, 0) desc + limit 200 + ) y + union + /* most scans */ + select * + from ( + select indexrelid + from q_index_details + order by idx_scan desc nulls last + limit 200 + ) z + union + /* biggest unused non-constraint */ + select * + from ( + select q.indexrelid + from q_index_details q + where idx_scan = 0 + and not (indisprimary or indisunique or indisexclusion) + order by index_size_b desc + limit 200 + ) z + union + /* all invalid */ + select * + from ( + select q.indexrelid + from q_index_details q + where not indisvalid + ) zz + ) + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + schemaname::text as tag_schema, + 
indexrelname::text as tag_index_name, + quote_ident(schemaname)||'.'||quote_ident(indexrelname) as tag_index_full_name, + relname::text as tag_table_name, + quote_ident(schemaname)||'.'||quote_ident(relname) as tag_table_full_name, + coalesce(idx_scan, 0) as idx_scan, + coalesce(idx_tup_read, 0) as idx_tup_read, + coalesce(idx_tup_fetch, 0) as idx_tup_fetch, + coalesce(index_size_b, 0) as index_size_b, + quote_ident(schemaname)||'.'||quote_ident(indexrelname) as index_full_name_val, + md5(regexp_replace(regexp_replace(pg_get_indexdef(indexrelid),indexrelname,'X'), '^CREATE UNIQUE','CREATE')) as tag_index_def_hash, + regexp_replace(regexp_replace(pg_get_indexdef(indexrelid),indexrelname,'X'), '^CREATE UNIQUE','CREATE') as index_def, + case when not indisvalid then 1 else 0 end as is_invalid_int, + case when indisprimary then 1 else 0 end as is_pk_int, + case when indisunique or indisexclusion then 1 else 0 end as is_uq_or_exc, + system_identifier::text as tag_sys_id + FROM + q_index_details id + JOIN + pg_control_system() ON true + WHERE + indexrelid IN (select indexrelid from q_top_indexes) + ORDER BY + id.schemaname, id.relname, id.indexrelname + 16: |- + /* NB! 
does not return all index stats but biggest, top scanned and biggest unused ones */ + WITH q_locked_rels AS ( /* pgwatch_generated */ + select relation from pg_locks where mode = 'AccessExclusiveLock' + ), + q_index_details AS ( + select + sui.schemaname, + sui.indexrelname, + sui.relname, + sui.indexrelid, + coalesce(pg_relation_size(sui.indexrelid), 0) as index_size_b, + sui.idx_scan, + sui.idx_tup_read, + sui.idx_tup_fetch, + io.idx_blks_read, + io.idx_blks_hit, + i.indisvalid, + i.indisprimary, + i.indisunique, + i.indisexclusion, + extract(epoch from now() - last_idx_scan)::int as last_idx_scan_s + from + pg_stat_user_indexes sui + join pg_statio_user_indexes io on io.indexrelid = sui.indexrelid + join pg_index i on i.indexrelid = sui.indexrelid + where not sui.schemaname like any (array [E'pg\\_temp%', E'\\_timescaledb%']) + and not exists (select * from q_locked_rels where relation = sui.relid or relation = sui.indexrelid) + ), + q_top_indexes AS ( + /* biggest */ + select * + from ( + select indexrelid + from q_index_details + where idx_scan > 1 + order by index_size_b desc + limit 200 + ) x + union + /* most block traffic */ + select * + from ( + select indexrelid + from q_index_details + order by coalesce(idx_blks_read, 0) + coalesce(idx_blks_hit, 0) desc + limit 200 + ) y + union + /* most scans */ + select * + from ( + select indexrelid + from q_index_details + order by idx_scan desc nulls last + limit 200 + ) z + union + /* biggest unused non-constraint */ + select * + from ( + select q.indexrelid + from q_index_details q + where idx_scan = 0 + and not (indisprimary or indisunique or indisexclusion) + order by index_size_b desc + limit 200 + ) z + union + /* all invalid */ + select * + from ( + select q.indexrelid + from q_index_details q + where not indisvalid + ) zz + ) + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + schemaname::text as tag_schema, + indexrelname::text as tag_index_name, + 
quote_ident(schemaname)||'.'||quote_ident(indexrelname) as tag_index_full_name, + relname::text as tag_table_name, + quote_ident(schemaname)||'.'||quote_ident(relname) as tag_table_full_name, + coalesce(idx_scan, 0) as idx_scan, + coalesce(idx_tup_read, 0) as idx_tup_read, + coalesce(idx_tup_fetch, 0) as idx_tup_fetch, + coalesce(index_size_b, 0) as index_size_b, + quote_ident(schemaname)||'.'||quote_ident(indexrelname) as index_full_name_val, + md5(regexp_replace(regexp_replace(pg_get_indexdef(indexrelid),indexrelname,'X'), '^CREATE UNIQUE','CREATE')) as tag_index_def_hash, + regexp_replace(regexp_replace(pg_get_indexdef(indexrelid),indexrelname,'X'), '^CREATE UNIQUE','CREATE') as index_def, + case when not indisvalid then 1 else 0 end as is_invalid_int, + case when indisprimary then 1 else 0 end as is_pk_int, + case when indisunique or indisexclusion then 1 else 0 end as is_uq_or_exc, + system_identifier::text as tag_sys_id, + last_idx_scan_s + FROM + q_index_details id + JOIN + pg_control_system() ON true + WHERE + indexrelid IN (select indexrelid from q_top_indexes) + ORDER BY + id.schemaname, id.relname, id.indexrelname + instance_up: + description: > + This metric has some special handling attached to it - it will store a 0 value if the database is not accessible. + Thus it can be used to for example calculate some percentual "uptime" indicator. + For standard metrics there will be no data rows stored when the DB is not reachable, but for this one, + there will be a zero stored for the "is_up" column that, under normal operations, would always be 1. + sqls: + 11: | + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + 1::int as is_up + invalid_indexes: + description: > + Retrieves a list of invalid indexes in the PostgreSQL database, providing insights into indexes that are not valid. + It returns the schema-qualified index name and the index size in bytes. 
This metric helps administrators identify and address issues with invalid indexes. + sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + format('%I.%I', n.nspname , ci.relname) as tag_index_full_name, + coalesce(pg_relation_size(indexrelid), 0) as index_size_b + from + pg_index i + join pg_class ci on ci.oid = i.indexrelid + join pg_class cr on cr.oid = i.indrelid + join pg_namespace n on n.oid = ci.relnamespace + where not n.nspname like E'pg\\_temp%' + and not indisvalid + and not exists ( /* leave out ones that are being actively rebuilt */ + select * from pg_locks l + join pg_stat_activity a using (pid) + where l.relation = i.indexrelid + and a.state = 'active' + and a.query ~* 'concurrently' + ) + and not exists (select * from pg_locks where relation = indexrelid and mode = 'AccessExclusiveLock') /* can't get size then */ + order by index_size_b desc + limit 100 + kpi: + description: > + Retrieves key performance indicators (KPIs) from the PostgreSQL `pg_stat_database` view, providing insights into the current database's performance. + It returns the number of backends, active and blocked backends, oldest transaction age, transactions per second (TPS), commit and rollback counts, + buffer read and hit counts, temporary bytes, sequence scans on tables larger than 10MB, tuple statistics, stored procedure calls, + block read and write times, deadlocks, recovery status, and postmaster uptime. + This metric helps administrators monitor database activity and performance. 
+ sqls: + 11: | + WITH q_stat_tables AS ( + SELECT * FROM pg_stat_user_tables t + JOIN pg_class c ON c.oid = t.relid + WHERE NOT schemaname LIKE E'pg\\_temp%' + AND c.relpages > (1e7 / 8) -- >10MB + ), + q_stat_activity AS ( + SELECT * FROM pg_stat_activity + WHERE datname = current_database() AND pid != pg_backend_pid() + ) + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + case + when pg_is_in_recovery() = false then + pg_wal_lsn_diff(pg_current_wal_lsn(), '0/0')::int8 + else + pg_wal_lsn_diff(pg_last_wal_replay_lsn(), '0/0')::int8 + end as wal_location_b, + numbackends - 1 as numbackends, + (select count(*) from q_stat_activity where state in ('active', 'idle in transaction')) AS active_backends, + (select count(*) from q_stat_activity where wait_event_type in ('LWLock', 'Lock', 'BufferPin')) AS blocked_backends, + (select round(extract(epoch from now()) - extract(epoch from (select xact_start from q_stat_activity + where datid = d.datid and not query like 'autovacuum:%' order by xact_start limit 1))))::int AS kpi_oldest_tx_s, + xact_commit + xact_rollback AS tps, + xact_commit, + xact_rollback, + blks_read, + blks_hit, + temp_bytes, + (select sum(seq_scan) from q_stat_tables)::int8 AS seq_scans_on_tbls_gt_10mb, + tup_inserted, + tup_updated, + tup_deleted, + (select sum(calls) from pg_stat_user_functions where not schemaname like any(array[E'pg\\_%', 'information_schema']))::int8 AS sproc_calls, + blk_read_time, + blk_write_time, + deadlocks, + case when pg_is_in_recovery() then 1 else 0 end as in_recovery_int, + extract(epoch from (now() - pg_postmaster_start_time()))::int8 as postmaster_uptime_s + FROM + pg_stat_database d + WHERE + datname = current_database() + gauges: + - numbackends + - active_backends + - blocked_backends + - kpi_oldest_tx_s + locks: + description: > + Retrieves lock statistics from the PostgreSQL `pg_locks` view, providing insights into the types and modes of locks currently held in the database. 
+ It returns the lock type, lock mode, and the count of locks for each type and mode. This metric helps administrators monitor lock contention and performance. + sqls: + 11: |- + WITH q_locks AS ( + select + * + from + pg_locks + where + pid != pg_backend_pid() + and database = (select oid from pg_database where datname = current_database()) + ) + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + locktypes AS tag_locktype, + coalesce((select count(*) FROM q_locks WHERE locktype = locktypes), 0) AS count + FROM + unnest('{relation, extend, page, tuple, transactionid, virtualxid, object, userlock, advisory}'::text[]) locktypes + gauges: + - '*' + locks_mode: + description: > + Retrieves lock mode statistics from the PostgreSQL `pg_locks` view, providing insights into the different lock modes currently held in the database. + It returns the lock mode and the count of locks for each mode. This metric helps administrators monitor lock contention and performance. + sqls: + 11: |- + WITH q_locks AS ( + select + * + from + pg_locks + where + pid != pg_backend_pid() + and database = (select oid from pg_database where datname = current_database()) + ) + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + lockmodes AS tag_lockmode, + coalesce((select count(*) FROM q_locks WHERE mode = lockmodes), 0) AS count + FROM + unnest('{AccessShareLock, ExclusiveLock, RowShareLock, RowExclusiveLock, ShareLock, ShareRowExclusiveLock, AccessExclusiveLock, ShareUpdateExclusiveLock}'::text[]) lockmodes + gauges: + - '*' + logical_subscriptions: + description: > + Retrieves information about logical subscriptions in the PostgreSQL database, including their names, enabled status, and the number of relations in each subscription. + It also provides counts of relations in different states (inserted, deleted, synchronized, and replicated). 
+ This metric helps administrators monitor logical replication subscriptions and their statuses. + sqls: + 11: | + with q_sr as ( + select * from pg_subscription_rel + ) + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + subname::text as tag_subname, + subenabled, + (select count(*) from q_sr where srsubid = oid) as relcount, + (select count(*) from q_sr where srsubid = oid and srsubstate = 'i') as state_i, + (select count(*) from q_sr where srsubid = oid and srsubstate = 'd') as state_d, + (select count(*) from q_sr where srsubid = oid and srsubstate = 's') as state_s, + (select count(*) from q_sr where srsubid = oid and srsubstate = 'r') as state_r + from + pg_subscription + where + subdbid = (select oid from pg_database where datname = current_database()) + gauges: + - '*' + pgbouncer_stats: + description: > + Retrieves statistics from the PgBouncer connection pooler. + This metric helps administrators monitor PgBouncer performance and connection pooling efficiency. + sqls: + 0: show stats + pgbouncer_clients: + description: > + Retrieves client connection statistics from the PgBouncer connection pooler, providing insights into the current state of client connections. + It returns the number of active, idle, and total client connections, as well as transaction counts and memory usage statistics. + This metric helps administrators monitor PgBouncer client connections and performance. + sqls: + 0: show clients + pgpool_processes: + description: > + Retrieves process statistics from the PgPool connection pooler, providing insights into the current state of PgPool processes. + It returns the number of active, idle, and total processes, as well as memory usage statistics. + This metric helps administrators monitor PgPool process performance and resource utilization. 
+ sqls: + 3: show pool_processes + pgpool_stats: + description: > + Retrieves statistics from the PgPool connection pooler, providing insights into the current state of PgPool connections and transactions. + It returns the number of active, idle, and total connections, as well as transaction counts and memory usage statistics. + This metric helps administrators monitor PgPool performance and connection pooling efficiency. + sqls: + 3: show pool_nodes + postgres_role: + description: > + This metric determines the PostgreSQL server role (primary, standby, or standalone) by checking + if the server is in recovery mode and if it has any active replication connections. It returns + an integer value: 0 = standalone, 1 = primary with replicas, 2 = standby/replica. + sqls: + 9.0: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + case pg_is_in_recovery() + when 't' then 2 + else (select case (select count(*) from pg_stat_replication where application_name != 'pg_basebackup') when '0' then 0 else 1 end) + end as in_recovery_int + gauges: + - in_recovery_int + is_instance_level: true + privilege_changes: + description: > + Retrieves information about privileges granted to roles on various database objects, including tables, functions, schemas, and databases. + It returns the object type, role name, object name, and privilege type for each privilege granted. + This metric helps administrators monitor and manage database access control and privileges. + sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch FROM now()) * 1e9)::int8 AS epoch_ns, + * + FROM ( + SELECT + 'table'::text AS object_type, + grantee::text AS tag_role, + quote_ident(table_schema) || '.' 
|| quote_ident(table_name) AS tag_object, + privilege_type + FROM + information_schema.table_privileges + /* includes also VIEW-s actually */ + WHERE + NOT grantee = ANY ( + SELECT + rolname + FROM + pg_roles + WHERE + rolsuper + OR oid < 16384) + AND NOT table_schema IN ('information_schema', 'pg_catalog') + /* + union all + + select + -- quite a heavy query currently, maybe faster directly via pg_attribute + has_column_privilege? + 'column' AS object_type, + grantee::text AS tag_role, + quote_ident(table_schema) || '.' || quote_ident(table_name) AS tag_object, + privilege_type + FROM + information_schema.column_privileges cp + WHERE + NOT table_schema IN ('pg_catalog', 'information_schema') + AND NOT grantee = ANY ( + SELECT + rolname + FROM + pg_roles + WHERE + rolsuper + OR oid < 16384) + AND NOT EXISTS ( + SELECT + * + FROM + information_schema.table_privileges + WHERE + table_schema = cp.table_schema + AND table_name = cp.table_name + AND grantee = cp.grantee + AND privilege_type = cp.privilege_type) */ + UNION ALL + SELECT + 'function' AS object_type, + grantee::text AS tag_role, + quote_ident(routine_schema) || '.' 
|| quote_ident(routine_name) AS tag_object, + privilege_type + FROM + information_schema.routine_privileges + WHERE + NOT routine_schema IN ('information_schema', 'pg_catalog') + AND NOT grantee = ANY ( + SELECT + rolname + FROM + pg_roles + WHERE + rolsuper + OR oid < 16384) + UNION ALL + SELECT + 'schema' AS object_type, + r.rolname::text AS tag_role, + quote_ident(n.nspname) AS tag_object, + p.perm AS privilege_type + FROM + pg_catalog.pg_namespace AS n + CROSS JOIN pg_catalog.pg_roles AS r + CROSS JOIN ( + VALUES ('USAGE'), + ('CREATE')) AS p (perm) + WHERE + NOT n.nspname IN ('information_schema', 'pg_catalog') + AND n.nspname NOT LIKE 'pg_%' + AND NOT r.rolsuper + AND r.oid >= 16384 + AND has_schema_privilege(r.oid, n.oid, p.perm) + UNION ALL + SELECT + 'database' AS object_type, + r.rolname::text AS role_name, + quote_ident(datname) AS tag_object, + p.perm AS permission + FROM + pg_catalog.pg_database AS d + CROSS JOIN pg_catalog.pg_roles AS r + CROSS JOIN ( + VALUES ('CREATE'), + ('CONNECT'), + ('TEMPORARY')) AS p (perm) + WHERE + d.datname = current_database() + AND NOT r.rolsuper + AND r.oid >= 16384 + AND has_database_privilege(r.oid, d.oid, p.perm) + UNION ALL + SELECT + 'superusers' AS object_type, + rolname::text AS role_name, + rolname::text AS tag_object, + 'SUPERUSER' AS permission + FROM + pg_catalog.pg_roles + WHERE + rolsuper + UNION ALL + SELECT + 'login_users' AS object_type, + rolname::text AS role_name, + rolname::text AS tag_object, + 'LOGIN' AS permission + FROM + pg_catalog.pg_roles + WHERE + rolcanlogin) y + psutil_cpu: + description: > + This metric requires the "psutil" Python package to be installed on the PostgreSQL server. + It provides CPU utilization and load averages using the "psutil" library. + "psutil" is known to behave differently depending on the used version and operating system, so if getting + errors please adjust to your needs. 
"psutil" documentation here: https://psutil.readthedocs.io/en/latest/ + sqls: + 11: | + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + round(cpu_utilization::numeric, 2)::float as cpu_utilization, + round(load_1m_norm::numeric, 2)::float as load_1m_norm, + round(load_1m::numeric, 2)::float as load_1m, + round(load_5m_norm::numeric, 2)::float as load_5m_norm, + round(load_5m::numeric, 2)::float as load_5m, + round("user"::numeric, 2)::float as "user", + round(system::numeric, 2)::float as system, + round(idle::numeric, 2)::float as idle, + round(iowait::numeric, 2)::float as iowait, + round(irqs::numeric, 2)::float as irqs, + round(other::numeric, 2)::float as other + from + get_psutil_cpu() + init_sql: | + CREATE EXTENSION IF NOT EXISTS plpython3u; + + CREATE OR REPLACE FUNCTION get_psutil_cpu( + OUT cpu_utilization float8, OUT load_1m_norm float8, OUT load_1m float8, OUT load_5m_norm float8, OUT load_5m float8, + OUT "user" float8, OUT system float8, OUT idle float8, OUT iowait float8, OUT irqs float8, OUT other float8 + ) + LANGUAGE plpython3u + AS $FUNCTION$ + + from os import getloadavg + from psutil import cpu_times_percent, cpu_percent, cpu_count + from threading import Thread + + class GetCpuPercentThread(Thread): + def __init__(self, interval_seconds): + self.interval_seconds = interval_seconds + self.cpu_utilization_info = None + super(GetCpuPercentThread, self).__init__() + + def run(self): + self.cpu_utilization_info = cpu_percent(self.interval_seconds) + + t = GetCpuPercentThread(0.5) + t.start() + + ct = cpu_times_percent(0.5) + la = getloadavg() + + t.join() + + return t.cpu_utilization_info, la[0] / cpu_count(), la[0], la[1] / cpu_count(), la[1], ct.user, ct.system, ct.idle, ct.iowait, ct.irq + ct.softirq, ct.steal + ct.guest + ct.guest_nice + + $FUNCTION$; + + GRANT EXECUTE ON FUNCTION get_psutil_cpu() TO pgwatch; + COMMENT ON FUNCTION get_psutil_cpu() IS 'created for pgwatch'; + gauges: 
+ - '*' + is_instance_level: true + psutil_disk: + description: > + This metric requires the "psutil" Python package to be installed on the PostgreSQL server. + It provides disk usage statistics using the "psutil" library. + "psutil" is known to behave differently depending on the used version and operating system, so if getting + errors please adjust to your needs. "psutil" documentation here: https://psutil.readthedocs.io/en/latest/ + sqls: + 11: | + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + dir_or_tablespace as tag_dir_or_tablespace, + path as tag_path, + total, used, free, percent + from + get_psutil_disk() + init_sql: |- + CREATE EXTENSION IF NOT EXISTS plpython3u; + + CREATE OR REPLACE FUNCTION get_psutil_disk( + OUT dir_or_tablespace text, OUT path text, OUT total float8, OUT used float8, OUT free float8, OUT percent float8 + ) + RETURNS SETOF record + LANGUAGE plpython3u + SECURITY DEFINER + AS $FUNCTION$ + + from os import stat + from os.path import join, exists + from psutil import disk_usage + ret_list = [] + + # data_directory + r = plpy.execute("select current_setting('data_directory') as dd, current_setting('log_directory') as ld, current_setting('server_version_num')::int as pgver") + dd = r[0]['dd'] + ld = r[0]['ld'] + du_dd = disk_usage(dd) + ret_list.append(['data_directory', dd, du_dd.total, du_dd.used, du_dd.free, du_dd.percent]) + + dd_stat = stat(dd) + # log_directory + if ld: + if not ld.startswith('/'): + ld_path = join(dd, ld) + else: + ld_path = ld + if exists(ld_path): + log_stat = stat(ld_path) + if log_stat.st_dev == dd_stat.st_dev: + pass # no new info, same device + else: + du = disk_usage(ld_path) + ret_list.append(['log_directory', ld_path, du.total, du.used, du.free, du.percent]) + + # WAL / XLOG directory + # plpy.notice('pg_wal' if r[0]['pgver'] >= 100000 else 'pg_xlog', r[0]['pgver']) + joined_path_wal = join(r[0]['dd'], 'pg_wal' if r[0]['pgver'] >= 100000 else 
'pg_xlog') + wal_stat = stat(joined_path_wal) + if wal_stat.st_dev == dd_stat.st_dev: + pass # no new info, same device + else: + du = disk_usage(joined_path_wal) + ret_list.append(['pg_wal', joined_path_wal, du.total, du.used, du.free, du.percent]) + + # add user created tablespaces if any + sql_tablespaces = """ + select spcname as name, pg_catalog.pg_tablespace_location(oid) as location + from pg_catalog.pg_tablespace where not spcname like any(array[E'pg\\_%'])""" + for row in plpy.cursor(sql_tablespaces): + du = disk_usage(row['location']) + ret_list.append([row['name'], row['location'], du.total, du.used, du.free, du.percent]) + return ret_list + + $FUNCTION$; + + GRANT EXECUTE ON FUNCTION get_psutil_disk() TO pgwatch; + COMMENT ON FUNCTION get_psutil_disk() IS 'created for pgwatch'; + gauges: + - '*' + is_instance_level: true + psutil_disk_io_total: + description: > + This metric requires the "psutil" Python package to be installed on the PostgreSQL server. + It provides total disk I/O statistics using the "psutil" library. + "psutil" is known to behave differently depending on the used version and operating system, so if getting + errors please adjust to your needs. 
"psutil" documentation here: https://psutil.readthedocs.io/en/latest/ + sqls: + 11: | + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + read_count, + write_count, + read_bytes, + write_bytes + from + get_psutil_disk_io_total() + init_sql: |- + CREATE EXTENSION IF NOT EXISTS plpython3u; + + CREATE OR REPLACE FUNCTION get_psutil_disk_io_total( + OUT read_count float8, OUT write_count float8, OUT read_bytes float8, OUT write_bytes float8 + ) + LANGUAGE plpython3u + AS $FUNCTION$ + from psutil import disk_io_counters + dc = disk_io_counters(perdisk=False) + if dc: + return dc.read_count, dc.write_count, dc.read_bytes, dc.write_bytes + else: + return None, None, None, None + $FUNCTION$; + + GRANT EXECUTE ON FUNCTION get_psutil_disk_io_total() TO pgwatch; + COMMENT ON FUNCTION get_psutil_disk_io_total() IS 'created for pgwatch'; + is_instance_level: true + psutil_mem: + description: > + This metric requires the "psutil" Python package to be installed on the PostgreSQL server. + It provides memory usage statistics using the "psutil" library. + "psutil" is known to behave differently depending on the used version and operating system, so if getting + errors please adjust to your needs. 
"psutil" documentation here: https://psutil.readthedocs.io/en/latest/ + sqls: + 11: | + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + total, used, free, buff_cache, available, percent, + swap_total, swap_used, swap_free, swap_percent + from + get_psutil_mem() + init_sql: |- + CREATE EXTENSION IF NOT EXISTS plpython3u; /* "plpython3u" might need changing to "plpythonu" (Python 2 everywhere for new OS-es */ + + CREATE OR REPLACE FUNCTION get_psutil_mem( + OUT total float8, OUT used float8, OUT free float8, OUT buff_cache float8, OUT available float8, OUT percent float8, + OUT swap_total float8, OUT swap_used float8, OUT swap_free float8, OUT swap_percent float8 + ) + LANGUAGE plpython3u + AS $FUNCTION$ + from psutil import virtual_memory, swap_memory + vm = virtual_memory() + sw = swap_memory() + return vm.total, vm.used, vm.free, vm.buffers + vm.cached, vm.available, vm.percent, sw.total, sw.used, sw.free, sw.percent + $FUNCTION$; + + GRANT EXECUTE ON FUNCTION get_psutil_mem() TO pgwatch; + COMMENT ON FUNCTION get_psutil_mem() IS 'created for pgwatch'; + gauges: + - '*' + is_instance_level: true + reco_add_index: + description: > + Retrieves recommendations for creating indexes based on the `pg_qualstats_index_advisor()` function. + It provides insights into potential index creation opportunities to improve query performance. + This metric helps administrators optimize database performance by suggesting index creation. + sqls: + 11: |- + select /* pgwatch_generated */ + epoch_ns, + tag_reco_topic, + tag_object_name, + recommendation, + case when exists (select * from pg_inherits + where inhrelid = regclass(tag_object_name) + ) then 'Partitioned table, create the index on parent' else extra_info + end as extra_info + FROM ( + SELECT (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + 'create_index'::text as tag_reco_topic, + (regexp_matches(v::text, E'ON (.*?) 
'))[1] as tag_object_name, + v::text as recommendation, + '' as extra_info + FROM json_array_elements( + pg_qualstats_index_advisor() -> 'indexes') v + ) x + ORDER BY tag_object_name + node_status: primary + is_private: true + reco_default_public_schema: + description: > + Retrieves recommendations for revoking the CREATE privilege on the public schema from PUBLIC. + This metric helps enhance security by ensuring that only authorized users can create new objects in the public schema. + sqls: + 11: | + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + 'default_public_schema_privs'::text as tag_reco_topic, + nspname::text as tag_object_name, + 'REVOKE CREATE ON SCHEMA public FROM PUBLIC;'::text as recommendation, + 'only authorized users should be allowed to create new objects'::text as extra_info + from + pg_namespace + where + nspname = 'public' + and nspacl::text ~ E'[,\\{]+=U?C/' + node_status: primary + reco_disabled_triggers: + description: > + Retrieves recommendations for reviewing and potentially dropping disabled triggers in the PostgreSQL database. + It provides insights into triggers that are currently disabled, helping administrators identify and manage unused or unnecessary triggers. + This metric helps maintain database performance and reduce clutter by suggesting the removal of unused triggers. + sqls: + 11: | + /* "temporarily" disabled triggers might be forgotten about... 
*/ + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + 'disabled_triggers'::text as tag_reco_topic, + quote_ident(nspname)||'.'||quote_ident(relname) as tag_object_name, + 'review usage of trigger and consider dropping it if not needed anymore'::text as recommendation, + ''::text as extra_info + from + pg_trigger t + join + pg_class c on c.oid = t.tgrelid + join + pg_namespace n on n.oid = c.relnamespace + where + tgenabled = 'D' + node_status: primary + reco_drop_index: + description: > + Retrieves recommendations for dropping unused or invalid indexes in the PostgreSQL database. + It provides insights into indexes that have not been scanned and are consuming a significant portion of the database size. + This metric helps administrators optimize database performance by suggesting the removal of unnecessary indexes. + sqls: + 11: | + /* assumes the pg_qualstats extension */ + with q_database_size as ( + select pg_database_size(current_database()) as database_size_b + ) + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + 'drop_index'::text as tag_reco_topic, + quote_ident(schemaname)||'.'||quote_ident(indexrelname) as tag_object_name, + ('DROP INDEX ' || quote_ident(schemaname)||'.'||quote_ident(indexrelname) || ';')::text as recommendation, + 'Make sure to also check replica pg_stat_user_indexes.idx_scan count if using them for queries'::text as extra_info + from + pg_stat_user_indexes + join + pg_index using (indexrelid) + join + q_database_size on true + where + idx_scan = 0 + and ((pg_relation_size(indexrelid)::numeric / database_size_b) > 0.005 /* 0.5% DB size threshold */ + or indisvalid) + and not indisprimary + and not indisreplident + and not schemaname like '_timescaledb%' + node_status: primary + reco_nested_views: + description: > + Retrieves recommendations for overly nested views in the PostgreSQL database. 
+ It identifies views that depend on other views and have a nesting depth greater than 3. + This metric helps administrators optimize query performance by suggesting the reduction of view nesting. + sqls: + 11: |- + WITH RECURSIVE views AS ( + -- get the directly depending views + SELECT v.oid::regclass AS view, + format('%s.%s', quote_ident(n.nspname), quote_ident(v.relname)) as full_name, + 1 AS level + FROM pg_depend AS d + JOIN pg_rewrite AS r + ON r.oid = d.objid + JOIN pg_class AS v + ON v.oid = r.ev_class + JOIN pg_namespace AS n + ON n.oid = v.relnamespace + WHERE v.relkind = 'v' + AND NOT n.nspname = ANY(array['information_schema', E'pg\\_%']) + AND NOT v.relname LIKE E'pg\\_%' + AND d.classid = 'pg_rewrite'::regclass + AND d.refclassid = 'pg_class'::regclass + AND d.deptype = 'n' + UNION ALL + -- add the views that depend on these + SELECT v.oid::regclass, + format('%s.%s', quote_ident(n.nspname), quote_ident(v.relname)) as full_name, + views.level + 1 + FROM views + JOIN pg_depend AS d + ON d.refobjid = views.view + JOIN pg_rewrite AS r + ON r.oid = d.objid + JOIN pg_class AS v + ON v.oid = r.ev_class + JOIN pg_namespace AS n + ON n.oid = v.relnamespace + WHERE v.relkind = 'v' + AND NOT n.nspname = ANY(array['information_schema', E'pg\\_%']) + AND d.classid = 'pg_rewrite'::regclass + AND d.refclassid = 'pg_class'::regclass + AND d.deptype = 'n' + AND v.oid <> views.view -- avoid loop + ) + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + 'overly_nested_views'::text AS tag_reco_topic, + full_name::text as tag_object_name, + 'overly nested views can affect performance'::text recommendation, + 'nesting_depth: ' || coalesce(max(level)::text, '-') AS extra_info + FROM views + GROUP BY 1, 2, 3 + HAVING max(level) > 3 + ORDER BY max(level) DESC, full_name::text + node_status: primary + reco_partial_index_candidates: + description: > + Retrieves recommendations for creating partial indexes on columns with a high fraction of 
NULL values. + It identifies single-column indexes that could potentially be declared as partial indexes, leaving out NULL values. + This metric helps optimize index usage and improve query performance by suggesting the creation of partial indexes. + sqls: + 11: | + select distinct /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + 'partial_index_candidates'::text as tag_reco_topic, + quote_ident(ni.nspname)||'.'||quote_ident(ci.relname) as tag_object_name, + ('index ' || quote_ident(ni.nspname)||'.'||quote_ident(ci.relname) || ' on ' || quote_ident(s.schemaname) || '.' || quote_ident(s.tablename) || ' column ' || quote_ident(s.attname) || ' could possibly be declared partial leaving out NULL-s')::text as recommendation, + 'NULL fraction: ' || round((null_frac * 100)::numeric, 1) || '%, rowcount estimate: ' || (c.reltuples)::int8 || ', current definition: ' || pg_get_indexdef(i.indexrelid) as extra_info + from + pg_stats s + join pg_attribute a using (attname) + join pg_index i on i.indkey[0] = a.attnum and i.indrelid = a.attrelid + join pg_class c on c.oid = i.indrelid + join pg_class ci on ci.oid = i.indexrelid + join pg_namespace ni on ni.oid = ci.relnamespace + where + not indisprimary + and not indisunique + and indisready + and indisvalid + and i.indnatts = 1 /* simple 1 column indexes */ + and null_frac > 0.5 /* 50% empty */ + and not pg_get_indexdef(i.indexrelid) like '% WHERE %' + and c.reltuples >= 1e5 /* ignore smaller tables */ + and not exists ( /* leave out sub-partitions */ + select * from pg_inherits where inhrelid = c.oid + ) + reco_sprocs_wo_search_path: + description: > + Retrieves recommendations for stored procedures that do not have a fixed `search_path` set. + It identifies stored procedures that could potentially be abused by malicious users if used objects are not fully qualified. + This metric helps enhance security by suggesting the setting of a fixed search_path for stored procedures. 
+ sqls: + 11: |- + with q_sprocs as ( + select /* pgwatch_generated */ + format('%s.%s', quote_ident(nspname), quote_ident(proname)) as sproc_name, + 'alter function ' || proname || '(' || pg_get_function_arguments(p.oid) || ') set search_path = X;' as fix_sql + from + pg_proc p + join pg_namespace n on n.oid = p.pronamespace + where prosecdef and not 'search_path' = ANY(coalesce(proconfig, '{}'::text[])) + and not pg_catalog.obj_description(p.oid, 'pg_proc') ~ 'pgwatch' + ) + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + 'sprocs_wo_search_path'::text as tag_reco_topic, + sproc_name::text as tag_object_name, + fix_sql::text as recommendation, + 'functions without fixed search_path can be potentially abused by malicious users if used objects are not fully qualified'::text as extra_info + from + q_sprocs + order by + tag_object_name, extra_info + node_status: primary + reco_superusers: + description: > + Retrieves recommendations for reviewing the number of superusers in the PostgreSQL database. + It identifies if there are too many superusers, which can pose a security risk. + This metric helps maintain database security by suggesting a review of superuser accounts. + sqls: + 11: | + with q_su as ( + select count(*) from pg_roles where rolcanlogin and rolsuper + ), + q_total as ( + select count(*) from pg_roles where rolcanlogin + ) + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + 'superuser_count'::text as tag_reco_topic, + '-'::text as tag_object_name, + 'too many superusers detected - review recommended'::text as recommendation, + format('%s active superusers, %s total active users', q_su.count, q_total.count) as extra_info + from + q_su, q_total + where + q_su.count >= 10 + node_status: primary + recommendations: + description: > + When enabled, this metric will find all other metrics starting with `reco_*` and execute those queries. 
+ The metric targets performance, security, and other "best practices" violations. + Users can add new `reco_*` queries freely. + init_sql: CREATE EXTENSION IF NOT EXISTS pg_qualstats; + sqls: + 11: /* dummy placeholder - special handling in code to collect other metrics named reco_* */ + replication: + description: > + This metric collects replication statistics from the `pg_stat_replication` view. + It provides insights into the status of replication connections, including lag times and states. + This metric is useful for monitoring replication health and performance. + sqls: + 11: | + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + application_name as tag_application_name, + usename AS tag_usename, + concat(coalesce(client_addr::text, client_hostname), '_', client_port::text) as tag_client_info, + coalesce(pg_wal_lsn_diff(case when pg_is_in_recovery() then pg_last_wal_receive_lsn() else pg_current_wal_lsn() end, sent_lsn)::int8, 0) as sent_lag_b, + coalesce(pg_wal_lsn_diff(case when pg_is_in_recovery() then pg_last_wal_receive_lsn() else pg_current_wal_lsn() end, write_lsn)::int8, 0) as write_lag_b, + coalesce(pg_wal_lsn_diff(case when pg_is_in_recovery() then pg_last_wal_receive_lsn() else pg_current_wal_lsn() end, flush_lsn)::int8, 0) as flush_lag_b, + coalesce(pg_wal_lsn_diff(case when pg_is_in_recovery() then pg_last_wal_receive_lsn() else pg_current_wal_lsn() end, replay_lsn)::int8, 0) as replay_lag_b, + (extract(epoch from write_lag) * 1000)::int8 as write_lag_ms, + (extract(epoch from flush_lag) * 1000)::int8 as flush_lag_ms, + (extract(epoch from replay_lag) * 1000)::int8 as replay_lag_ms, + state, + sync_state, + case when sync_state in ('sync', 'quorum') then 1 else 0 end as is_sync_int, + case when pg_is_in_recovery() then 1 else 0 end as in_recovery_int + from + pg_stat_replication + where + coalesce(application_name, '') not in ('pg_basebackup', 'pg_rewind'); + gauges: + - '*' + is_instance_level: true + 
replication_slot_stats: + description: > + This metric collects statistics from the `pg_stat_replication_slots` view. + It provides insights into the status of replication slots, including transaction counts and byte usage. + This metric is useful for monitoring replication slot health and performance. + sqls: + 14: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + slot_name::text as tag_slot_name, + spill_txns, + spill_count, + spill_bytes, + stream_txns, + stream_count, + stream_bytes, + total_txns, + total_bytes + from + pg_stat_replication_slots + replication_slots: + description: > + This metric collects information about replication slots from the `pg_replication_slots` view. + It provides insights into the status of replication slots, including their activity and lag times. + This metric is useful for monitoring replication slot health and performance. + sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + slot_name::text as tag_slot_name, + coalesce(plugin, 'physical')::text as tag_plugin, + active, + case when active then 0 else 1 end as non_active_int, + pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::int8 as restart_lsn_lag_b, + greatest(age(xmin), age(catalog_xmin))::int8 as xmin_age_tx + from + pg_replication_slots + node_status: primary + gauges: + - '*' + is_instance_level: true + sequence_health: + description: > + This metric collects health statistics for sequences in the PostgreSQL database. + It provides insights into the usage and status of sequences, including maximum usage percentages and counts of sequences that are heavily used. + This metric is useful for monitoring sequence health and performance. 
+ sqls: + 11: |- + with q_seq_data as ( + select * from pg_sequences + ) + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + (select round(100.0 * coalesce(max(last_value::numeric / max_value), 0), 2)::float from q_seq_data where not cycle) as max_used_pct, + (select count(*) from q_seq_data where not cycle and last_value::numeric / max_value > 0.5) as p50_used_seq_count, + (select count(*) from q_seq_data where not cycle and last_value::numeric / max_value > 0.75) as p75_used_seq_count + server_log_event_counts: + description: > + This metric enables the Postgres server log "tailing" for errors. It can't be used for remote setups, though, + unless the DB logs are somehow mounted or copied over, as real file access is needed! + sqls: + 11: |- + /* + Dummy placeholder - special handling in gatherer code for log parsing + */ + settings: + description: > + This metric collects various PostgreSQL server settings and configurations. + It provides insights into the server's configuration, including version, memory settings, and other important parameters. + This metric is useful for monitoring server settings and ensuring optimal performance. 
+ sqls: + 11: | + with qs as ( + select name, setting from pg_settings + ) + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + current_setting('server_version') as server_version, + current_setting('server_version_num')::int8 as server_version_num, + (regexp_matches(regexp_replace(current_setting('server_version'), '(beta|devel).*', '', 'g'), E'\\d+'))[1]::float8 as major_version, + current_setting('block_size')::int as block_size, + current_setting('max_connections')::int as max_connections, + current_setting('hot_standby') as hot_standby, + (select setting from qs where name = 'hot_standby_feedback') as hot_standby_feedback, + current_setting('fsync') as fsync, + current_setting('full_page_writes') as full_page_writes, + current_setting('synchronous_commit') as synchronous_commit, + (select setting from qs where name = 'wal_compression') as wal_compression, + (select setting from qs where name = 'wal_log_hints') as wal_log_hints, + (select setting from qs where name = 'synchronous_standby_names') as synchronous_standby_names, + current_setting('shared_buffers') as shared_buffers, + current_setting('work_mem') as work_mem, + current_setting('maintenance_work_mem') as maintenance_work_mem, + current_setting('effective_cache_size') as effective_cache_size, + (select setting::int8 from qs where name = 'default_statistics_target') as default_statistics_target, + (select setting::float8 from qs where name = 'random_page_cost') as random_page_cost, + pg_size_pretty(((select setting::int8 from qs where name = 'min_wal_size') * 1024^2)::int8) as min_wal_size, + pg_size_pretty(((select setting::int8 from qs where name = 'max_wal_size') * 1024^2)::int8) as max_wal_size, + (select setting from qs where name = 'checkpoint_segments') as checkpoint_segments, + current_setting('checkpoint_timeout') as checkpoint_timeout, + current_setting('checkpoint_completion_target') as checkpoint_completion_target, + (select setting::int8 from qs where 
name = 'max_worker_processes') as max_worker_processes, + (select setting::int8 from qs where name = 'max_parallel_workers') as max_parallel_workers, + (select setting::int8 from qs where name = 'max_parallel_workers_per_gather') as max_parallel_workers_per_gather, + (select case when setting = 'on' then 1 else 0 end from qs where name = 'jit') as jit, + (select case when setting = 'on' then 1 else 0 end from qs where name = 'ssl') as ssl, + current_setting('statement_timeout') as statement_timeout, + current_setting('deadlock_timeout') as deadlock_timeout, + (select setting from qs where name = 'data_checksums') as data_checksums, + (select setting::int8 from qs where name = 'max_connections') as max_connections, + (select setting::int8 from qs where name = 'max_wal_senders') as max_wal_senders, + (select setting::int8 from qs where name = 'max_replication_slots') as max_replication_slots, + (select setting::int8 from qs where name = 'max_prepared_transactions') as max_prepared_transactions, + (select setting::int8 from qs where name = 'lock_timeout') || ' (ms)' as lock_timeout, + (select setting from qs where name = 'archive_mode') as archive_mode, + (select setting from qs where name = 'archive_command') as archive_command, + current_setting('archive_timeout') as archive_timeout, + (select setting from qs where name = 'shared_preload_libraries') as shared_preload_libraries, + (select setting from qs where name = 'listen_addresses') as listen_addresses, + (select setting from qs where name = 'ssl') as ssl, + (select setting from qs where name = 'autovacuum') as autovacuum, + (select setting::int8 from qs where name = 'autovacuum_max_workers') as autovacuum_max_workers, + (select setting::float8 from qs where name = 'autovacuum_vacuum_scale_factor') as autovacuum_vacuum_scale_factor, + (select setting::float8 from qs where name = 'autovacuum_vacuum_threshold') as autovacuum_vacuum_threshold, + (select setting::float8 from qs where name = 
'autovacuum_analyze_scale_factor') as autovacuum_analyze_scale_factor,
+        (select setting::float8 from qs where name = 'autovacuum_analyze_threshold') as autovacuum_analyze_threshold
+  show_plans_realtime:
+    description: >
+      This metric collects real-time query plans from the `pg_show_plans` extension.
+      It provides insights into the execution plans of currently running queries, helping to identify performance issues and optimize query execution.
+      This metric is useful for monitoring query performance and understanding how queries are executed in real-time.
+    sqls:
+      11: |
+        /* assumes pg_show_plans extension */
+        select /* pgwatch_generated */
+          max((extract(epoch from now()) * 1e9)::int8) as epoch_ns,
+          max(extract(epoch from now() - query_start))::int as max_s,
+          avg(extract(epoch from now() - query_start))::int as avg_s,
+          count(*),
+          array_to_string(array_agg(distinct usename order by usename), ',') as "users",
+          max(md5(plan)) as tag_hash, /* needed for influx */
+          plan,
+          max(query) as query
+        from
+          pg_show_plans p
+        join
+          pg_stat_activity a
+        using (pid)
+        where
+          p.pid != pg_backend_pid()
+          and datname = current_database()
+          and now() - query_start > '1s'::interval
+          and backend_type = 'client backend'
+        group by
+          plan
+        order by
+          max_s desc
+        limit
+          10
+  smart_health_per_disk:
+    description: >
+      This metric collects SMART health status for all disk devices using the `smartmontools` utility.
+      It provides insights into the health of disk devices, including their SMART status and return codes.
+      This metric is useful for monitoring disk health and identifying potential issues with disk devices.
+      This helper is always meant to be tested and adjusted to make sure all disks are detected.
+ Most likely smartctl privileges must be escalated to give postgres access: `sudo chmod u+s /usr/local/sbin/smartctl` + sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + device as tag_device, + retcode + from + get_smart_health_per_device() + init_sql: |- + CREATE EXTENSION IF NOT EXISTS plpython3u; + + CREATE OR REPLACE FUNCTION get_smart_health_per_device(OUT device text, OUT retcode int) RETURNS SETOF record AS + $$ + import subprocess + ret_list = [] + + #disk_detect_cmd='smartctl --scan | cut -d " " -f3 | grep mega' # for Lenovo ServerRAID M1210 + disk_detect_cmd='lsblk -io KNAME,TYPE | grep '' disk'' | cut -d " " -f1 | sort' + p = subprocess.run(disk_detect_cmd, stdout=subprocess.PIPE, encoding='utf-8', shell=True) + if p.returncode != 0: + return ret_list + disks = p.stdout.splitlines() + + for disk in disks: + # health_cmd = 'smartctl -d $disk -a -q silent /dev/sda' % disk # for Lenovo ServerRAID M1210 members + health_cmd = 'smartctl -a -q silent /dev/%s' % disk + p = subprocess.run(health_cmd, stdout=subprocess.PIPE, encoding='utf-8', shell=True) + ret_list.append((disk, p.returncode)) + + return ret_list + + $$ LANGUAGE plpython3u VOLATILE; + + GRANT EXECUTE ON FUNCTION get_smart_health_per_device() TO pgwatch; + + COMMENT ON FUNCTION get_smart_health_per_device() is 'created for pgwatch'; + sproc_hashes: + description: > + This metric collects hashes of all stored procedures in the database. + It provides a way to track changes in stored procedures over time by comparing their hashes. + This metric is useful for monitoring stored procedure integrity and detecting changes. 
+ sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + p.oid::text as tag_oid, + quote_ident(nspname)||'.'||quote_ident(proname) as tag_sproc, + md5(prosrc) + from + pg_proc p + join + pg_namespace n on n.oid = pronamespace + where + not nspname like any(array[E'pg\\_%', 'information_schema']) + sproc_stats: + description: > + This metric collects statistics about user-defined functions (stored procedures) in the database. + It provides insights into function usage, including call counts and execution times. + This metric is useful for monitoring function performance and identifying potential bottlenecks. + sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + schemaname::text AS tag_schema, + funcname::text AS tag_function_name, + quote_ident(schemaname)||'.'||quote_ident(funcname) as tag_function_full_name, + p.oid::text as tag_oid, -- for overloaded funcs + calls as sp_calls, + self_time, + total_time + FROM + pg_stat_user_functions f + JOIN + pg_proc p ON p.oid = f.funcid + ORDER BY + total_time DESC + LIMIT + 300 + stat_activity: + description: > + This metric collects statistics about currently active queries in the database. + It provides insights into the state of active queries, including their duration and blocking status. + This metric is useful for monitoring query performance and identifying long-running or blocked queries. + sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + s.query as query, + count(*) as count + from pg_stat_activity s + where s.datname = current_database() + and s.state = 'active' + and s.backend_type = 'client backend' + and s.pid != pg_backend_pid() + and now() - s.query_start > '100ms'::interval + group by s.query + stat_activity_realtime: + description: > + This metric collects real-time statistics about currently active queries in the database. 
+ It provides insights into the state of active queries, including their duration and blocking status. + This metric is useful for monitoring query performance and identifying long-running or blocked queries in real-time. + sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + pid as tag_pid, + usename::text AS user, + application_name AS appname, + coalesce(client_addr::text, 'local') AS ip, + extract(epoch FROM (now() - query_start))::int AS duration_s, + (coalesce(wait_event_type, '') IN ('LWLockNamed', 'Lock', 'BufferPin'))::int AS waiting, + array_to_string(pg_blocking_pids(pid), ',') as blocking_pids, + ltrim(regexp_replace(query, E'[ \\t\\n\\r]+' , ' ', 'g'))::varchar(300) AS query + FROM + pg_stat_activity + WHERE + state != 'idle' + AND backend_type IN ('client backend', 'autovacuum worker') + AND pid != pg_backend_pid() + AND datname = current_database() + AND now() - query_start > '500ms'::interval + ORDER BY + now() - query_start DESC + LIMIT 25 + stat_io: + description: > + This metric collects I/O statistics from the `pg_stat_io` view. + It provides insights into read and write operations, including the number of reads, writes, and their associated times. + This metric is useful for monitoring I/O performance and identifying potential bottlenecks in disk operations. 
+ sqls: + 16: |- + SELECT /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + coalesce(backend_type, 'total') as tag_backend_type, + sum(coalesce(reads, 0))::int8 as reads, + (sum(coalesce(reads, 0) * op_bytes) / 1e6)::int8 as read_bytes_mb, + sum(coalesce(read_time, 0))::int8 as read_time_ms, + sum(coalesce(writes, 0))::int8 as writes, + (sum(coalesce(writes, 0) * op_bytes) / 1e6)::int8 as write_bytes_mb, + sum(coalesce(write_time, 0))::int8 as write_time_ms, + sum(coalesce(writebacks, 0))::int8 as writebacks, + (sum(coalesce(writebacks, 0) * op_bytes) / 1e6)::int8 as writeback_bytes_mb, + sum(coalesce(writeback_time, 0))::int8 as writeback_time_ms, + sum(coalesce(fsyncs, 0))::int8 fsyncs, + sum(coalesce(fsync_time, 0))::int8 fsync_time_ms, + max(extract(epoch from now() - stats_reset)::int) as stats_reset_s + FROM + pg_stat_io + GROUP BY + ROLLUP (backend_type) + is_instance_level: true + stat_ssl: + description: > + This metric collects SSL connection statistics from the `pg_stat_ssl` view. + It provides insights into the number of SSL connections, including those that are encrypted and those that are not. + This metric is useful for monitoring SSL usage and ensuring secure connections in the PostgreSQL database. + sqls: + 11: | + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + count(*) as total, + count(*) FILTER (WHERE ssl) as "on", + count(*) FILTER (WHERE NOT ssl) as "off" + FROM + pg_stat_ssl AS s, + pg_stat_activity AS a + WHERE + a.pid = s.pid + AND a.datname = current_database() + AND a.pid <> pg_backend_pid() + AND NOT (a.client_addr = '127.0.0.1' OR client_port = -1) + gauges: + - '*' + stat_statements: + description: > + This metric collects statistics from the `pg_stat_statements` extension. + It provides insights into query performance, including execution times, block reads/writes, and user information. 
+ This metric is useful for monitoring query performance and identifying slow or resource-intensive queries. + init_sql: CREATE EXTENSION IF NOT EXISTS pg_stat_statements; + sqls: + 11: |- + WITH q_data AS ( + SELECT + coalesce(queryid::text, 'insufficient-privileges-total') as tag_queryid, + /* + if security conscious about exposing query texts replace the below expression with a dash ('-') OR + use the stat_statements_no_query_text metric instead, created specifically for this use case. + */ + array_to_string(array_agg(DISTINCT quote_ident(pg_get_userbyid(userid))), ',') AS users, + sum(s.calls)::int8 AS calls, + round(sum(s.total_time)::numeric, 3)::double precision AS total_time, + sum(shared_blks_hit)::int8 AS shared_blks_hit, + sum(shared_blks_read)::int8 AS shared_blks_read, + sum(shared_blks_written)::int8 AS shared_blks_written, + sum(shared_blks_dirtied)::int8 AS shared_blks_dirtied, + sum(temp_blks_read)::int8 AS temp_blks_read, + sum(temp_blks_written)::int8 AS temp_blks_written, + round(sum(blk_read_time)::numeric, 3)::double precision AS blk_read_time, + round(sum(blk_write_time)::numeric, 3)::double precision AS blk_write_time, + max(query::varchar(8000)) AS query + FROM + pg_stat_statements s + WHERE + calls > 5 + AND total_time > 5 + AND dbid = ( + SELECT + oid + FROM + pg_database + WHERE + datname = current_database()) + AND NOT upper(s.query::varchar(50)) + LIKE ANY (ARRAY['DEALLOCATE%', + 'SET %', + 'RESET %', + 'BEGIN%', + 'BEGIN;', + 'COMMIT%', + 'END%', + 'ROLLBACK%', + 'SHOW%']) + GROUP BY + queryid + ) + SELECT (EXTRACT(epoch FROM now()) * 1e9)::int8 AS epoch_ns, + b.tag_queryid, + b.users, + b.calls, + b.total_time, + b.shared_blks_hit, + b.shared_blks_read, + b.shared_blks_written, + b.shared_blks_dirtied, + b.temp_blks_read, + b.temp_blks_written, + b.blk_read_time, + b.blk_write_time, + ltrim(regexp_replace(b.query, E'[ \\t\\n\\r]+', ' ', 'g')) tag_query + FROM ( + SELECT + * + FROM ( + SELECT + * + FROM + q_data + WHERE + total_time > 
0 + ORDER BY + total_time DESC + LIMIT 100) a + UNION + select /* pgwatch_generated */ + * + FROM ( + SELECT + * + FROM + q_data + ORDER BY + calls DESC + LIMIT 100) a + UNION + select /* pgwatch_generated */ + * + FROM ( + SELECT + * + FROM + q_data + WHERE + shared_blks_read > 0 + ORDER BY + shared_blks_read DESC + LIMIT 100) a + UNION + select /* pgwatch_generated */ + * + FROM ( + SELECT + * + FROM + q_data + WHERE + shared_blks_written > 0 + ORDER BY + shared_blks_written DESC + LIMIT 100) a + UNION + select /* pgwatch_generated */ + * + FROM ( + SELECT + * + FROM + q_data + WHERE + temp_blks_read > 0 + ORDER BY + temp_blks_read DESC + LIMIT 100) a + UNION + select /* pgwatch_generated */ + * + FROM ( + SELECT + * + FROM + q_data + WHERE + temp_blks_written > 0 + ORDER BY + temp_blks_written DESC + LIMIT 100) a) b + 13: |- + WITH q_data AS ( + SELECT + coalesce(queryid::text, 'insufficient-privileges-total') as tag_queryid, + /* + if security conscious about exposing query texts replace the below expression with a dash ('-') OR + use the stat_statements_no_query_text metric instead, created specifically for this use case. 
+ */ + array_to_string(array_agg(DISTINCT quote_ident(pg_get_userbyid(userid))), ',') AS users, + sum(s.calls)::int8 AS calls, + round(sum(s.total_exec_time)::numeric, 3)::double precision AS total_time, + sum(shared_blks_hit)::int8 AS shared_blks_hit, + sum(shared_blks_read)::int8 AS shared_blks_read, + sum(shared_blks_written)::int8 AS shared_blks_written, + sum(shared_blks_dirtied)::int8 AS shared_blks_dirtied, + sum(temp_blks_read)::int8 AS temp_blks_read, + sum(temp_blks_written)::int8 AS temp_blks_written, + round(sum(blk_read_time)::numeric, 3)::double precision AS blk_read_time, + round(sum(blk_write_time)::numeric, 3)::double precision AS blk_write_time, + sum(wal_fpi)::int8 AS wal_fpi, + sum(wal_bytes)::int8 AS wal_bytes, + round(sum(s.total_plan_time)::numeric, 3)::double precision AS total_plan_time, + max(query::varchar(8000)) AS query + FROM + pg_stat_statements s + WHERE + calls > 5 + AND total_exec_time > 5 + AND dbid = ( + SELECT + oid + FROM + pg_database + WHERE + datname = current_database()) + AND NOT upper(s.query::varchar(50)) + LIKE ANY (ARRAY['DEALLOCATE%', + 'SET %', + 'RESET %', + 'BEGIN%', + 'BEGIN;', + 'COMMIT%', + 'END%', + 'ROLLBACK%', + 'SHOW%']) + GROUP BY + queryid + ) + select /* pgwatch_generated */ + (EXTRACT(epoch FROM now()) * 1e9)::int8 AS epoch_ns, + b.tag_queryid, + b.users, + b.calls, + b.total_time, + b.shared_blks_hit, + b.shared_blks_read, + b.shared_blks_written, + b.shared_blks_dirtied, + b.temp_blks_read, + b.temp_blks_written, + b.blk_read_time, + b.blk_write_time, + b.wal_fpi, + b.wal_bytes, + b.total_plan_time, + ltrim(regexp_replace(b.query, E'[ \\t\\n\\r]+', ' ', 'g')) AS tag_query + FROM ( + SELECT + * + FROM ( + SELECT + * + FROM + q_data + WHERE + total_time > 0 + ORDER BY + total_time DESC + LIMIT 100) a + UNION + select /* pgwatch_generated */ + * + FROM ( + SELECT + * + FROM + q_data + ORDER BY + calls DESC + LIMIT 100) a + UNION + select /* pgwatch_generated */ + * + FROM ( + SELECT + * + FROM + q_data + 
WHERE + shared_blks_read > 0 + ORDER BY + shared_blks_read DESC + LIMIT 100) a + UNION + select /* pgwatch_generated */ + * + FROM ( + SELECT + * + FROM + q_data + WHERE + shared_blks_written > 0 + ORDER BY + shared_blks_written DESC + LIMIT 100) a + UNION + select /* pgwatch_generated */ + * + FROM ( + SELECT + * + FROM + q_data + WHERE + temp_blks_read > 0 + ORDER BY + temp_blks_read DESC + LIMIT 100) a + UNION + select /* pgwatch_generated */ + * + FROM ( + SELECT + * + FROM + q_data + WHERE + temp_blks_written > 0 + ORDER BY + temp_blks_written DESC + LIMIT 100) a) b + 15: |- + WITH /* pgwatch_generated */ q_data AS ( + SELECT + queryid::text AS tag_queryid, + /* + if security conscious about exposing query texts replace the below expression with a dash ('-') OR + use the stat_statements_no_query_text metric instead, created specifically for this use case. + */ + array_to_string(array_agg(DISTINCT quote_ident(pg_get_userbyid(userid))), ',') AS users, + sum(s.calls)::int8 AS calls, + round(sum(s.total_exec_time)::numeric, 3)::double precision AS total_time, + sum(shared_blks_hit)::int8 AS shared_blks_hit, + sum(shared_blks_read)::int8 AS shared_blks_read, + sum(shared_blks_written)::int8 AS shared_blks_written, + sum(shared_blks_dirtied)::int8 AS shared_blks_dirtied, + sum(temp_blks_read)::int8 AS temp_blks_read, + sum(temp_blks_written)::int8 AS temp_blks_written, + round(sum(blk_read_time)::numeric, 3)::double precision AS blk_read_time, + round(sum(blk_write_time)::numeric, 3)::double precision AS blk_write_time, + round(sum(temp_blk_read_time)::numeric, 3)::double precision AS temp_blk_read_time, + round(sum(temp_blk_write_time)::numeric, 3)::double precision AS temp_blk_write_time, + sum(wal_fpi)::int8 AS wal_fpi, + sum(wal_bytes)::int8 AS wal_bytes, + round(sum(s.total_plan_time)::numeric, 3)::double precision AS total_plan_time, + max(query::varchar(8000)) AS query + FROM + pg_stat_statements s + WHERE + calls > 5 + AND total_exec_time > 5 + AND dbid = ( 
+ SELECT + oid + FROM + pg_database + WHERE + datname = current_database()) + AND NOT upper(s.query::varchar(50)) + LIKE ANY (ARRAY['DEALLOCATE%', + 'SET %', + 'RESET %', + 'BEGIN%', + 'BEGIN;', + 'COMMIT%', + 'END%', + 'ROLLBACK%', + 'SHOW%']) + GROUP BY + queryid + ) + SELECT + (EXTRACT(epoch FROM now()) * 1e9)::int8 AS epoch_ns, + b.tag_queryid, + b.users, + b.calls, + b.total_time, + b.shared_blks_hit, + b.shared_blks_read, + b.shared_blks_written, + b.shared_blks_dirtied, + b.temp_blks_read, + b.temp_blks_written, + b.blk_read_time, + b.blk_write_time, + b.temp_blk_read_time, + b.temp_blk_write_time, + b.wal_fpi, + b.wal_bytes, + b.total_plan_time, + ltrim(regexp_replace(b.query, E'[ \\t\\n\\r]+', ' ', 'g')) AS tag_query + FROM ( + SELECT + * + FROM ( + SELECT + * + FROM + q_data + WHERE + total_time > 0 + ORDER BY + total_time DESC + LIMIT 100) a + UNION + SELECT + * + FROM ( + SELECT + * + FROM + q_data + ORDER BY + calls DESC + LIMIT 100) a + UNION + SELECT + * + FROM ( + SELECT + * + FROM + q_data + WHERE + shared_blks_read > 0 + ORDER BY + shared_blks_read DESC + LIMIT 100) a + UNION + SELECT + * + FROM ( + SELECT + * + FROM + q_data + WHERE + shared_blks_written > 0 + ORDER BY + shared_blks_written DESC + LIMIT 100) a + UNION + SELECT + * + FROM ( + SELECT + * + FROM + q_data + WHERE + temp_blks_read > 0 + ORDER BY + temp_blks_read DESC + LIMIT 100) a + UNION + SELECT + * + FROM ( + SELECT + * + FROM + q_data + WHERE + temp_blks_written > 0 + ORDER BY + temp_blks_written DESC + LIMIT 100) a) b + 17: |- + WITH /* pgwatch_generated */ q_data AS ( + SELECT + queryid::text AS tag_queryid, + /* + NB! if security conscious about exposing query texts replace the below expression with a dash ('-') OR + use the stat_statements_no_query_text metric instead, created specifically for this use case. 
+ */ + array_to_string(array_agg(DISTINCT quote_ident(pg_get_userbyid(userid))), ',') AS users, + sum(s.calls)::int8 AS calls, + round(sum(s.total_exec_time)::numeric, 3)::double precision AS total_time, + sum(shared_blks_hit)::int8 AS shared_blks_hit, + sum(shared_blks_read)::int8 AS shared_blks_read, + sum(shared_blks_written)::int8 AS shared_blks_written, + sum(shared_blks_dirtied)::int8 AS shared_blks_dirtied, + sum(temp_blks_read)::int8 AS temp_blks_read, + sum(temp_blks_written)::int8 AS temp_blks_written, + round((sum(shared_blk_read_time) + sum(local_blk_read_time))::numeric, 3)::double precision AS blk_read_time, + round((sum(shared_blk_write_time) + sum(local_blk_write_time))::numeric, 3)::double precision AS blk_write_time, + round(sum(temp_blk_read_time)::numeric, 3)::double precision AS temp_blk_read_time, + round(sum(temp_blk_write_time)::numeric, 3)::double precision AS temp_blk_write_time, + sum(wal_fpi)::int8 AS wal_fpi, + sum(wal_bytes)::int8 AS wal_bytes, + round(sum(s.total_plan_time)::numeric, 3)::double precision AS total_plan_time, + max(query::varchar(8000)) AS query + FROM + pg_stat_statements s + WHERE + calls > 5 + AND total_exec_time > 5 + AND dbid = ( + SELECT + oid + FROM + pg_database + WHERE + datname = current_database()) + AND NOT upper(s.query::varchar(50)) + LIKE ANY (ARRAY['DEALLOCATE%', + 'SET %', + 'RESET %', + 'BEGIN%', + 'BEGIN;', + 'COMMIT%', + 'END%', + 'ROLLBACK%', + 'SHOW%']) + GROUP BY + queryid + ) + SELECT + (EXTRACT(epoch FROM now()) * 1e9)::int8 AS epoch_ns, + b.tag_queryid, + b.users, + b.calls, + b.total_time, + b.shared_blks_hit, + b.shared_blks_read, + b.shared_blks_written, + b.shared_blks_dirtied, + b.temp_blks_read, + b.temp_blks_written, + b.blk_read_time, + b.blk_write_time, + b.temp_blk_read_time, + b.temp_blk_write_time, + b.wal_fpi, + b.wal_bytes, + b.total_plan_time, + ltrim(regexp_replace(b.query, E'[ \\t\\n\\r]+', ' ', 'g')) AS tag_query + FROM ( + SELECT + * + FROM ( + SELECT + * + FROM + q_data + 
WHERE + total_time > 0 + ORDER BY + total_time DESC + LIMIT 100) a + UNION + SELECT + * + FROM ( + SELECT + * + FROM + q_data + ORDER BY + calls DESC + LIMIT 100) a + UNION + SELECT + * + FROM ( + SELECT + * + FROM + q_data + WHERE + shared_blks_read > 0 + ORDER BY + shared_blks_read DESC + LIMIT 100) a + UNION + SELECT + * + FROM ( + SELECT + * + FROM + q_data + WHERE + shared_blks_written > 0 + ORDER BY + shared_blks_written DESC + LIMIT 100) a + UNION + SELECT + * + FROM ( + SELECT + * + FROM + q_data + WHERE + temp_blks_read > 0 + ORDER BY + temp_blks_read DESC + LIMIT 100) a + UNION + SELECT + * + FROM ( + SELECT + * + FROM + q_data + WHERE + temp_blks_written > 0 + ORDER BY + temp_blks_written DESC + LIMIT 100) a) b; + stat_statements_calls: + description: > + This metric collects statistics from the `pg_stat_statements` extension, focusing on the number of calls and total execution time. + It provides insights into query performance, including execution times and call counts. + This metric is useful for monitoring query performance and identifying slow or resource-intensive queries. 
+      init_sql: CREATE EXTENSION IF NOT EXISTS pg_stat_statements;
+      sqls:
+        11: |
+          select /* pgwatch_generated */
+            (extract(epoch from now()) * 1e9)::int8 as epoch_ns,
+            coalesce(sum(calls), 0)::int8 as calls,
+            coalesce(round(sum(total_time)::numeric, 3), 0)::float8 as total_time
+          from
+            pg_stat_statements
+          where
+            dbid = (select oid from pg_database where datname = current_database())
+        13: |
+          select /* pgwatch_generated */
+            (extract(epoch from now()) * 1e9)::int8 as epoch_ns,
+            coalesce(sum(calls), 0)::int8 as calls,
+            coalesce(round(sum(total_exec_time)::numeric, 3), 0)::float8 as total_time,
+            coalesce(round(sum(total_plan_time)::numeric, 3), 0)::float8 as total_plan_time
+          from
+            pg_stat_statements
+          where
+            dbid = (select oid from pg_database where datname = current_database())
+    stat_statements_no_query_text:
+      description: >
+        This metric collects statistics from the `pg_stat_statements` extension without including the query text.
+        It provides insights into query performance, including execution times, block reads/writes, and user information,
+        while omitting the actual query text for security or privacy reasons.
+        This metric is useful for monitoring query performance without exposing sensitive query details.
+ init_sql: CREATE EXTENSION IF NOT EXISTS pg_stat_statements; + sqls: + 11: |- + with q_data as ( + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + '-'::text as tag_query, + coalesce(queryid::text, 'insufficient-privileges-total') as tag_queryid, + array_to_string(array_agg(distinct quote_ident(pg_get_userbyid(userid))), ',') as users, + sum(s.calls)::int8 as calls, + round(sum(s.total_time)::numeric, 3)::double precision as total_time, + sum(shared_blks_hit)::int8 as shared_blks_hit, + sum(shared_blks_read)::int8 as shared_blks_read, + sum(shared_blks_written)::int8 as shared_blks_written, + sum(shared_blks_dirtied)::int8 as shared_blks_dirtied, + sum(temp_blks_read)::int8 as temp_blks_read, + sum(temp_blks_written)::int8 as temp_blks_written, + round(sum(blk_read_time)::numeric, 3)::double precision as blk_read_time, + round(sum(blk_write_time)::numeric, 3)::double precision as blk_write_time + from + pg_stat_statements s + where + calls > 5 + and total_time > 0 + and dbid = (select oid from pg_database where datname = current_database()) + and not upper(s.query) like any (array['DEALLOCATE%', 'SET %', 'RESET %', 'BEGIN%', 'BEGIN;', + 'COMMIT%', 'END%', 'ROLLBACK%', 'SHOW%']) + group by + queryid + ) + select * from ( + select + * + from + q_data + where + total_time > 0 + order by + total_time desc + limit 100 + ) a + union + select * from ( + select + * + from + q_data + order by + calls desc + limit 100 + ) a + union + select * from ( + select + * + from + q_data + where + shared_blks_read > 0 + order by + shared_blks_read desc + limit 100 + ) a + union + select * from ( + select + * + from + q_data + where + shared_blks_written > 0 + order by + shared_blks_written desc + limit 100 + ) a + union + select * from ( + select + * + from + q_data + where + temp_blks_read > 0 + order by + temp_blks_read desc + limit 100 + ) a + union + select * from ( + select + * + from + q_data + where + temp_blks_written > 0 + order by + 
temp_blks_written desc + limit 100 + ) a + 13: |- + with q_data as ( + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + '-' as tag_query, + coalesce(queryid::text, 'insufficient-privileges-total') as tag_queryid, + array_to_string(array_agg(distinct quote_ident(pg_get_userbyid(userid))), ',') as users, + sum(s.calls)::int8 as calls, + round(sum(s.total_exec_time)::numeric, 3)::double precision as total_time, + sum(shared_blks_hit)::int8 as shared_blks_hit, + sum(shared_blks_read)::int8 as shared_blks_read, + sum(shared_blks_written)::int8 as shared_blks_written, + sum(shared_blks_dirtied)::int8 as shared_blks_dirtied, + sum(temp_blks_read)::int8 as temp_blks_read, + sum(temp_blks_written)::int8 as temp_blks_written, + round(sum(blk_read_time)::numeric, 3)::double precision as blk_read_time, + round(sum(blk_write_time)::numeric, 3)::double precision as blk_write_time, + sum(wal_fpi)::int8 as wal_fpi, + sum(wal_bytes)::int8 as wal_bytes, + round(sum(s.total_plan_time)::numeric, 3)::double precision as total_plan_time + from + pg_stat_statements s + where + calls > 5 + and total_exec_time > 0 + and dbid = (select oid from pg_database where datname = current_database()) + and not upper(s.query) like any (array['DEALLOCATE%', 'SET %', 'RESET %', 'BEGIN%', 'BEGIN;', + 'COMMIT%', 'END%', 'ROLLBACK%', 'SHOW%']) + group by + queryid + ) + select * from ( + select + * + from + q_data + where + total_time > 0 + order by + total_time desc + limit 100 + ) a + union + select * from ( + select + * + from + q_data + order by + calls desc + limit 100 + ) a + union + select * from ( + select + * + from + q_data + where + shared_blks_read > 0 + order by + shared_blks_read desc + limit 100 + ) a + union + select * from ( + select + * + from + q_data + where + shared_blks_written > 0 + order by + shared_blks_written desc + limit 100 + ) a + union + select * from ( + select + * + from + q_data + where + temp_blks_read > 0 + order by + 
temp_blks_read desc + limit 100 + ) a + union + select * from ( + select + * + from + q_data + where + temp_blks_written > 0 + order by + temp_blks_written desc + limit 100 + ) a + 15: |- + with /* pgwatch_generated */ q_data as ( + select + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + '-' as tag_query, + queryid::text as tag_queryid, + array_to_string(array_agg(distinct quote_ident(pg_get_userbyid(userid))), ',') as users, + sum(s.calls)::int8 as calls, + round(sum(s.total_exec_time)::numeric, 3)::double precision as total_time, + sum(shared_blks_hit)::int8 as shared_blks_hit, + sum(shared_blks_read)::int8 as shared_blks_read, + sum(shared_blks_written)::int8 as shared_blks_written, + sum(shared_blks_dirtied)::int8 as shared_blks_dirtied, + sum(temp_blks_read)::int8 as temp_blks_read, + sum(temp_blks_written)::int8 as temp_blks_written, + round(sum(blk_read_time)::numeric, 3)::double precision as blk_read_time, + round(sum(blk_write_time)::numeric, 3)::double precision as blk_write_time, + round(sum(temp_blk_read_time)::numeric, 3)::double precision as temp_blk_read_time, + round(sum(temp_blk_write_time)::numeric, 3)::double precision as temp_blk_write_time, + sum(wal_fpi) as wal_fpi, + sum(wal_bytes) as wal_bytes, + round(sum(s.total_plan_time)::numeric, 3)::double precision as total_plan_time + from + pg_stat_statements s + where + calls > 5 + and total_exec_time > 0 + and dbid = (select oid from pg_database where datname = current_database()) + and not upper(s.query) like any (array['DEALLOCATE%', 'SET %', 'RESET %', 'BEGIN%', 'BEGIN;', + 'COMMIT%', 'END%', 'ROLLBACK%', 'SHOW%']) + group by + queryid + ) + select * from ( + select + * + from + q_data + where + total_time > 0 + order by + total_time desc + limit 100 + ) a + union + select * from ( + select + * + from + q_data + order by + calls desc + limit 100 + ) a + union + select * from ( + select + * + from + q_data + where + shared_blks_read > 0 + order by + shared_blks_read desc + limit 100 + ) 
a + union + select * from ( + select + * + from + q_data + where + shared_blks_written > 0 + order by + shared_blks_written desc + limit 100 + ) a + union + select * from ( + select + * + from + q_data + where + temp_blks_read > 0 + order by + temp_blks_read desc + limit 100 + ) a + union + select * from ( + select + * + from + q_data + where + temp_blks_written > 0 + order by + temp_blks_written desc + limit 100 + ) a + 17: |- + with /* pgwatch_generated */ q_data as ( + select + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + '-' as tag_query, + queryid::text as tag_queryid, + array_to_string(array_agg(distinct quote_ident(pg_get_userbyid(userid))), ',') as users, + sum(s.calls)::int8 as calls, + round(sum(s.total_exec_time)::numeric, 3)::double precision as total_time, + sum(shared_blks_hit)::int8 as shared_blks_hit, + sum(shared_blks_read)::int8 as shared_blks_read, + sum(shared_blks_written)::int8 as shared_blks_written, + sum(shared_blks_dirtied)::int8 as shared_blks_dirtied, + sum(temp_blks_read)::int8 as temp_blks_read, + sum(temp_blks_written)::int8 as temp_blks_written, + round((sum(shared_blk_read_time) + sum(local_blk_read_time))::numeric, 3)::double precision AS blk_read_time, + round((sum(shared_blk_write_time) + sum(local_blk_write_time))::numeric, 3)::double precision AS blk_write_time, + round(sum(temp_blk_read_time)::numeric, 3)::double precision as temp_blk_read_time, + round(sum(temp_blk_write_time)::numeric, 3)::double precision as temp_blk_write_time, + sum(wal_fpi)::int8 as wal_fpi, + sum(wal_bytes)::int8 as wal_bytes, + round(sum(s.total_plan_time)::numeric, 3)::double precision as total_plan_time + from + pg_stat_statements s + where + calls > 5 + and total_exec_time > 0 + and dbid = (select oid from pg_database where datname = current_database()) + and not upper(s.query) like any (array['DEALLOCATE%', 'SET %', 'RESET %', 'BEGIN%', 'BEGIN;', + 'COMMIT%', 'END%', 'ROLLBACK%', 'SHOW%']) + group by + queryid + ) + select * from ( + 
select + * + from + q_data + where + total_time > 0 + order by + total_time desc + limit 100 + ) a + union + select * from ( + select + * + from + q_data + order by + calls desc + limit 100 + ) a + union + select * from ( + select + * + from + q_data + where + shared_blks_read > 0 + order by + shared_blks_read desc + limit 100 + ) a + union + select * from ( + select + * + from + q_data + where + shared_blks_written > 0 + order by + shared_blks_written desc + limit 100 + ) a + union + select * from ( + select + * + from + q_data + where + temp_blks_read > 0 + order by + temp_blks_read desc + limit 100 + ) a + union + select * from ( + select + * + from + q_data + where + temp_blks_written > 0 + order by + temp_blks_written desc + limit 100 + ) a; + metric_storage_name: stat_statements + subscription_stats: + description: > + This metric collects statistics from the `pg_stat_subscription_stats` view, which provides information about the status of logical replication subscriptions. + It includes details such as the number of apply and sync errors, which can help in monitoring the health of logical replication. + sqls: + 15: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + subname::text as tag_subname, + apply_error_count, + sync_error_count + from + pg_stat_subscription_stats + table_bloat_approx_stattuple: + description: > + This metric collects approximate table bloat statistics using the `pgstattuple_approx` function. + It provides insights into the amount of free space and dead tuples in tables, which can help in identifying bloat issues. 
+ sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + quote_ident(n.nspname)||'.'||quote_ident(c.relname) as tag_full_table_name, + approx_free_percent, + approx_free_space as approx_free_space_b, + approx_tuple_count, + dead_tuple_percent, + dead_tuple_len as dead_tuple_len_b + from + pg_class c + join lateral pgstattuple_approx(c.oid) st on (c.oid not in (select relation from pg_locks where mode = 'AccessExclusiveLock')) -- skip locked tables, + join pg_namespace n on n.oid = c.relnamespace + where + relkind in ('r', 'm') + and c.relpages >= 128 -- tables > 1mb + and not n.nspname like any (array[E'pg\\_%', 'information_schema']) + node_status: primary + gauges: + - '*' + table_bloat_approx_summary: + description: > + This metric provides a summary of approximate table bloat statistics, including the total bloat size and percentage for the current database. + It aggregates data from multiple tables to give an overview of bloat across the database. 
+      sqls:
+        11: |-
+          /* accessing pgstattuple_approx directly requires superuser or pg_stat_scan_tables/pg_monitor builtin roles or
+             execute grant on pgstattuple_approx(regclass)
+          */
+          with table_bloat_approx as (
+            select
+              avg(approx_free_percent)::double precision as approx_free_percent,
+              sum(approx_free_space)::double precision as approx_free_space,
+              avg(dead_tuple_percent)::double precision as dead_tuple_percent,
+              sum(dead_tuple_len)::double precision as dead_tuple_len
+            from
+              pg_class c
+            join
+              pg_namespace n on n.oid = c.relnamespace
+            join lateral pgstattuple_approx(c.oid) on (c.oid not in (select relation from pg_locks where mode = 'AccessExclusiveLock')) -- skip locked tables
+            where
+              relkind in ('r', 'm')
+              and c.relpages >= 128 -- tables >1mb
+              and not n.nspname like any (array[E'pg\\_%', 'information_schema'])
+          )
+          select /* pgwatch_generated */
+            (extract(epoch from now()) * 1e9)::int8 as epoch_ns,
+            approx_free_percent,
+            approx_free_space as approx_free_space_b,
+            dead_tuple_percent,
+            dead_tuple_len as dead_tuple_len_b
+          from
+            table_bloat_approx
+          where
+            approx_free_space > 0
+      gauges:
+        - '*'
+    table_bloat_approx_summary_sql:
+      description: >
+        This metric provides a summary of approximate table bloat statistics, including the total bloat size and percentage for the current database.
+        It aggregates data from multiple tables to give an overview of bloat across the database.
+      sqls:
+        11: |
+          WITH q_bloat AS (
+            SELECT
+              quote_ident(schemaname)||'.'||quote_ident(tblname) as full_table_name,
+              bloat_ratio as approx_bloat_percent,
+              bloat_size as approx_bloat_bytes,
+              fillfactor
+            FROM (
+
+            /* WARNING: executed with a non-superuser role, the query inspect only tables you are granted to read.
+ * This query is compatible with PostgreSQL 9.0 and more + */ + SELECT current_database(), + schemaname, + tblname, + bs * tblpages AS real_size, + (tblpages - est_tblpages) * bs AS extra_size, + CASE + WHEN tblpages - est_tblpages > 0 + THEN 100 * (tblpages - est_tblpages) / tblpages::float + ELSE 0 + END AS extra_ratio, + fillfactor, + CASE + WHEN tblpages - est_tblpages_ff > 0 + THEN (tblpages - est_tblpages_ff) * bs + ELSE 0 + END AS bloat_size, + CASE + WHEN tblpages - est_tblpages_ff > 0 + THEN 100 * (tblpages - est_tblpages_ff) / tblpages::float + ELSE 0 + END AS bloat_ratio, + is_na + -- , (pst).free_percent + (pst).dead_tuple_percent AS real_frag + FROM ( + SELECT ceil(reltuples / ((bs - page_hdr) / tpl_size)) + ceil(toasttuples / 4) AS est_tblpages, + ceil(reltuples / ((bs - page_hdr) * fillfactor / (tpl_size * 100))) + + ceil(toasttuples / 4) AS est_tblpages_ff, + tblpages, + fillfactor, + bs, + tblid, + schemaname, + tblname, + heappages, + toastpages, + is_na + -- , stattuple.pgstattuple(tblid) AS pst + FROM ( + SELECT (4 + tpl_hdr_size + tpl_data_size + (2 * ma) + - CASE WHEN tpl_hdr_size % ma = 0 THEN ma ELSE tpl_hdr_size % ma END + - CASE + WHEN ceil(tpl_data_size)::int % ma = 0 THEN ma + ELSE ceil(tpl_data_size)::int % ma END + ) AS tpl_size, + bs - page_hdr AS size_per_block, + (heappages + toastpages) AS tblpages, + heappages, + toastpages, + reltuples, + toasttuples, + bs, + page_hdr, + tblid, + schemaname, + tblname, + fillfactor, + is_na + FROM ( + SELECT tbl.oid AS tblid, + ns.nspname AS schemaname, + tbl.relname AS tblname, + tbl.reltuples, + tbl.relpages AS heappages, + coalesce(toast.relpages, 0) AS toastpages, + coalesce(toast.reltuples, 0) AS toasttuples, + coalesce(substring( + array_to_string(tbl.reloptions, ' ') + FROM 'fillfactor=([0-9]+)')::smallint, + 100) AS fillfactor, + current_setting('block_size')::numeric AS bs, + CASE + WHEN version() ~ 'mingw32' OR version() ~ '64-bit|x86_64|ppc64|ia64|amd64' + THEN 8 + ELSE 4 END AS ma, + 
24 AS page_hdr, + 23 + CASE + WHEN MAX(coalesce(null_frac, 0)) > 0 THEN (7 + count(*)) / 8 + ELSE 0::int END + + + CASE WHEN tbl.relhasoids THEN 4 ELSE 0 END AS tpl_hdr_size, + sum((1 - coalesce(s.null_frac, 0)) * coalesce(s.avg_width, 1024)) AS tpl_data_size, + bool_or(att.atttypid = 'pg_catalog.name'::regtype) + OR count(att.attname) <> count(s.attname) AS is_na + FROM pg_attribute AS att + JOIN pg_class AS tbl ON att.attrelid = tbl.oid + JOIN pg_namespace AS ns ON ns.oid = tbl.relnamespace + LEFT JOIN pg_stats AS s ON s.schemaname = ns.nspname + AND s.tablename = tbl.relname AND s.inherited = false AND + s.attname = att.attname + LEFT JOIN pg_class AS toast ON tbl.reltoastrelid = toast.oid + WHERE att.attnum > 0 + AND NOT att.attisdropped + AND tbl.relkind IN ('r', 'm') + AND ns.nspname != 'information_schema' + GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, tbl.relhasoids + ORDER BY 2, 3 + ) AS s + ) AS s2 + ) AS s3 + -- WHERE NOT is_na + ) s4 + ) + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + (select sum(approx_bloat_bytes) from q_bloat) as approx_table_bloat_b, + ((select sum(approx_bloat_bytes) from q_bloat) * 100 / pg_database_size(current_database()))::int8 as approx_bloat_percentage + 12: | + WITH q_bloat AS ( + SELECT quote_ident(schemaname) || '.' || quote_ident(tblname) as full_table_name, + bloat_ratio as approx_bloat_percent, + bloat_size as approx_bloat_bytes, + fillfactor + FROM ( + + /* WARNING: executed with a non-superuser role, the query inspect only tables you are granted to read. 
+ * This query is compatible with PostgreSQL 9.0 and more + */ + SELECT current_database(), + schemaname, + tblname, + bs * tblpages AS real_size, + (tblpages - est_tblpages) * bs AS extra_size, + CASE + WHEN tblpages > 0 AND tblpages - est_tblpages > 0 + THEN 100 * (tblpages - est_tblpages) / tblpages::float + ELSE 0 + END AS extra_ratio, + fillfactor, + CASE + WHEN tblpages - est_tblpages_ff > 0 + THEN (tblpages - est_tblpages_ff) * bs + ELSE 0 + END AS bloat_size, + CASE + WHEN tblpages > 0 AND tblpages - est_tblpages_ff > 0 + THEN 100 * (tblpages - est_tblpages_ff) / tblpages::float + ELSE 0 + END AS bloat_ratio, + is_na + -- , (pst).free_percent + (pst).dead_tuple_percent AS real_frag + FROM ( + SELECT ceil(reltuples / ((bs - page_hdr) / tpl_size)) + ceil(toasttuples / 4) AS est_tblpages, + ceil(reltuples / ((bs - page_hdr) * fillfactor / (tpl_size * 100))) + + ceil(toasttuples / 4) AS est_tblpages_ff, + tblpages, + fillfactor, + bs, + tblid, + schemaname, + tblname, + heappages, + toastpages, + is_na + -- , stattuple.pgstattuple(tblid) AS pst + FROM ( + SELECT (4 + tpl_hdr_size + tpl_data_size + (2 * ma) + - CASE WHEN tpl_hdr_size % ma = 0 THEN ma ELSE tpl_hdr_size % ma END + - CASE + WHEN ceil(tpl_data_size)::int % ma = 0 THEN ma + ELSE ceil(tpl_data_size)::int % ma END + ) AS tpl_size, + bs - page_hdr AS size_per_block, + (heappages + toastpages) AS tblpages, + heappages, + toastpages, + reltuples, + toasttuples, + bs, + page_hdr, + tblid, + schemaname, + tblname, + fillfactor, + is_na + FROM ( + SELECT tbl.oid AS tblid, + ns.nspname AS schemaname, + tbl.relname AS tblname, + tbl.reltuples, + tbl.relpages AS heappages, + coalesce(toast.relpages, 0) AS toastpages, + coalesce(toast.reltuples, 0) AS toasttuples, + coalesce(substring( + array_to_string(tbl.reloptions, ' ') + FROM 'fillfactor=([0-9]+)')::smallint, + 100) AS fillfactor, + current_setting('block_size')::numeric AS bs, + CASE + WHEN version() ~ 'mingw32' OR version() ~ 
'64-bit|x86_64|ppc64|ia64|amd64' + THEN 8 + ELSE 4 END AS ma, + 24 AS page_hdr, + 23 + CASE + WHEN MAX(coalesce(null_frac, 0)) > 0 THEN (7 + count(*)) / 8 + ELSE 0::int END + + + 0 AS tpl_hdr_size, + sum((1 - coalesce(s.null_frac, 0)) * coalesce(s.avg_width, 1024)) AS tpl_data_size, + bool_or(att.atttypid = 'pg_catalog.name'::regtype) + OR + count(att.attname) <> count(s.attname) AS is_na + FROM pg_attribute AS att + JOIN pg_class AS tbl ON att.attrelid = tbl.oid + JOIN pg_namespace AS ns ON ns.oid = tbl.relnamespace + LEFT JOIN pg_stats AS s ON s.schemaname = ns.nspname + AND s.tablename = tbl.relname AND s.inherited = false AND + s.attname = att.attname + LEFT JOIN pg_class AS toast ON tbl.reltoastrelid = toast.oid + WHERE att.attnum > 0 + AND NOT att.attisdropped + AND tbl.relkind IN ('r', 'm') + AND ns.nspname != 'information_schema' + GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 + ORDER BY 2, 3 + ) AS s + ) AS s2 + ) AS s3 + -- WHERE NOT is_na + ) s4 + ) + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + (select sum(approx_bloat_bytes) from q_bloat) as approx_table_bloat_b, + ((select sum(approx_bloat_bytes) from q_bloat) * 100 / pg_database_size(current_database()))::int8 as approx_bloat_percentage + gauges: + - '*' + table_hashes: + description: > + This metric collects hashes of table definitions to detect changes in the schema. + It uses the `pg_catalog.pg_tables` view to gather information about tables and their columns. + The hash is computed based on the table schema, name, and column definitions. 
+ sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + quote_ident(table_schema)||'.'||quote_ident(table_name) as tag_table, + md5((array_agg((c.*)::text order by ordinal_position))::text) + from ( + SELECT current_database()::information_schema.sql_identifier AS table_catalog, + nc.nspname::information_schema.sql_identifier AS table_schema, + c.relname::information_schema.sql_identifier AS table_name, + a.attname::information_schema.sql_identifier AS column_name, + a.attnum::information_schema.cardinal_number AS ordinal_position, + pg_get_expr(ad.adbin, ad.adrelid)::information_schema.character_data AS column_default, + CASE + WHEN a.attnotnull OR t.typtype = 'd'::"char" AND t.typnotnull THEN 'NO'::text + ELSE 'YES'::text + END::information_schema.yes_or_no AS is_nullable, + CASE + WHEN t.typtype = 'd'::"char" THEN + CASE + WHEN bt.typelem <> 0::oid AND bt.typlen = '-1'::integer THEN 'ARRAY'::text + WHEN nbt.nspname = 'pg_catalog'::name THEN format_type(t.typbasetype, NULL::integer) + ELSE 'USER-DEFINED'::text + END + ELSE + CASE + WHEN t.typelem <> 0::oid AND t.typlen = '-1'::integer THEN 'ARRAY'::text + WHEN nt.nspname = 'pg_catalog'::name THEN format_type(a.atttypid, NULL::integer) + ELSE 'USER-DEFINED'::text + END + END::information_schema.character_data AS data_type, + information_schema._pg_char_max_length(information_schema._pg_truetypid(a.*, t.*), information_schema._pg_truetypmod(a.*, t.*))::information_schema.cardinal_number AS character_maximum_length, + information_schema._pg_char_octet_length(information_schema._pg_truetypid(a.*, t.*), information_schema._pg_truetypmod(a.*, t.*))::information_schema.cardinal_number AS character_octet_length, + information_schema._pg_numeric_precision(information_schema._pg_truetypid(a.*, t.*), information_schema._pg_truetypmod(a.*, t.*))::information_schema.cardinal_number AS numeric_precision, + 
information_schema._pg_numeric_precision_radix(information_schema._pg_truetypid(a.*, t.*), information_schema._pg_truetypmod(a.*, t.*))::information_schema.cardinal_number AS numeric_precision_radix, + information_schema._pg_numeric_scale(information_schema._pg_truetypid(a.*, t.*), information_schema._pg_truetypmod(a.*, t.*))::information_schema.cardinal_number AS numeric_scale, + information_schema._pg_datetime_precision(information_schema._pg_truetypid(a.*, t.*), information_schema._pg_truetypmod(a.*, t.*))::information_schema.cardinal_number AS datetime_precision, + information_schema._pg_interval_type(information_schema._pg_truetypid(a.*, t.*), information_schema._pg_truetypmod(a.*, t.*))::information_schema.character_data AS interval_type, + NULL::integer::information_schema.cardinal_number AS interval_precision, + NULL::character varying::information_schema.sql_identifier AS character_set_catalog, + NULL::character varying::information_schema.sql_identifier AS character_set_schema, + NULL::character varying::information_schema.sql_identifier AS character_set_name, + CASE + WHEN nco.nspname IS NOT NULL THEN current_database() + ELSE NULL::name + END::information_schema.sql_identifier AS collation_catalog, + nco.nspname::information_schema.sql_identifier AS collation_schema, + co.collname::information_schema.sql_identifier AS collation_name, + CASE + WHEN t.typtype = 'd'::"char" THEN current_database() + ELSE NULL::name + END::information_schema.sql_identifier AS domain_catalog, + CASE + WHEN t.typtype = 'd'::"char" THEN nt.nspname + ELSE NULL::name + END::information_schema.sql_identifier AS domain_schema, + CASE + WHEN t.typtype = 'd'::"char" THEN t.typname + ELSE NULL::name + END::information_schema.sql_identifier AS domain_name, + current_database()::information_schema.sql_identifier AS udt_catalog, + COALESCE(nbt.nspname, nt.nspname)::information_schema.sql_identifier AS udt_schema, + COALESCE(bt.typname, t.typname)::information_schema.sql_identifier AS 
udt_name, + NULL::character varying::information_schema.sql_identifier AS scope_catalog, + NULL::character varying::information_schema.sql_identifier AS scope_schema, + NULL::character varying::information_schema.sql_identifier AS scope_name, + NULL::integer::information_schema.cardinal_number AS maximum_cardinality, + a.attnum::information_schema.sql_identifier AS dtd_identifier, + 'NO'::character varying::information_schema.yes_or_no AS is_self_referencing, + 'NO'::character varying::information_schema.yes_or_no AS is_identity, + NULL::character varying::information_schema.character_data AS identity_generation, + NULL::character varying::information_schema.character_data AS identity_start, + NULL::character varying::information_schema.character_data AS identity_increment, + NULL::character varying::information_schema.character_data AS identity_maximum, + NULL::character varying::information_schema.character_data AS identity_minimum, + NULL::character varying::information_schema.yes_or_no AS identity_cycle, + 'NEVER'::character varying::information_schema.character_data AS is_generated, + NULL::character varying::information_schema.character_data AS generation_expression, + CASE + WHEN c.relkind = 'r'::"char" OR (c.relkind = ANY (ARRAY['v'::"char", 'f'::"char"])) AND pg_column_is_updatable(c.oid::regclass, a.attnum, false) THEN 'YES'::text + ELSE 'NO'::text + END::information_schema.yes_or_no AS is_updatable + FROM pg_attribute a + LEFT JOIN pg_attrdef ad ON a.attrelid = ad.adrelid AND a.attnum = ad.adnum + JOIN (pg_class c + JOIN pg_namespace nc ON c.relnamespace = nc.oid) ON a.attrelid = c.oid + JOIN (pg_type t + JOIN pg_namespace nt ON t.typnamespace = nt.oid) ON a.atttypid = t.oid + LEFT JOIN (pg_type bt + JOIN pg_namespace nbt ON bt.typnamespace = nbt.oid) ON t.typtype = 'd'::"char" AND t.typbasetype = bt.oid + LEFT JOIN (pg_collation co + JOIN pg_namespace nco ON co.collnamespace = nco.oid) ON a.attcollation = co.oid AND (nco.nspname <> 'pg_catalog'::name OR 
co.collname <> 'default'::name) + WHERE NOT pg_is_other_temp_schema(nc.oid) AND a.attnum > 0 AND NOT a.attisdropped AND (c.relkind = ANY (ARRAY['r'::"char", 'v'::"char", 'f'::"char"])) + + ) c + where + not table_schema like any (array[E'pg\\_%', 'information_schema']) + group by + table_schema, table_name + order by + table_schema, table_name + table_io_stats: + description: > + This metric collects I/O statistics for tables, including heap and index block reads and hits. + It provides insights into the performance of table access patterns. + sqls: + 11: |- + select * from ( + with recursive + q_root_part as ( + select c.oid, + c.relkind, + n.nspname root_schema, + c.relname root_relname + from pg_class c + join pg_namespace n on n.oid = c.relnamespace + where relkind in ('p', 'r') + and relpersistence != 't' + and not n.nspname like any (array[E'pg\\_%', 'information_schema', E'\\_timescaledb%']) + and not exists(select * from pg_inherits where inhrelid = c.oid) + and exists(select * from pg_inherits where inhparent = c.oid) + ), + q_parts (relid, relkind, level, root) as ( + select oid, relkind, 1, oid + from q_root_part + union all + select inhrelid, c.relkind, level + 1, q.root + from pg_inherits i + join q_parts q on inhparent = q.relid + join pg_class c on c.oid = i.inhrelid + ), + q_tstats as ( + SELECT (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + relid, + schemaname::text as tag_schema, + relname::text as tag_table_name, + quote_ident(schemaname) || '.' 
|| quote_ident(relname) as tag_table_full_name, + heap_blks_read, + heap_blks_hit, + idx_blks_read, + idx_blks_hit, + toast_blks_read, + toast_blks_hit, + tidx_blks_read, + tidx_blks_hit + FROM pg_statio_user_tables + WHERE NOT schemaname LIKE E'pg\\_temp%' + AND (heap_blks_read > 0 OR heap_blks_hit > 0 OR idx_blks_read > 0 OR idx_blks_hit > 0 OR + tidx_blks_read > 0 OR + tidx_blks_hit > 0) + ) + select epoch_ns, + tag_schema, + tag_table_name, + tag_table_full_name, + 0 as is_part_root, + heap_blks_read, + heap_blks_hit, + idx_blks_read, + idx_blks_hit, + toast_blks_read, + toast_blks_hit, + tidx_blks_read, + tidx_blks_hit + from q_tstats + where not tag_schema like E'\\_timescaledb%' + and not exists (select * from q_root_part where oid = q_tstats.relid) + + union all + + select * + from ( + select epoch_ns, + quote_ident(qr.root_schema) as tag_schema, + quote_ident(qr.root_relname) as tag_table_name, + quote_ident(qr.root_schema) || '.' || quote_ident(qr.root_relname) as tag_table_full_name, + 1 as is_part_root, + sum(heap_blks_read)::int8, + sum(heap_blks_hit)::int8, + sum(idx_blks_read)::int8, + sum(idx_blks_hit)::int8, + sum(toast_blks_read)::int8, + sum(toast_blks_hit)::int8, + sum(tidx_blks_read)::int8, + sum(tidx_blks_hit)::int8 + from q_tstats ts + join q_parts qp on qp.relid = ts.relid + join q_root_part qr on qr.oid = qp.root + group by 1, 2, 3, 4 + ) x + ) y + order by + coalesce(heap_blks_read, 0) + + coalesce(heap_blks_hit, 0) + + coalesce(idx_blks_read, 0) + + coalesce(idx_blks_hit, 0) + + coalesce(toast_blks_read, 0) + + coalesce(toast_blks_hit, 0) + + coalesce(tidx_blks_read, 0) + + coalesce(tidx_blks_hit, 0) + desc limit 300 + table_stats: + description: > + This metric collects statistics about user tables, including size, vacuum status, and transaction freeze age. + It provides insights into the health and performance of tables in the database. 
+ sqls: + 11: |- + with recursive + q_root_part as ( + select c.oid, + c.relkind, + n.nspname root_schema, + c.relname root_relname + from pg_class c + join pg_namespace n on n.oid = c.relnamespace + where relkind in ('p', 'r') + and relpersistence != 't' + and not n.nspname like any (array[E'pg\\_%', 'information_schema', E'\\_timescaledb%']) + and not exists(select * from pg_inherits where inhrelid = c.oid) + and exists(select * from pg_inherits where inhparent = c.oid) + ), + q_parts (relid, relkind, level, root) as ( + select oid, relkind, 1, oid + from q_root_part + union all + select inhrelid, c.relkind, level + 1, q.root + from pg_inherits i + join q_parts q on inhparent = q.relid + join pg_class c on c.oid = i.inhrelid + ), + q_tstats as ( + select (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + relid, -- not sent to final output + quote_ident(schemaname) as tag_schema, + quote_ident(ut.relname) as tag_table_name, + quote_ident(schemaname) || '.' || quote_ident(ut.relname) as tag_table_full_name, + pg_table_size(relid) as table_size_b, + abs(greatest(ceil(log((pg_table_size(relid) + 1) / 10 ^ 6)), 0))::text as tag_table_size_cardinality_mb, -- i.e. 0=<1MB, 1=<10MB, 2=<100MB,.. 
+ pg_total_relation_size(relid) as total_relation_size_b, + case when reltoastrelid != 0 then pg_total_relation_size(reltoastrelid) else 0::int8 end as toast_size_b, + (extract(epoch from now() - greatest(last_vacuum, last_autovacuum)))::int8 as seconds_since_last_vacuum, + (extract(epoch from now() - greatest(last_analyze, last_autoanalyze)))::int8 as seconds_since_last_analyze, + case when 'autovacuum_enabled=off' = ANY (c.reloptions) then 1 else 0 end as no_autovacuum, + seq_scan, + seq_tup_read, + coalesce(idx_scan, 0) as idx_scan, + coalesce(idx_tup_fetch, 0) as idx_tup_fetch, + n_tup_ins, + n_tup_upd, + n_tup_del, + n_tup_hot_upd, + n_live_tup, + n_dead_tup, + vacuum_count, + autovacuum_count, + analyze_count, + autoanalyze_count, + case when c.relkind != 'p' then age(c.relfrozenxid) else 0 end as tx_freeze_age + from pg_stat_user_tables ut + join + pg_class c on c.oid = ut.relid + where + -- leaving out fully locked tables as pg_relation_size also wants a lock and would wait + not exists(select 1 from pg_locks where relation = relid and mode = 'AccessExclusiveLock') + and c.relpersistence != 't' -- and temp tables + ) + + select /* pgwatch_generated */ + epoch_ns, + tag_schema, + tag_table_name, + tag_table_full_name, + 0 as is_part_root, + table_size_b, + tag_table_size_cardinality_mb, -- i.e. 0=<1MB, 1=<10MB, 2=<100MB,.. 
+ total_relation_size_b, + toast_size_b, + seconds_since_last_vacuum, + seconds_since_last_analyze, + no_autovacuum, + seq_scan, + seq_tup_read, + idx_scan, + idx_tup_fetch, + n_tup_ins, + n_tup_upd, + n_tup_del, + n_tup_hot_upd, + n_live_tup, + n_dead_tup, + vacuum_count, + autovacuum_count, + analyze_count, + autoanalyze_count, + tx_freeze_age + from q_tstats + where not tag_schema like E'\\_timescaledb%' + and not exists (select * from q_root_part where oid = q_tstats.relid) + + union all + + select * from ( + select + epoch_ns, + quote_ident(qr.root_schema) as tag_schema, + quote_ident(qr.root_relname) as tag_table_name, + quote_ident(qr.root_schema) || '.' || quote_ident(qr.root_relname) as tag_table_full_name, + 1 as is_part_root, + sum(table_size_b)::int8 table_size_b, + abs(greatest(ceil(log((sum(table_size_b) + 1) / 10 ^ 6)), + 0))::text as tag_table_size_cardinality_mb, -- i.e. 0=<1MB, 1=<10MB, 2=<100MB,.. + sum(total_relation_size_b)::int8 total_relation_size_b, + sum(toast_size_b)::int8 toast_size_b, + min(seconds_since_last_vacuum)::int8 seconds_since_last_vacuum, + min(seconds_since_last_analyze)::int8 seconds_since_last_analyze, + sum(no_autovacuum)::int8 no_autovacuum, + sum(seq_scan)::int8 seq_scan, + sum(seq_tup_read)::int8 seq_tup_read, + sum(idx_scan)::int8 idx_scan, + sum(idx_tup_fetch)::int8 idx_tup_fetch, + sum(n_tup_ins)::int8 n_tup_ins, + sum(n_tup_upd)::int8 n_tup_upd, + sum(n_tup_del)::int8 n_tup_del, + sum(n_tup_hot_upd)::int8 n_tup_hot_upd, + sum(n_live_tup)::int8 n_live_tup, + sum(n_dead_tup)::int8 n_dead_tup, + sum(vacuum_count)::int8 vacuum_count, + sum(autovacuum_count)::int8 autovacuum_count, + sum(analyze_count)::int8 analyze_count, + sum(autoanalyze_count)::int8 autoanalyze_count, + max(tx_freeze_age)::int8 tx_freeze_age + from + q_tstats ts + join q_parts qp on qp.relid = ts.relid + join q_root_part qr on qr.oid = qp.root + group by + 1, 2, 3, 4 + ) x + order by table_size_b desc nulls last limit 300 + 16: |- + with recursive /* 
pgwatch_generated */ + q_root_part as ( + select c.oid, + c.relkind, + n.nspname root_schema, + c.relname root_relname + from pg_class c + join pg_namespace n on n.oid = c.relnamespace + where relkind in ('p', 'r') + and relpersistence != 't' + and not n.nspname like any (array[E'pg\\_%', 'information_schema', E'\\_timescaledb%']) + and not exists(select * from pg_inherits where inhrelid = c.oid) + and exists(select * from pg_inherits where inhparent = c.oid) + ), + q_parts (relid, relkind, level, root) as ( + select oid, relkind, 1, oid + from q_root_part + union all + select inhrelid, c.relkind, level + 1, q.root + from pg_inherits i + join q_parts q on inhparent = q.relid + join pg_class c on c.oid = i.inhrelid + ), + q_tstats as ( + select (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + relid, -- not sent to final output + quote_ident(schemaname) as tag_schema, + quote_ident(ut.relname) as tag_table_name, + quote_ident(schemaname) || '.' || quote_ident(ut.relname) as tag_table_full_name, + pg_table_size(relid) as table_size_b, + abs(greatest(ceil(log((pg_table_size(relid) + 1) / 10 ^ 6)), 0))::text as tag_table_size_cardinality_mb, -- i.e. 0=<1MB, 1=<10MB, 2=<100MB,.. 
+ pg_total_relation_size(relid) as total_relation_size_b, + case when c.reltoastrelid != 0 then pg_total_relation_size(c.reltoastrelid) else 0::int8 end as toast_size_b, + (extract(epoch from now() - greatest(last_vacuum, last_autovacuum)))::int8 as seconds_since_last_vacuum, + (extract(epoch from now() - greatest(last_analyze, last_autoanalyze)))::int8 as seconds_since_last_analyze, + case when 'autovacuum_enabled=off' = ANY (c.reloptions) then 1 else 0 end as no_autovacuum, + seq_scan, + seq_tup_read, + coalesce(idx_scan, 0) as idx_scan, + coalesce(idx_tup_fetch, 0) as idx_tup_fetch, + n_tup_ins, + n_tup_upd, + n_tup_del, + n_tup_hot_upd, + n_live_tup, + n_dead_tup, + vacuum_count, + autovacuum_count, + analyze_count, + autoanalyze_count, + case when c.relkind != 'p' then age(c.relfrozenxid) else 0 end as tx_freeze_age, + extract(epoch from now() - last_seq_scan)::int8 as last_seq_scan_s + from pg_stat_user_tables ut + join pg_class c on c.oid = ut.relid + left join pg_class t on t.oid = c.reltoastrelid + left join pg_index ti on ti.indrelid = t.oid + left join pg_class tir on tir.oid = ti.indexrelid + where + -- leaving out fully locked tables as pg_relation_size also wants a lock and would wait + not exists (select 1 from pg_locks where relation = relid and mode = 'AccessExclusiveLock') + and c.relpersistence != 't' -- and temp tables + order by case when c.relkind = 'p' then 1e9::int else coalesce(c.relpages, 0) + coalesce(t.relpages, 0) + coalesce(tir.relpages, 0) end desc + limit 1500 /* NB! When changing the bottom final LIMIT also adjust this limit. Should be at least 5x bigger as approx sizes depend a lot on vacuum frequency. + The general idea is to reduce filesystem "stat"-ing on tables that won't make it to final output anyways based on approximate size */ + ) + + select /* pgwatch_generated */ + epoch_ns, + tag_schema, + tag_table_name, + tag_table_full_name, + 0 as is_part_root, + table_size_b, + tag_table_size_cardinality_mb, -- i.e. 
0=<1MB, 1=<10MB, 2=<100MB,.. + total_relation_size_b, + toast_size_b, + seconds_since_last_vacuum, + seconds_since_last_analyze, + no_autovacuum, + seq_scan, + seq_tup_read, + idx_scan, + idx_tup_fetch, + n_tup_ins, + n_tup_upd, + n_tup_del, + n_tup_hot_upd, + n_live_tup, + n_dead_tup, + vacuum_count, + autovacuum_count, + analyze_count, + autoanalyze_count, + tx_freeze_age, + last_seq_scan_s + from q_tstats + where not tag_schema like E'\\_timescaledb%' + and not exists (select * from q_root_part where oid = q_tstats.relid) + + union all + + select * from ( + select + epoch_ns, + quote_ident(qr.root_schema) as tag_schema, + quote_ident(qr.root_relname) as tag_table_name, + quote_ident(qr.root_schema) || '.' || quote_ident(qr.root_relname) as tag_table_full_name, + 1 as is_part_root, + sum(table_size_b)::int8 table_size_b, + abs(greatest(ceil(log((sum(table_size_b) + 1) / 10 ^ 6)), + 0))::text as tag_table_size_cardinality_mb, -- i.e. 0=<1MB, 1=<10MB, 2=<100MB,.. + sum(total_relation_size_b)::int8 total_relation_size_b, + sum(toast_size_b)::int8 toast_size_b, + min(seconds_since_last_vacuum)::int8 seconds_since_last_vacuum, + min(seconds_since_last_analyze)::int8 seconds_since_last_analyze, + sum(no_autovacuum)::int8 no_autovacuum, + sum(seq_scan)::int8 seq_scan, + sum(seq_tup_read)::int8 seq_tup_read, + sum(idx_scan)::int8 idx_scan, + sum(idx_tup_fetch)::int8 idx_tup_fetch, + sum(n_tup_ins)::int8 n_tup_ins, + sum(n_tup_upd)::int8 n_tup_upd, + sum(n_tup_del)::int8 n_tup_del, + sum(n_tup_hot_upd)::int8 n_tup_hot_upd, + sum(n_live_tup)::int8 n_live_tup, + sum(n_dead_tup)::int8 n_dead_tup, + sum(vacuum_count)::int8 vacuum_count, + sum(autovacuum_count)::int8 autovacuum_count, + sum(analyze_count)::int8 analyze_count, + sum(autoanalyze_count)::int8 autoanalyze_count, + max(tx_freeze_age)::int8 tx_freeze_age, + min(last_seq_scan_s)::int8 last_seq_scan_s + from + q_tstats ts + join q_parts qp on qp.relid = ts.relid + join q_root_part qr on qr.oid = qp.root + group by + 
1, 2, 3, 4 + ) x + order by table_size_b desc nulls last limit 300 + gauges: + - table_size_b + - total_relation_size_b + - toast_size_b + - seconds_since_last_vacuum + - seconds_since_last_analyze + - n_live_tup + - n_dead_tup + statement_timeout_seconds: 300 + table_stats_approx: + description: > + This metric collects approximate statistics about user tables, including size, vacuum status, and transaction freeze age. + It provides insights into the health and performance of tables in the database. + sqls: + 11: |- + with recursive /* pgwatch_generated */ + q_root_part as ( + select c.oid, + c.relkind, + n.nspname root_schema, + c.relname root_relname + from pg_class c + join pg_namespace n on n.oid = c.relnamespace + where relkind in ('p', 'r') + and relpersistence != 't' + and not n.nspname like any (array[E'pg\\_%', 'information_schema', E'\\_timescaledb%']) + and not exists(select * from pg_inherits where inhrelid = c.oid) + and exists(select * from pg_inherits where inhparent = c.oid) + ), + q_parts (relid, relkind, level, root) as ( + select oid, relkind, 1, oid + from q_root_part + union all + select inhrelid, c.relkind, level + 1, q.root + from pg_inherits i + join q_parts q on inhparent = q.relid + join pg_class c on c.oid = i.inhrelid + ), + q_tstats as ( + with q_tbls_by_total_associated_relpages_approx as ( + select * from ( + select + c.oid, + c.relname, + c.relpages, + coalesce((select sum(relpages) from pg_class ci join pg_index i on i.indexrelid = ci.oid where i.indrelid = c.oid), 0) as index_relpages, + coalesce((select coalesce(ct.relpages, 0) + coalesce(cti.relpages, 0) from pg_class ct left join pg_index ti on ti.indrelid = ct.oid left join pg_class cti on cti.oid = ti.indexrelid where ct.oid = c.reltoastrelid), 0) as toast_relpages, + case when 'autovacuum_enabled=off' = ANY(c.reloptions) then 1 else 0 end as no_autovacuum, + case when c.relkind != 'p' then age(c.relfrozenxid) else 0 end as tx_freeze_age, + c.relpersistence + from + pg_class 
c + join pg_namespace n on n.oid = c.relnamespace + where + not n.nspname like any (array[E'pg\\_%', 'information_schema', E'\\_timescaledb%']) + and c.relkind = 'r' + and c.relpersistence != 't' + ) x + order by relpages + index_relpages + toast_relpages desc limit 300 + ), q_block_size as ( + select current_setting('block_size')::int8 as bs + ) + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + relid, + quote_ident(schemaname)||'.'||quote_ident(ut.relname) as tag_table_full_name, + bs * relpages as table_size_b, + abs(greatest(ceil(log((bs*relpages+1) / 10^6)), 0))::text as tag_table_size_cardinality_mb, -- i.e. 0=<1MB, 1=<10MB, 2=<100MB,.. + bs * (relpages + index_relpages + toast_relpages) as total_relation_size_b, + bs * toast_relpages as toast_size_b, + (extract(epoch from now() - greatest(last_vacuum, last_autovacuum)))::int8 as seconds_since_last_vacuum, + (extract(epoch from now() - greatest(last_analyze, last_autoanalyze)))::int8 as seconds_since_last_analyze, + no_autovacuum, + seq_scan, + seq_tup_read, + coalesce(idx_scan, 0) as idx_scan, + coalesce(idx_tup_fetch, 0) as idx_tup_fetch, + n_tup_ins, + n_tup_upd, + n_tup_del, + n_tup_hot_upd, + n_live_tup, + n_dead_tup, + vacuum_count, + autovacuum_count, + analyze_count, + autoanalyze_count, + tx_freeze_age, + relpersistence + from + pg_stat_user_tables ut + join q_tbls_by_total_associated_relpages_approx t on t.oid = ut.relid + join q_block_size on true + where + -- leaving out fully locked tables as pg_relation_size also wants a lock and would wait + not exists (select 1 from pg_locks where relation = relid and mode = 'AccessExclusiveLock') + order by relpages desc + ) + select /* pgwatch_generated */ + epoch_ns, + tag_table_full_name, + 0 as is_part_root, + table_size_b, + tag_table_size_cardinality_mb, -- i.e. 0=<1MB, 1=<10MB, 2=<100MB,.. 
+ total_relation_size_b, + toast_size_b, + seconds_since_last_vacuum, + seconds_since_last_analyze, + no_autovacuum, + seq_scan, + seq_tup_read, + idx_scan, + idx_tup_fetch, + n_tup_ins, + n_tup_upd, + n_tup_del, + n_tup_hot_upd, + n_live_tup, + n_dead_tup, + vacuum_count, + autovacuum_count, + analyze_count, + autoanalyze_count, + tx_freeze_age + from q_tstats + where not exists (select * from q_root_part where oid = q_tstats.relid) + union all + select * from ( + select + epoch_ns, + quote_ident(qr.root_schema) || '.' || quote_ident(qr.root_relname) as tag_table_full_name, + 1 as is_part_root, + sum(table_size_b)::int8 table_size_b, + abs(greatest(ceil(log((sum(table_size_b) + 1) / 10 ^ 6)), + 0))::text as tag_table_size_cardinality_mb, -- i.e. 0=<1MB, 1=<10MB, 2=<100MB,.. + sum(total_relation_size_b)::int8 total_relation_size_b, + sum(toast_size_b)::int8 toast_size_b, + min(seconds_since_last_vacuum)::int8 seconds_since_last_vacuum, + min(seconds_since_last_analyze)::int8 seconds_since_last_analyze, + sum(no_autovacuum)::int8 no_autovacuum, + sum(seq_scan)::int8 seq_scan, + sum(seq_tup_read)::int8 seq_tup_read, + sum(idx_scan)::int8 idx_scan, + sum(idx_tup_fetch)::int8 idx_tup_fetch, + sum(n_tup_ins)::int8 n_tup_ins, + sum(n_tup_upd)::int8 n_tup_upd, + sum(n_tup_del)::int8 n_tup_del, + sum(n_tup_hot_upd)::int8 n_tup_hot_upd, + sum(n_live_tup)::int8 n_live_tup, + sum(n_dead_tup)::int8 n_dead_tup, + sum(vacuum_count)::int8 vacuum_count, + sum(autovacuum_count)::int8 autovacuum_count, + sum(analyze_count)::int8 analyze_count, + sum(autoanalyze_count)::int8 autoanalyze_count, + max(tx_freeze_age)::int8 tx_freeze_age + from + q_tstats ts + join q_parts qp on qp.relid = ts.relid + join q_root_part qr on qr.oid = qp.root + group by + 1, 2 + ) x; + + gauges: + - table_size_b + - total_relation_size_b + - toast_size_b + - seconds_since_last_vacuum + - seconds_since_last_analyze + - n_live_tup + - n_dead_tup + metric_storage_name: table_stats + unused_indexes: + 
description: > + This metric collects information about unused indexes in the database. + It helps identify indexes that are not being used and can potentially be dropped to improve performance. + sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + * + from ( + select + format('%I.%I', sui.schemaname, sui.indexrelname) as tag_index_full_name, + sui.idx_scan, + coalesce(pg_relation_size(sui.indexrelid), 0) as index_size_b, + system_identifier::text as tag_sys_id /* to easily check also all replicas as could be still used there */ + from + pg_stat_user_indexes sui + join pg_index i on i.indexrelid = sui.indexrelid + join pg_control_system() on true + where not sui.schemaname like E'pg\\_temp%' + and idx_scan = 0 + and not (indisprimary or indisunique or indisexclusion) + and not exists (select * from pg_locks where relation = sui.relid and mode = 'AccessExclusiveLock') + ) x + where index_size_b > 100*1024^2 /* list >100MB only */ + order by index_size_b desc + limit 25 + vmstat: + description: > + This metric collects system-level statistics using the `vmstat` command. + It provides insights into memory usage, CPU load, and other system metrics. 
+ sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + r, b, swpd, free, buff, cache, si, so, bi, bo, "in", cs, us, sy, id, wa, st, cpu_count, load_1m, load_5m, load_15m, total_memory + from + get_vmstat() + init_sql: |- + CREATE EXTENSION IF NOT EXISTS plpython3u; + + CREATE OR REPLACE FUNCTION get_vmstat( + IN delay int default 1, + OUT r int, OUT b int, OUT swpd int8, OUT free int8, OUT buff int8, OUT cache int8, OUT si int8, OUT so int8, OUT bi int8, + OUT bo int8, OUT "in" int, OUT cs int, OUT us int, OUT sy int, OUT id int, OUT wa int, OUT st int, + OUT cpu_count int, OUT load_1m float4, OUT load_5m float4, OUT load_15m float4, OUT total_memory int8 + ) + LANGUAGE plpython3u + AS $FUNCTION$ + from os import cpu_count, popen + unit = 1024 # 'vmstat' default block byte size + + cpu_count = cpu_count() + vmstat_lines = popen('vmstat {} 2'.format(delay)).readlines() + vm = [int(x) for x in vmstat_lines[-1].split()] + # plpy.notice(vm) + load_1m, load_5m, load_15m = None, None, None + with open('/proc/loadavg', 'r') as f: + la_line = f.readline() + if la_line: + splits = la_line.split() + if len(splits) == 5: + load_1m, load_5m, load_15m = splits[0], splits[1], splits[2] + + total_memory = None + with open('/proc/meminfo', 'r') as f: + mi_line = f.readline() + splits = mi_line.split() + # plpy.notice(splits) + if len(splits) == 3: + total_memory = int(splits[1]) * 1024 + + return vm[0], vm[1], vm[2] * unit, vm[3] * unit, vm[4] * unit, vm[5] * unit, vm[6] * unit, vm[7] * unit, vm[8] * unit, \ + vm[9] * unit, vm[10], vm[11], vm[12], vm[13], vm[14], vm[15], vm[16], cpu_count, load_1m, load_5m, load_15m, total_memory + $FUNCTION$; + + GRANT EXECUTE ON FUNCTION get_vmstat(int) TO pgwatch; + COMMENT ON FUNCTION get_vmstat(int) IS 'created for pgwatch'; + wait_events: + description: > + This metric collects information about active queries that are waiting for events in the database. 
+ It provides insights into query performance and potential bottlenecks. + sqls: + 11: |- + with q_sa as ( + select * from pg_stat_activity where datname = current_database() and pid <> pg_backend_pid() + ) + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + wait_event_type as tag_wait_event_type, + wait_event as tag_wait_event, + count(*), + avg(abs(1e6* extract(epoch from now() - query_start)))::int8 as avg_query_duration_us, + max(abs(1e6* extract(epoch from now() - query_start)))::int8 as max_query_duration_us, + (select count(*) from q_sa where state = 'active') as total_active + from + q_sa + where + state = 'active' + and wait_event_type is not null + and wait_event_type <> 'Timeout' + group by + 1, 2, 3 + wal: + description: > + This metric collects information about the Write-Ahead Logging (WAL) system in PostgreSQL. + It provides insights into WAL activity, including the current WAL location, replay lag, and other related metrics. + sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + case + when pg_is_in_recovery() = false then + pg_wal_lsn_diff(pg_current_wal_lsn(), '0/0')::int8 + else + pg_wal_lsn_diff(pg_last_wal_replay_lsn(), '0/0')::int8 + end as xlog_location_b, + case when pg_is_in_recovery() then 1 else 0 end as in_recovery_int, + extract(epoch from (now() - pg_postmaster_start_time()))::int8 as postmaster_uptime_s, + system_identifier::text as tag_sys_id, + case + when pg_is_in_recovery() = false then + ('x'||substr(pg_walfile_name(pg_current_wal_lsn()), 1, 8))::bit(32)::int + else + (select min_recovery_end_timeline::int from pg_control_recovery()) + end as timeline + from pg_control_system() + gauges: + - '*' + is_instance_level: true + wal_receiver: + description: > + This metric collects information about the WAL receiver process in PostgreSQL. + It provides insights into the status of the WAL receiver, including replay lag and last replay timestamp. 
+ sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())::int8 as replay_lag_b, + extract(epoch from (now() - pg_last_xact_replay_timestamp()))::int8 as last_replay_s + node_status: standby + gauges: + - '*' + is_instance_level: true + wal_size: + description: > + This metric collects the size of the Write-Ahead Log (WAL) directory in PostgreSQL. + It provides insights into the total size of WAL files currently stored in the database. + sqls: + 11: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + sum(size)::int8 as wal_size_b + from pg_ls_waldir() + gauges: + - '*' + is_instance_level: true + wal_stats: + description: > + This metric collects statistics about the Write-Ahead Logging (WAL) system in PostgreSQL. + It provides insights into WAL activity, including the number of records, full page images, and write/sync times. + sqls: + 14: |- + select /* pgwatch_generated */ + (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + wal_records, + wal_fpi, + (wal_bytes / 1024)::int8 as wal_bytes_kb, + wal_buffers_full, + wal_write, + wal_sync, + wal_write_time::int8, + wal_sync_time::int8 + from + pg_stat_wal +presets: + aiven: + description: aiven database metrics + metrics: + archiver: 60 + backends: 60 + bgwriter: 60 + checkpointer: 60 + change_events: 300 + db_size: 300 + db_stats: 60 + index_col_stats: 86400 + index_stats: 900 + locks: 60 + locks_mode: 60 + recommendations: 43200 + replication: 120 + replication_slots: 120 + sequence_health: 3600 + settings: 7200 + sproc_stats: 180 + stat_activity: 60 + stat_statements: 180 + stat_statements_calls: 60 + table_bloat_approx_summary_sql: 7200 + table_io_stats: 300 + table_stats: 300 + wal: 60 + wal_receiver: 120 + aurora: + description: AWS Aurora doesn't expose all Postgres functions and there's no WAL + metrics: + archiver: 60 + backends: 60 + bgwriter: 60 
+ checkpointer: 60 + change_events: 300 + db_size: 300 + db_stats_aurora: 60 + index_stats: 900 + locks: 60 + locks_mode: 60 + replication: 120 + replication_slots: 120 + settings: 7200 + sproc_stats: 180 + stat_statements: 180 + stat_statements_calls: 60 + table_bloat_approx_summary_sql: 7200 + table_io_stats: 600 + table_stats: 300 + wal_receiver: 120 + azure: + description: similar to 'exhaustive' with stuff that's not accessible on Azure Database for PostgreSQL removed + metrics: + archiver: 60 + backends: 60 + bgwriter: 60 + checkpointer: 60 + change_events: 300 + db_size: 300 + db_stats: 60 + index_stats: 900 + kpi: 120 + locks: 60 + locks_mode: 60 + replication: 60 + replication_slots: 60 + sequence_health: 3600 + settings: 7200 + sproc_stats: 180 + stat_ssl: 60 + stat_statements: 180 + stat_statements_calls: 60 + table_bloat_approx_summary_sql: 7200 + table_io_stats: 600 + wal: 60 + wal_receiver: 60 + wal_size: 300 + basic: + description: only the most important metrics - WAL, DB-level statistics (size, tx and backend counts) + metrics: + instance_up: 60 + db_size: 300 + db_stats: 60 + wal: 60 + exhaustive: + description: all important metrics for a deeper performance understanding + metrics: + archiver: 60 + backends: 60 + bgwriter: 60 + checkpointer: 60 + change_events: 300 + cpu_load: 60 + db_size: 300 + db_stats: 60 + index_stats: 900 + instance_up: 60 + locks: 60 + locks_mode: 60 + replication: 120 + replication_slots: 120 + sequence_health: 3600 + settings: 7200 + sproc_stats: 180 + stat_activity: 30 + stat_statements: 180 + stat_statements_calls: 60 + table_bloat_approx_summary_sql: 7200 + table_io_stats: 600 + table_stats: 300 + wal: 60 + wal_receiver: 120 + wal_size: 300 + full: + description: almost all available metrics for a even deeper performance understanding + metrics: + archiver: 60 + archiver_pending_count: 300 + backends: 60 + bgwriter: 60 + checkpointer: 60 + change_events: 300 + cpu_load: 60 + datfrozenxid: 3600 + db_size: 300 + 
db_stats: 60 + index_stats: 900 + instance_up: 60 + kpi: 120 + locks: 60 + locks_mode: 60 + logical_subscriptions: 120 + postgres_role: 60 + psutil_cpu: 120 + psutil_disk: 120 + psutil_disk_io_total: 120 + psutil_mem: 120 + recommendations: 43200 + replication: 120 + replication_slots: 120 + sequence_health: 3600 + server_log_event_counts: 60 + settings: 7200 + sproc_stats: 180 + stat_activity: 30 + stat_ssl: 120 + stat_statements: 180 + stat_statements_calls: 60 + table_bloat_approx_summary_sql: 7200 + table_io_stats: 600 + table_stats: 300 + wal: 60 + wal_receiver: 120 + wal_size: 120 + gce: + description: similar to 'exhaustive' with stuff not accessible on GCE managed PostgreSQL engine removed + metrics: + archiver: 60 + backends: 60 + bgwriter: 60 + checkpointer: 60 + change_events: 300 + db_size: 300 + db_stats: 60 + index_stats: 900 + locks: 60 + locks_mode: 60 + replication: 120 + replication_slots: 120 + sequence_health: 3600 + settings: 7200 + sproc_stats: 180 + stat_statements: 180 + stat_statements_calls: 60 + table_bloat_approx_summary_sql: 7200 + table_io_stats: 600 + table_stats: 300 + wal: 60 + wal_receiver: 120 + minimal: + description: single "Key Performance Indicators" query for fast cluster/db overview + metrics: + instance_up: 60 + kpi: 60 + pgbouncer: + description: pgbouncer stats + metrics: + pgbouncer_stats: 60 + pgbouncer_clients: 60 + pgpool: + description: pgpool stats + metrics: + pgpool_stats: 60 + pgpool_processes: 60 + prometheus-async: + description: Tuned for the Prometheus async scrapping + metrics: + backends: 30 + bgwriter: 60 + checkpointer: 60 + db_size: 300 + db_stats: 30 + locks_mode: 30 + replication: 120 + replication_slots: 120 + settings: 300 + sproc_stats: 180 + stat_statements_calls: 60 + table_io_stats: 300 + table_stats: 300 + wait_events: 60 + wal: 60 + rds: + description: similar to 'exhaustive' with stuff that's not accessible on AWS RDS removed + metrics: + archiver: 60 + backends: 60 + bgwriter: 60 + 
checkpointer: 60 + change_events: 300 + db_size: 300 + db_stats: 60 + index_stats: 900 + locks: 60 + locks_mode: 60 + replication: 120 + replication_slots: 120 + sequence_health: 3600 + settings: 7200 + sproc_stats: 180 + stat_activity: 30 + stat_statements: 180 + stat_statements_calls: 60 + table_bloat_approx_summary_sql: 7200 + table_io_stats: 600 + table_stats: 300 + wal: 60 + wal_receiver: 120 + standard: + description: basic level + table, index, stat_statements stats + metrics: + cpu_load: 60 + db_size: 300 + db_stats: 60 + index_stats: 900 + instance_up: 60 + sequence_health: 3600 + sproc_stats: 180 + stat_statements: 180 + table_stats: 300 + wal: 60 + exhaustive_no_python: + description: like exhaustive, but no PL/Python helpers + metrics: + archiver: 60 + backends: 60 + bgwriter: 60 + checkpointer: 60 + change_events: 300 + db_size: 300 + db_stats: 60 + index_stats: 900 + instance_up: 60 + locks: 60 + locks_mode: 60 + replication: 120 + replication_slots: 120 + sequence_health: 3600 + settings: 7200 + sproc_stats: 180 + stat_activity: 30 + stat_statements: 180 + stat_statements_calls: 60 + table_bloat_approx_summary_sql: 7200 + table_io_stats: 600 + table_stats: 300 + wal: 60 + wal_receiver: 120 + wal_size: 300 + unprivileged: + description: no wrappers + only pg_stat_statements extension expected (developer mode) + metrics: + archiver: 60 + bgwriter: 60 + checkpointer: 60 + change_events: 300 + db_size: 300 + db_stats: 60 + index_stats: 900 + instance_up: 60 + locks: 60 + locks_mode: 60 + replication: 120 + replication_slots: 120 + sequence_health: 3600 + settings: 7200 + sproc_stats: 180 + stat_statements_calls: 60 + table_io_stats: 600 + table_stats: 300 + wal: 60 \ No newline at end of file diff --git a/config/pgwatch-prometheus/sources.yml b/config/pgwatch-prometheus/sources.yml index 66a424f..c6cea94 100644 --- a/config/pgwatch-prometheus/sources.yml +++ b/config/pgwatch-prometheus/sources.yml @@ -5,7 +5,24 @@ conn_str: 
postgresql://pgwatch_monitor:monitor_pass@target-db:5432/target_database kind: postgres custom_metrics: - pg_stat_statements_calls: 30 + pg_stat_statements_calls: 1 + backends: 1 + bgwriter: 1 + checkpointer: 1 + db_size: 1 + db_stats: 1 + locks_mode: 1 + replication: 1 + replication_slots: 1 + settings: 1 + sproc_stats: 1 + stat_statements: 1 + stat_statements_calls: 1 + table_io_stats: 1 + table_stats: 1 + wait_events: 1 + wal: 1 + custom_tags: env: demo cluster: local diff --git a/config/prometheus/prometheus.yml b/config/prometheus/prometheus.yml index 61a876b..d3eb569 100644 --- a/config/prometheus/prometheus.yml +++ b/config/prometheus/prometheus.yml @@ -1,6 +1,7 @@ global: - scrape_interval: 1s - evaluation_interval: 1s + scrape_interval: 15s # Default scrape interval + evaluation_interval: 15s # Default evaluation interval + scrape_timeout: 10s # Global scrape timeout rule_files: # - "first_rules.yml" @@ -10,5 +11,6 @@ scrape_configs: - job_name: 'pgwatch-prometheus' static_configs: - targets: ['pgwatch-prometheus:9091'] - scrape_interval: 30s + scrape_interval: 30s # How often to scrape PGWatch + scrape_timeout: 25s # Timeout for each scrape (must be < scrape_interval) metrics_path: /pgwatch \ No newline at end of file diff --git a/old-metrics.yml b/old-metrics.yml new file mode 100644 index 0000000..e69de29 -- GitLab