summaryrefslogtreecommitdiff
path: root/check_postgres.pl
diff options
context:
space:
mode:
Diffstat (limited to 'check_postgres.pl')
-rwxr-xr-xcheck_postgres.pl288
1 files changed, 211 insertions, 77 deletions
diff --git a/check_postgres.pl b/check_postgres.pl
index eebed9bfe..8a71853ad 100755
--- a/check_postgres.pl
+++ b/check_postgres.pl
@@ -99,8 +99,12 @@ our %msg = (
'bloat-nomin' => q{no relations meet the minimum bloat criteria},
'bloat-table' => q{(db $1) table $2.$3 rows:$4 pages:$5 shouldbe:$6 ($7X) wasted size:$8 ($9)},
'bug-report' => q{Please report these details to [email protected]:},
+ 'checkcluster-id' => q{Database system identifier:},
+ 'checkcluster-msg' => q{cluster_id: $1},
+ 'checkcluster-nomrtg'=> q{Must provide a number via the --mrtg option},
'checkmode-state' => q{Database cluster state:},
'checkmode-recovery' => q{in archive recovery},
+ 'checkmode-prod' => q{in production},
'checkpoint-baddir' => q{Invalid data_directory: "$1"},
'checkpoint-baddir2' => q{pg_controldata could not read the given data directory: "$1"},
'checkpoint-badver' => q{Failed to run pg_controldata - probably the wrong version ($1)},
@@ -108,7 +112,7 @@ our %msg = (
'checkpoint-nodir' => q{Must supply a --datadir argument or set the PGDATA environment variable},
'checkpoint-nodp' => q{Must install the Perl module Date::Parse to use the checkpoint action},
'checkpoint-noparse' => q{Unable to parse pg_controldata output: "$1"},
- 'checkpoint-noregex' => q{Call to pg_controldata $1 failed},
+ 'checkpoint-noregex' => q{Unable to find the regex for this check},
'checkpoint-nosys' => q{Could not call pg_controldata: $1},
'checkpoint-ok' => q{Last checkpoint was 1 second ago},
'checkpoint-ok2' => q{Last checkpoint was $1 seconds ago},
@@ -160,6 +164,7 @@ our %msg = (
'logfile-stderr' => q{Logfile output has been redirected to stderr: please provide a filename},
'logfile-syslog' => q{Database is using syslog, please specify path with --logfile option (fac=$1)},
'mode-standby' => q{Server in standby mode},
+ 'mode' => q{mode},
'mrtg-fail' => q{Action $1 failed: $2},
'new-ver-nocver' => q{Could not download version information for $1},
'new-ver-badver' => q{Could not parse version information for $1},
@@ -353,7 +358,7 @@ our %msg = (
'checkpoint-nodir' => q{Vous devez fournir un argument --datadir ou configurer la variable d'environnement PGDATA},
'checkpoint-nodp' => q{Vous devez installer le module Perl Date::Parse pour utiliser l'action checkpoint},
'checkpoint-noparse' => q{Incapable d'analyser le résultat de la commande pg_controldata : "$1"},
- 'checkpoint-noregex' => q{Échec de l'appel à pg_controldata $1},
+ 'checkpoint-noregex' => q{La regex pour ce test n'a pas été trouvée},
'checkpoint-nosys' => q{N'a pas pu appeler pg_controldata : $1},
'checkpoint-ok' => q{Le dernier CHECKPOINT est survenu il y a une seconde},
'checkpoint-ok2' => q{Le dernier CHECKPOINT est survenu il y a $1 secondes},
@@ -931,6 +936,7 @@ GetOptions(
'debugoutput=s',
'no-check_postgresrc',
'assume-standby-mode',
+ 'assume-prod',
'action=s',
'warning=s',
@@ -1112,6 +1118,7 @@ our $action_info = {
backends => [1, 'Number of connections, compared to max_connections.'],
bloat => [0, 'Check for table and index bloat.'],
checkpoint => [1, 'Checks how long since the last checkpoint'],
+ cluster_id => [1, 'Checks the Database System Identifier'],
commitratio => [0, 'Report if the commit ratio of a database is too low.'],
connection => [0, 'Simple connection check.'],
custom_query => [0, 'Run a custom query.'],
@@ -1202,6 +1209,7 @@ Limit options:
Other options:
--assume-standby-mode assume that server in continious WAL recovery mode
+ --assume-prod assume that server in production mode
--PSQL=FILE location of the psql executable; avoid using if possible
-v, --verbose verbosity level; can be used more than once to increase the level
-h, --help display this help information
@@ -1248,7 +1256,9 @@ if ($opt{showtime}) {
## Check the current database mode
our $STANDBY = 0;
+our $MASTER = 0;
make_sure_standby_mode() if $opt{'assume-standby-mode'};
+make_sure_prod() if $opt{'assume-prod'};
## We don't (usually) want to die, but want a graceful Nagios-like exit instead
sub ndie {
@@ -1517,59 +1527,37 @@ sub do_mrtg_stats {
do_mrtg({one => $one, two => $two, msg => $msg});
}
-sub make_sure_standby_mode {
+sub make_sure_mode_is {
- ## Checks if database in standby mode
## Requires $ENV{PGDATA} or --datadir
- ## Find the data directory, make sure it exists
- my $dir = $opt{datadir} || $ENV{PGDATA};
-
- if (!defined $dir or ! length $dir) {
- ndie msg('checkpoint-nodir');
- }
-
- if (! -d $dir) {
- ndie msg('checkpoint-baddir', $dir);
- }
-
$db->{host} = '<none>';
## Run pg_controldata, grab the mode
- my $pgc
- = $ENV{PGCONTROLDATA} ? $ENV{PGCONTROLDATA}
- : $ENV{PGBINDIR} ? "$ENV{PGBINDIR}/pg_controldata"
- : 'pg_controldata';
- $COM = qq{$pgc "$dir"};
- eval {
- $res = qx{$COM 2>&1};
- };
- if ($@) {
- ndie msg('checkpoint-nosys', $@);
- }
-
- ## If the path is echoed back, we most likely have an invalid data dir
- if ($res =~ /$dir/) {
- ndie msg('checkpoint-baddir2', $dir);
- }
-
- if ($res =~ /WARNING: Calculated CRC checksum/) {
- ndie msg('checkpoint-badver', $dir);
- }
- if ($res !~ /^pg_control.+\d+/) {
- ndie msg('checkpoint-badver2');
- }
+ $res = open_controldata();
my $regex = msg('checkmode-state');
if ($res !~ /$regex\s*(.+)/) { ## no critic (ProhibitUnusedCapture)
## Just in case, check the English one as well
$regex = msg_en('checkmode-state');
if ($res !~ /$regex\s*(.+)/) {
- ndie msg('checkpoint-noregex', $dir);
+ ndie msg('checkpoint-noregex');
}
}
my $last = $1;
- $regex = msg('checkmode-recovery');
+
+ return $last;
+
+}
+
+sub make_sure_standby_mode {
+
+ ## Checks if database in standby mode
+ ## Requires $ENV{PGDATA} or --datadir
+
+ my $last = make_sure_mode_is();
+
+ my $regex = msg('checkmode-recovery');
if ($last =~ /$regex/) {
$STANDBY = 1;
}
@@ -1578,6 +1566,21 @@ sub make_sure_standby_mode {
} ## end of make_sure_standby_mode
+sub make_sure_prod {
+
+ ## Checks if database in production mode
+ ## Requires $ENV{PGDATA} or --datadir
+
+ my $last = make_sure_mode_is();
+
+ my $regex = msg('checkmode-prod');
+ if ($last =~ /$regex/) {
+ $MASTER = 1;
+ }
+
+ return;
+
+} ## end of make_sure_production_mode
sub finishup {
@@ -1919,6 +1922,9 @@ check_dbstats() if $action eq 'dbstats';
## Check how long since the last checkpoint
check_checkpoint() if $action eq 'checkpoint';
+## Check the Datasbae System Identifier
+check_cluster_id() if $action eq 'cluster_id';
+
## Check for disabled triggers
check_disabled_triggers() if $action eq 'disabled_triggers';
@@ -3114,6 +3120,50 @@ sub perfname {
} ## end of perfname;
+sub open_controldata {
+ ## Requires $ENV{PGDATA} or --datadir
+
+ ## Find the data directory, make sure it exists
+ my $dir = $opt{datadir} || $ENV{PGDATA};
+
+ if (!defined $dir or ! length $dir) {
+ ndie msg('checkpoint-nodir');
+ }
+
+ if (! -d $dir) {
+ ndie msg('checkpoint-baddir', $dir);
+ }
+
+ ## Run pg_controldata
+ my $pgc
+ = $ENV{PGCONTROLDATA} ? $ENV{PGCONTROLDATA}
+ : $ENV{PGBINDIR} ? "$ENV{PGBINDIR}/pg_controldata"
+ : 'pg_controldata';
+ $COM = qq{$pgc "$dir"};
+ eval {
+ $res = qx{$COM 2>&1};
+ };
+ if ($@) {
+ ndie msg('checkpoint-nosys', $@);
+ }
+
+ ## If the path is echoed back, we most likely have an invalid data dir
+ if ($res =~ /$dir/) {
+ ndie msg('checkpoint-baddir2', $dir);
+ }
+
+ if ($res =~ /WARNING: Calculated CRC checksum/) {
+ ndie msg('checkpoint-badver', $dir);
+ }
+ if ($res !~ /^pg_control.+\d+/) {
+ ndie msg('checkpoint-badver2');
+ }
+
+ ## return the pg_controldata output
+ return $res;
+}
+
+
sub check_archive_ready {
## Check on the number of WAL archive with status "ready"
@@ -3632,7 +3682,6 @@ FROM (
} ## end of check_bloat
-
sub check_checkpoint {
## Checks how long in seconds since the last checkpoint on a WAL slave
@@ -3645,6 +3694,9 @@ sub check_checkpoint {
## may make more sense on the master, or we may want to look at
## the WAL segments received/processed instead of the checkpoint
## timestamp.
+ ## This checks can use the optionnal --asume-standby-mode or
+ ## --assume-prod: if the mode found is not the mode assumed, a
+ ## CRITICAL is emitted.
## Supports: Nagios, MRTG
## Warning and critical are seconds
@@ -3657,50 +3709,17 @@ sub check_checkpoint {
forcemrtg => 1,
});
- ## Find the data directory, make sure it exists
- my $dir = $opt{datadir} || $ENV{PGDATA};
-
- if (!defined $dir or ! length $dir) {
- ndie msg('checkpoint-nodir');
- }
-
- if (! -d $dir) {
- ndie msg('checkpoint-baddir', $dir);
- }
-
$db->{host} = '<none>';
## Run pg_controldata, grab the time
- my $pgc
- = $ENV{PGCONTROLDATA} ? $ENV{PGCONTROLDATA}
- : $ENV{PGBINDIR} ? "$ENV{PGBINDIR}/pg_controldata"
- : 'pg_controldata';
- $COM = qq{$pgc "$dir"};
- eval {
- $res = qx{$COM 2>&1};
- };
- if ($@) {
- ndie msg('checkpoint-nosys', $@);
- }
-
- ## If the path is echoed back, we most likely have an invalid data dir
- if ($res =~ /$dir/) {
- ndie msg('checkpoint-baddir2', $dir);
- }
-
- if ($res =~ /WARNING: Calculated CRC checksum/) {
- ndie msg('checkpoint-badver', $pgc);
- }
- if ($res !~ /^pg_control.+\d+/) {
- ndie msg('checkpoint-badver2');
- }
+ $res = open_controldata();
my $regex = msg('checkpoint-po');
if ($res !~ /$regex\s*(.+)/) { ## no critic (ProhibitUnusedCapture)
## Just in case, check the English one as well
$regex = msg_en('checkpoint-po');
if ($res !~ /$regex\s*(.+)/) {
- ndie msg('checkpoint-noregex', $dir);
+ ndie msg('checkpoint-noregex');
}
}
my $last = $1;
@@ -3722,11 +3741,35 @@ sub check_checkpoint {
$db->{perf} = sprintf '%s=%s;%s;%s',
perfname(msg('age')), $diff, $warning, $critical;
+ my $mode = '';
+ if ($STANDBY) {
+ $mode = 'STANDBY';
+ }
+ if ($MASTER) {
+ $mode = 'MASTER';
+ }
+
+ ## If we have an assume flag, then honor it.
+ my $goodmode = 1;
+ if ($opt{'assume-standby-mode'} and not $STANDBY) {
+ $goodmode = 0;
+ $mode = 'NOT STANDBY';
+ }
+ elsif ($opt{'assume-prod'} and not $MASTER) {
+ $goodmode = 0;
+ $mode = 'NOT MASTER';
+ }
+
+ if (length($mode) > 0) {
+ $db->{perf} .= sprintf ' %s=%s',
+ perfname(msg('mode')), $mode;
+ }
+
if ($MRTG) {
do_mrtg({one => $diff, msg => $msg});
}
- if (length $critical and $diff >= $critical) {
+ if ((length $critical and $diff >= $critical) or not $goodmode) {
add_critical $msg;
return;
}
@@ -3743,6 +3786,58 @@ sub check_checkpoint {
} ## end of check_checkpoint
+sub check_cluster_id {
+
+
+ ## Verify the Database System Identifier provided by pg_controldata
+ ## Supports: Nagios, MRTG
+ ## One of warning or critical must be given (but not both)
+ ## It should run one time to find out the expected cluster-id
+ ## You can use --critical="0" to find out the current cluster-id
+ ## You can include or exclude settings as well
+ ## Example:
+ ## check_postgres_cluster_id --critical="5633695740047915125"
+
+ my ($warning, $critical) = validate_range({type => 'integer', onlyone => 1});
+
+ $db->{host} = '<none>';
+
+ ## Run pg_controldata, grab the cluster-id
+ $res = open_controldata();
+
+ my $regex = msg('checkcluster-id');
+ if ($res !~ /$regex\s*(.+)/) { ## no critic (ProhibitUnusedCapture)
+ ## Just in case, check the English one as well
+ $regex = msg_en('checkcluster-id');
+ if ($res !~ /$regex\s*(.+)/) {
+ ndie msg('checkpoint-noregex');
+ }
+ }
+ my $ident = $1;
+
+ my $msg = msg('checkcluster-msg', $ident);
+ if ($MRTG) {
+ $opt{mrtg} or ndie msg('checksum-nomrtg');
+ do_mrtg({one => $opt{mrtg} eq $ident ? 1 : 0, msg => $ident});
+ }
+ if ($critical and $critical ne $ident) {
+ add_critical $msg;
+ }
+ elsif ($warning and $warning ne $ident) {
+ add_warning $msg;
+ }
+ elsif (!$critical and !$warning) {
+ add_unknown $msg;
+ }
+ else {
+ add_ok $msg;
+ }
+
+ return;
+
+} ## end of check_cluster_id
+
+
sub check_commitratio {
## Check the commitratio of one or more databases
@@ -7814,6 +7909,16 @@ Example:
postgres@db$./check_postgres.pl --action=version --warning=8.1 --datadir /var/lib/postgresql/8.3/main/ --assume-standby-mode
POSTGRES_VERSION OK: Server in standby mode | time=0.00
+=item B<--assume-prod>
+
+If specified, check if server in production mode is performed (--datadir is required).
+The option is only relevant for (C<symlink: check_postgres_checkpoint>).
+
+Example:
+
+ postgres@db$./check_postgres.pl --action=checkpoint --datadir /var/lib/postgresql/8.3/main/ --assume-prod
+ POSTGRES_CHECKPOINT OK: Last checkpoint was 72 seconds ago | age=72;;300 mode=MASTER
+
=item B<-h> or B<--help>
Displays a help screen with a summary of all actions and options.
@@ -8084,6 +8189,8 @@ was run, as determined by parsing the call to C<pg_controldata>. Because of this
pg_controldata executable must be available in the current path. Alternatively, you can
set the environment variable C<PGCONTROLDATA> to the exact location of the pg_controldata
executable, or you can specify C<PGBINDIR> as the directory that it lives in.
+It is also possible to use the special options I<--assume-prod> or
+I<--assume-standby-mode>, if the mode found is not the one expected, a CRITICAL is emitted.
At least one warning or critical argument must be set.
@@ -8091,6 +8198,27 @@ This action requires the Date::Parse module.
For MRTG or simple output, returns the number of seconds.
+=head2 B<cluster_id>
+
+(C<symlink: check_postgres_cluster-id>) Checks that the Database System Identifier
+provided by pg_controldata is the same as last time you checked. This must run on the same
+server as the database that is being checked (e.g. the -h flag will not work).
+Either the I<--warning> or the I<--critical> option should be given, but not both. The value
+of each one is the cluster identifier, an integer value. You can run with the special C<--critical=0> option
+to find out an existing cluster identifier.
+
+Example 1: Find the initial identifier
+
+ check_postgres_cluster_id --critical=0 --datadir=/var//lib/postgresql/9.0/main
+
+Example 2: Make sure the cluster is the same and warn if not, using the result from above.
+
+ check_postgres_cluster_id --critical=5633695740047915135
+
+For MRTG output, returns a 1 or 0 indicating success of failure of the identifier to match. A
+identifier must be provided as the C<--mrtg> argument. The fourth line always gives the
+current identifier.
+
=head2 B<commitratio>
(C<symlink: check_postgres_commitratio>) Checks the commit ratio of all databases and complains when they are too low.
@@ -9366,6 +9494,12 @@ Items not specifically attributed are by GSM (Greg Sabino Mullane).
=over 4
+=item B<Version 2.19.0>
+
+ Add the --assume-prod option (Cédric Villemain)
+ Add the cluster_id check (Cédric Villemain)
+ Improve settings_checksum and checkpoint tests (Cédric Villemain)
+
=item B<Version 2.18.1>
Do no do an inner join to pg_user when checking database size