diff options
Diffstat (limited to 'check_postgres.pl')
-rwxr-xr-x | check_postgres.pl | 288 |
1 files changed, 211 insertions, 77 deletions
diff --git a/check_postgres.pl b/check_postgres.pl index eebed9bfe..8a71853ad 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -99,8 +99,12 @@ our %msg = ( 'bloat-nomin' => q{no relations meet the minimum bloat criteria}, 'bloat-table' => q{(db $1) table $2.$3 rows:$4 pages:$5 shouldbe:$6 ($7X) wasted size:$8 ($9)}, 'bug-report' => q{Please report these details to [email protected]:}, + 'checkcluster-id' => q{Database system identifier:}, + 'checkcluster-msg' => q{cluster_id: $1}, + 'checkcluster-nomrtg'=> q{Must provide a number via the --mrtg option}, 'checkmode-state' => q{Database cluster state:}, 'checkmode-recovery' => q{in archive recovery}, + 'checkmode-prod' => q{in production}, 'checkpoint-baddir' => q{Invalid data_directory: "$1"}, 'checkpoint-baddir2' => q{pg_controldata could not read the given data directory: "$1"}, 'checkpoint-badver' => q{Failed to run pg_controldata - probably the wrong version ($1)}, @@ -108,7 +112,7 @@ our %msg = ( 'checkpoint-nodir' => q{Must supply a --datadir argument or set the PGDATA environment variable}, 'checkpoint-nodp' => q{Must install the Perl module Date::Parse to use the checkpoint action}, 'checkpoint-noparse' => q{Unable to parse pg_controldata output: "$1"}, - 'checkpoint-noregex' => q{Call to pg_controldata $1 failed}, + 'checkpoint-noregex' => q{Unable to find the regex for this check}, 'checkpoint-nosys' => q{Could not call pg_controldata: $1}, 'checkpoint-ok' => q{Last checkpoint was 1 second ago}, 'checkpoint-ok2' => q{Last checkpoint was $1 seconds ago}, @@ -160,6 +164,7 @@ our %msg = ( 'logfile-stderr' => q{Logfile output has been redirected to stderr: please provide a filename}, 'logfile-syslog' => q{Database is using syslog, please specify path with --logfile option (fac=$1)}, 'mode-standby' => q{Server in standby mode}, + 'mode' => q{mode}, 'mrtg-fail' => q{Action $1 failed: $2}, 'new-ver-nocver' => q{Could not download version information for $1}, 'new-ver-badver' => q{Could not parse version information for $1}, @@ -353,7 +358,7 @@ our %msg = ( 'checkpoint-nodir' => q{Vous devez fournir un argument --datadir ou configurer la variable d'environnement PGDATA}, 'checkpoint-nodp' => q{Vous devez installer le module Perl Date::Parse pour utiliser l'action checkpoint}, 'checkpoint-noparse' => q{Incapable d'analyser le résultat de la commande pg_controldata : "$1"}, - 'checkpoint-noregex' => q{Échec de l'appel à pg_controldata $1}, + 'checkpoint-noregex' => q{La regex pour ce test n'a pas été trouvée}, 'checkpoint-nosys' => q{N'a pas pu appeler pg_controldata : $1}, 'checkpoint-ok' => q{Le dernier CHECKPOINT est survenu il y a une seconde}, 'checkpoint-ok2' => q{Le dernier CHECKPOINT est survenu il y a $1 secondes}, @@ -931,6 +936,7 @@ GetOptions( 'debugoutput=s', 'no-check_postgresrc', 'assume-standby-mode', + 'assume-prod', 'action=s', 'warning=s', @@ -1112,6 +1118,7 @@ our $action_info = { backends => [1, 'Number of connections, compared to max_connections.'], bloat => [0, 'Check for table and index bloat.'], checkpoint => [1, 'Checks how long since the last checkpoint'], + cluster_id => [1, 'Checks the Database System Identifier'], commitratio => [0, 'Report if the commit ratio of a database is too low.'], connection => [0, 'Simple connection check.'], custom_query => [0, 'Run a custom query.'], @@ -1202,6 +1209,7 @@ Limit options: Other options: --assume-standby-mode assume that server in continious WAL recovery mode + --assume-prod assume that server in production mode --PSQL=FILE location of the psql executable; avoid using if possible -v, --verbose verbosity level; can be used more than once to increase the level -h, --help display this help information @@ -1248,7 +1256,9 @@ if ($opt{showtime}) { ## Check the current database mode our $STANDBY = 0; +our $MASTER = 0; make_sure_standby_mode() if $opt{'assume-standby-mode'}; +make_sure_prod() if $opt{'assume-prod'}; ## We don't (usually) want to die, but want a graceful Nagios-like exit instead sub ndie { @@ -1517,59 +1527,37 @@ sub do_mrtg_stats { do_mrtg({one => $one, two => $two, msg => $msg}); } -sub make_sure_standby_mode { +sub make_sure_mode_is { - ## Checks if database in standby mode ## Requires $ENV{PGDATA} or --datadir - ## Find the data directory, make sure it exists - my $dir = $opt{datadir} || $ENV{PGDATA}; - - if (!defined $dir or ! length $dir) { - ndie msg('checkpoint-nodir'); - } - - if (! -d $dir) { - ndie msg('checkpoint-baddir', $dir); - } - $db->{host} = '<none>'; ## Run pg_controldata, grab the mode - my $pgc - = $ENV{PGCONTROLDATA} ? $ENV{PGCONTROLDATA} - : $ENV{PGBINDIR} ? "$ENV{PGBINDIR}/pg_controldata" - : 'pg_controldata'; - $COM = qq{$pgc "$dir"}; - eval { - $res = qx{$COM 2>&1}; - }; - if ($@) { - ndie msg('checkpoint-nosys', $@); - } - - ## If the path is echoed back, we most likely have an invalid data dir - if ($res =~ /$dir/) { - ndie msg('checkpoint-baddir2', $dir); - } - - if ($res =~ /WARNING: Calculated CRC checksum/) { - ndie msg('checkpoint-badver', $dir); - } - if ($res !~ /^pg_control.+\d+/) { - ndie msg('checkpoint-badver2'); - } + $res = open_controldata(); my $regex = msg('checkmode-state'); if ($res !~ /$regex\s*(.+)/) { ## no critic (ProhibitUnusedCapture) ## Just in case, check the English one as well $regex = msg_en('checkmode-state'); if ($res !~ /$regex\s*(.+)/) { - ndie msg('checkpoint-noregex', $dir); + ndie msg('checkpoint-noregex'); } } my $last = $1; - $regex = msg('checkmode-recovery'); + + return $last; + +} + +sub make_sure_standby_mode { + + ## Checks if database in standby mode + ## Requires $ENV{PGDATA} or --datadir + + my $last = make_sure_mode_is(); + + my $regex = msg('checkmode-recovery'); if ($last =~ /$regex/) { $STANDBY = 1; } @@ -1578,6 +1566,21 @@ sub make_sure_standby_mode { } ## end of make_sure_standby_mode +sub make_sure_prod { + + ## Checks if database in production mode + ## Requires $ENV{PGDATA} or --datadir + + my $last = make_sure_mode_is(); + + my $regex = msg('checkmode-prod'); + if ($last =~ /$regex/) { + $MASTER = 1; + } + + return; + +} ## end of make_sure_production_mode sub finishup { @@ -1919,6 +1922,9 @@ check_dbstats() if $action eq 'dbstats'; ## Check how long since the last checkpoint check_checkpoint() if $action eq 'checkpoint'; +## Check the Datasbae System Identifier +check_cluster_id() if $action eq 'cluster_id'; + ## Check for disabled triggers check_disabled_triggers() if $action eq 'disabled_triggers'; @@ -3114,6 +3120,50 @@ sub perfname { } ## end of perfname; +sub open_controldata { + ## Requires $ENV{PGDATA} or --datadir + + ## Find the data directory, make sure it exists + my $dir = $opt{datadir} || $ENV{PGDATA}; + + if (!defined $dir or ! length $dir) { + ndie msg('checkpoint-nodir'); + } + + if (! -d $dir) { + ndie msg('checkpoint-baddir', $dir); + } + + ## Run pg_controldata + my $pgc + = $ENV{PGCONTROLDATA} ? $ENV{PGCONTROLDATA} + : $ENV{PGBINDIR} ? "$ENV{PGBINDIR}/pg_controldata" + : 'pg_controldata'; + $COM = qq{$pgc "$dir"}; + eval { + $res = qx{$COM 2>&1}; + }; + if ($@) { + ndie msg('checkpoint-nosys', $@); + } + + ## If the path is echoed back, we most likely have an invalid data dir + if ($res =~ /$dir/) { + ndie msg('checkpoint-baddir2', $dir); + } + + if ($res =~ /WARNING: Calculated CRC checksum/) { + ndie msg('checkpoint-badver', $dir); + } + if ($res !~ /^pg_control.+\d+/) { + ndie msg('checkpoint-badver2'); + } + + ## return the pg_controldata output + return $res; +} + + sub check_archive_ready { ## Check on the number of WAL archive with status "ready" @@ -3632,7 +3682,6 @@ FROM ( } ## end of check_bloat - sub check_checkpoint { ## Checks how long in seconds since the last checkpoint on a WAL slave @@ -3645,6 +3694,9 @@ sub check_checkpoint { ## may make more sense on the master, or we may want to look at ## the WAL segments received/processed instead of the checkpoint ## timestamp. + ## This checks can use the optionnal --asume-standby-mode or + ## --assume-prod: if the mode found is not the mode assumed, a + ## CRITICAL is emitted. ## Supports: Nagios, MRTG ## Warning and critical are seconds @@ -3657,50 +3709,17 @@ sub check_checkpoint { forcemrtg => 1, }); - ## Find the data directory, make sure it exists - my $dir = $opt{datadir} || $ENV{PGDATA}; - - if (!defined $dir or ! length $dir) { - ndie msg('checkpoint-nodir'); - } - - if (! -d $dir) { - ndie msg('checkpoint-baddir', $dir); - } - $db->{host} = '<none>'; ## Run pg_controldata, grab the time - my $pgc - = $ENV{PGCONTROLDATA} ? $ENV{PGCONTROLDATA} - : $ENV{PGBINDIR} ? "$ENV{PGBINDIR}/pg_controldata" - : 'pg_controldata'; - $COM = qq{$pgc "$dir"}; - eval { - $res = qx{$COM 2>&1}; - }; - if ($@) { - ndie msg('checkpoint-nosys', $@); - } - - ## If the path is echoed back, we most likely have an invalid data dir - if ($res =~ /$dir/) { - ndie msg('checkpoint-baddir2', $dir); - } - - if ($res =~ /WARNING: Calculated CRC checksum/) { - ndie msg('checkpoint-badver', $pgc); - } - if ($res !~ /^pg_control.+\d+/) { - ndie msg('checkpoint-badver2'); - } + $res = open_controldata(); my $regex = msg('checkpoint-po'); if ($res !~ /$regex\s*(.+)/) { ## no critic (ProhibitUnusedCapture) ## Just in case, check the English one as well $regex = msg_en('checkpoint-po'); if ($res !~ /$regex\s*(.+)/) { - ndie msg('checkpoint-noregex', $dir); + ndie msg('checkpoint-noregex'); } } my $last = $1; @@ -3722,11 +3741,35 @@ sub check_checkpoint { $db->{perf} = sprintf '%s=%s;%s;%s', perfname(msg('age')), $diff, $warning, $critical; + my $mode = ''; + if ($STANDBY) { + $mode = 'STANDBY'; + } + if ($MASTER) { + $mode = 'MASTER'; + } + + ## If we have an assume flag, then honor it. + my $goodmode = 1; + if ($opt{'assume-standby-mode'} and not $STANDBY) { + $goodmode = 0; + $mode = 'NOT STANDBY'; + } + elsif ($opt{'assume-prod'} and not $MASTER) { + $goodmode = 0; + $mode = 'NOT MASTER'; + } + + if (length($mode) > 0) { + $db->{perf} .= sprintf ' %s=%s', + perfname(msg('mode')), $mode; + } + if ($MRTG) { do_mrtg({one => $diff, msg => $msg}); } - if (length $critical and $diff >= $critical) { + if ((length $critical and $diff >= $critical) or not $goodmode) { add_critical $msg; return; } @@ -3743,6 +3786,58 @@ sub check_checkpoint { } ## end of check_checkpoint +sub check_cluster_id { + + + ## Verify the Database System Identifier provided by pg_controldata + ## Supports: Nagios, MRTG + ## One of warning or critical must be given (but not both) + ## It should run one time to find out the expected cluster-id + ## You can use --critical="0" to find out the current cluster-id + ## You can include or exclude settings as well + ## Example: + ## check_postgres_cluster_id --critical="5633695740047915125" + + my ($warning, $critical) = validate_range({type => 'integer', onlyone => 1}); + + $db->{host} = '<none>'; + + ## Run pg_controldata, grab the cluster-id + $res = open_controldata(); + + my $regex = msg('checkcluster-id'); + if ($res !~ /$regex\s*(.+)/) { ## no critic (ProhibitUnusedCapture) + ## Just in case, check the English one as well + $regex = msg_en('checkcluster-id'); + if ($res !~ /$regex\s*(.+)/) { + ndie msg('checkpoint-noregex'); + } + } + my $ident = $1; + + my $msg = msg('checkcluster-msg', $ident); + if ($MRTG) { + $opt{mrtg} or ndie msg('checksum-nomrtg'); + do_mrtg({one => $opt{mrtg} eq $ident ? 1 : 0, msg => $ident}); + } + if ($critical and $critical ne $ident) { + add_critical $msg; + } + elsif ($warning and $warning ne $ident) { + add_warning $msg; + } + elsif (!$critical and !$warning) { + add_unknown $msg; + } + else { + add_ok $msg; + } + + return; + +} ## end of check_cluster_id + + sub check_commitratio { ## Check the commitratio of one or more databases @@ -7814,6 +7909,16 @@ Example: postgres@db$./check_postgres.pl --action=version --warning=8.1 --datadir /var/lib/postgresql/8.3/main/ --assume-standby-mode POSTGRES_VERSION OK: Server in standby mode | time=0.00 +=item B<--assume-prod> + +If specified, check if server in production mode is performed (--datadir is required). +The option is only relevant for (C<symlink: check_postgres_checkpoint>). + +Example: + + postgres@db$./check_postgres.pl --action=checkpoint --datadir /var/lib/postgresql/8.3/main/ --assume-prod + POSTGRES_CHECKPOINT OK: Last checkpoint was 72 seconds ago | age=72;;300 mode=MASTER + =item B<-h> or B<--help> Displays a help screen with a summary of all actions and options. @@ -8084,6 +8189,8 @@ was run, as determined by parsing the call to C<pg_controldata>. Because of this pg_controldata executable must be available in the current path. Alternatively, you can set the environment variable C<PGCONTROLDATA> to the exact location of the pg_controldata executable, or you can specify C<PGBINDIR> as the directory that it lives in. +It is also possible to use the special options I<--assume-prod> or +I<--assume-standby-mode>, if the mode found is not the one expected, a CRITICAL is emitted. At least one warning or critical argument must be set. @@ -8091,6 +8198,27 @@ This action requires the Date::Parse module. For MRTG or simple output, returns the number of seconds. +=head2 B<cluster_id> + +(C<symlink: check_postgres_cluster-id>) Checks that the Database System Identifier +provided by pg_controldata is the same as last time you checked. This must run on the same +server as the database that is being checked (e.g. the -h flag will not work). +Either the I<--warning> or the I<--critical> option should be given, but not both. The value +of each one is the cluster identifier, an integer value. You can run with the special C<--critical=0> option +to find out an existing cluster identifier. + +Example 1: Find the initial identifier + + check_postgres_cluster_id --critical=0 --datadir=/var//lib/postgresql/9.0/main + +Example 2: Make sure the cluster is the same and warn if not, using the result from above. + + check_postgres_cluster_id --critical=5633695740047915135 + +For MRTG output, returns a 1 or 0 indicating success of failure of the identifier to match. A +identifier must be provided as the C<--mrtg> argument. The fourth line always gives the +current identifier. + =head2 B<commitratio> (C<symlink: check_postgres_commitratio>) Checks the commit ratio of all databases and complains when they are too low. @@ -9366,6 +9494,12 @@ Items not specifically attributed are by GSM (Greg Sabino Mullane). =over 4 +=item B<Version 2.19.0> + + Add the --assume-prod option (Cédric Villemain) + Add the cluster_id check (Cédric Villemain) + Improve settings_checksum and checkpoint tests (Cédric Villemain) + =item B<Version 2.18.1> Do no do an inner join to pg_user when checking database size |