summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNoah Misch2023-10-30 21:46:05 +0000
committerNoah Misch2023-10-30 21:46:05 +0000
commit6ec9e9975eff5260651d84ed8bcfd8ecb08e5fbb (patch)
tree1b1bd6faa745f7fbfbcf735f45a1bbdaca771e3c
parent56b30e266eaa74f38bdda45067c9a5a63cd24c75 (diff)
amcheck: Distinguish interrupted page deletion from corruption.
This prevents false-positive reports about "the first child of leftmost target page is not leftmost of its level", "block %u is not leftmost" and "left link/right link pair". They appeared if amcheck ran before VACUUM cleaned things, after a cluster exited recovery between the first-stage and second-stage WAL records of a deletion. Back-patch to v11 (all supported versions). Reviewed by Peter Geoghegan. Discussion: https://postgr.es/m/[email protected]
-rw-r--r--contrib/amcheck/Makefile1
-rw-r--r--contrib/amcheck/meson.build1
-rw-r--r--contrib/amcheck/t/005_pitr.pl82
-rw-r--r--contrib/amcheck/verify_nbtree.c83
4 files changed, 163 insertions, 4 deletions
diff --git a/contrib/amcheck/Makefile b/contrib/amcheck/Makefile
index 88271687a3..5e9002d250 100644
--- a/contrib/amcheck/Makefile
+++ b/contrib/amcheck/Makefile
@@ -12,6 +12,7 @@ PGFILEDESC = "amcheck - function for verifying relation integrity"
REGRESS = check check_btree check_heap
+EXTRA_INSTALL = contrib/pg_walinspect
TAP_TESTS = 1
ifdef USE_PGXS
diff --git a/contrib/amcheck/meson.build b/contrib/amcheck/meson.build
index 4c8e2e2f13..656a5ed90b 100644
--- a/contrib/amcheck/meson.build
+++ b/contrib/amcheck/meson.build
@@ -44,6 +44,7 @@ tests += {
't/002_cic.pl',
't/003_cic_2pc.pl',
't/004_verify_nbtree_unique.pl',
+ 't/005_pitr.pl',
],
},
}
diff --git a/contrib/amcheck/t/005_pitr.pl b/contrib/amcheck/t/005_pitr.pl
new file mode 100644
index 0000000000..6bcc1596f2
--- /dev/null
+++ b/contrib/amcheck/t/005_pitr.pl
@@ -0,0 +1,82 @@
+# Copyright (c) 2021-2023, PostgreSQL Global Development Group
+
+# Test integrity of intermediate states by PITR to those states
+use strict;
+use warnings;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# origin node: generate WAL records of interest.
+my $origin = PostgreSQL::Test::Cluster->new('origin');
+$origin->init(has_archiving => 1, allows_streaming => 1);
+$origin->append_conf('postgresql.conf', 'autovacuum = off');
+$origin->start;
+$origin->backup('my_backup');
+# Create a table with each of 6 PK values spanning 1/4 of a block. Delete the
+# first four, so one index leaf is eligible for deletion. Make a replication
+# slot just so pg_walinspect will always have access to later WAL.
+my $setup = <<EOSQL;
+BEGIN;
+CREATE EXTENSION amcheck;
+CREATE EXTENSION pg_walinspect;
+CREATE TABLE not_leftmost (c text STORAGE PLAIN);
+INSERT INTO not_leftmost
+ SELECT repeat(n::text, database_block_size / 4)
+ FROM generate_series(1,6) t(n), pg_control_init();
+ALTER TABLE not_leftmost ADD CONSTRAINT not_leftmost_pk PRIMARY KEY (c);
+DELETE FROM not_leftmost WHERE c ~ '^[1-4]';
+SELECT pg_create_physical_replication_slot('for_walinspect', true, false);
+COMMIT;
+EOSQL
+$origin->safe_psql('postgres', $setup);
+my $before_vacuum_lsn =
+ $origin->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
+# VACUUM to delete the aforementioned leaf page. Force an XLogFlush() by
+# dropping a permanent table. That way, the XLogReader infrastructure can
+# always see VACUUM's records, even under synchronous_commit=off. Finally,
+# find the LSN of that VACUUM's last UNLINK_PAGE record.
+my $vacuum = <<EOSQL;
+SET synchronous_commit = off;
+VACUUM (VERBOSE, INDEX_CLEANUP ON) not_leftmost;
+CREATE TABLE XLogFlush ();
+DROP TABLE XLogFlush;
+SELECT max(start_lsn)
+ FROM pg_get_wal_records_info('$before_vacuum_lsn', 'FFFFFFFF/FFFFFFFF')
+ WHERE resource_manager = 'Btree' AND record_type = 'UNLINK_PAGE';
+EOSQL
+my $unlink_lsn = $origin->safe_psql('postgres', $vacuum);
+$origin->stop;
+die "did not find UNLINK_PAGE record" unless $unlink_lsn;
+
+# replica node: amcheck at notable points in the WAL stream
+my $replica = PostgreSQL::Test::Cluster->new('replica');
+$replica->init_from_backup($origin, 'my_backup', has_restoring => 1);
+$replica->append_conf('postgresql.conf',
+ "recovery_target_lsn = '$unlink_lsn'");
+$replica->append_conf('postgresql.conf', 'recovery_target_inclusive = off');
+$replica->append_conf('postgresql.conf', 'recovery_target_action = promote');
+$replica->start;
+$replica->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';")
+ or die "Timed out while waiting for PITR promotion";
+# recovery done; run amcheck
+my $debug = "SET client_min_messages = 'debug1'";
+my ($rc, $stderr);
+$rc = $replica->psql(
+ 'postgres',
+ "$debug; SELECT bt_index_parent_check('not_leftmost_pk', true)",
+ stderr => \$stderr);
+print STDERR $stderr, "\n";
+is($rc, 0, "bt_index_parent_check passes");
+like(
+ $stderr,
+ qr/interrupted page deletion detected/,
+ "bt_index_parent_check: interrupted page deletion detected");
+$rc = $replica->psql(
+ 'postgres',
+ "$debug; SELECT bt_index_check('not_leftmost_pk', true)",
+ stderr => \$stderr);
+print STDERR $stderr, "\n";
+is($rc, 0, "bt_index_check passes");
+
+done_testing();
diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index 877452f38c..bcff849aa9 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -157,6 +157,9 @@ static void bt_check_every_level(Relation rel, Relation heaprel,
bool rootdescend, bool checkunique);
static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state,
BtreeLevel level);
+static bool bt_leftmost_ignoring_half_dead(BtreeCheckState *state,
+ BlockNumber start,
+ BTPageOpaque start_opaque);
static void bt_recheck_sibling_links(BtreeCheckState *state,
BlockNumber btpo_prev_from_target,
BlockNumber leftcurrent);
@@ -826,7 +829,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
*/
if (state->readonly)
{
- if (!P_LEFTMOST(opaque))
+ if (!bt_leftmost_ignoring_half_dead(state, current, opaque))
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("block %u is not leftmost in index \"%s\"",
@@ -880,8 +883,16 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
*/
}
- /* Sibling links should be in mutual agreement */
- if (opaque->btpo_prev != leftcurrent)
+ /*
+ * Sibling links should be in mutual agreement. There arises
+ * leftcurrent == P_NONE && btpo_prev != P_NONE when the left sibling
+ * of the parent's low-key downlink is half-dead. (A half-dead page
+ * has no downlink from its parent.) Under heavyweight locking, the
+ * last bt_leftmost_ignoring_half_dead() validated this btpo_prev.
+ * Without heavyweight locking, validation of the P_NONE case remains
+ * unimplemented.
+ */
+ if (opaque->btpo_prev != leftcurrent && leftcurrent != P_NONE)
bt_recheck_sibling_links(state, opaque->btpo_prev, leftcurrent);
/* Check level */
@@ -1118,6 +1129,66 @@ bt_entry_unique_check(BtreeCheckState *state, IndexTuple itup,
}
/*
+ * Like P_LEFTMOST(start_opaque), but accept an arbitrarily-long chain of
+ * half-dead, sibling-linked pages to the left. If a half-dead page appears
+ * under state->readonly, the database exited recovery between the first-stage
+ * and second-stage WAL records of a deletion.
+ */
+static bool
+bt_leftmost_ignoring_half_dead(BtreeCheckState *state,
+ BlockNumber start,
+ BTPageOpaque start_opaque)
+{
+ BlockNumber reached = start_opaque->btpo_prev,
+ reached_from = start;
+ bool all_half_dead = true;
+
+ /*
+ * To handle the !readonly case, we'd need to accept BTP_DELETED pages and
+ * potentially observe nbtree/README "Page deletion and backwards scans".
+ */
+ Assert(state->readonly);
+
+ while (reached != P_NONE && all_half_dead)
+ {
+ Page page = palloc_btree_page(state, reached);
+ BTPageOpaque reached_opaque = BTPageGetOpaque(page);
+
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * Try to detect btpo_prev circular links. _bt_unlink_halfdead_page()
+ * writes that side-links will continue to point to the siblings.
+ * Check btpo_next for that property.
+ */
+ all_half_dead = P_ISHALFDEAD(reached_opaque) &&
+ reached != start &&
+ reached != reached_from &&
+ reached_opaque->btpo_next == reached_from;
+ if (all_half_dead)
+ {
+ XLogRecPtr pagelsn = PageGetLSN(page);
+
+ /* pagelsn should point to an XLOG_BTREE_MARK_PAGE_HALFDEAD */
+ ereport(DEBUG1,
+ (errcode(ERRCODE_NO_DATA),
+ errmsg_internal("harmless interrupted page deletion detected in index \"%s\"",
+ RelationGetRelationName(state->rel)),
+ errdetail_internal("Block=%u right block=%u page lsn=%X/%X.",
+ reached, reached_from,
+ LSN_FORMAT_ARGS(pagelsn))));
+
+ reached_from = reached;
+ reached = reached_opaque->btpo_prev;
+ }
+
+ pfree(page);
+ }
+
+ return all_half_dead;
+}
+
+/*
* Raise an error when target page's left link does not point back to the
* previous target page, called leftcurrent here. The leftcurrent page's
* right link was followed to get to the current target page, and we expect
@@ -1157,6 +1228,9 @@ bt_recheck_sibling_links(BtreeCheckState *state,
BlockNumber btpo_prev_from_target,
BlockNumber leftcurrent)
{
+ /* passing metapage to BTPageGetOpaque() would give irrelevant findings */
+ Assert(leftcurrent != P_NONE);
+
if (!state->readonly)
{
Buffer lbuf;
@@ -2235,7 +2309,8 @@ bt_child_highkey_check(BtreeCheckState *state,
opaque = BTPageGetOpaque(page);
/* The first page we visit at the level should be leftmost */
- if (first && !BlockNumberIsValid(state->prevrightlink) && !P_LEFTMOST(opaque))
+ if (first && !BlockNumberIsValid(state->prevrightlink) &&
+ !bt_leftmost_ignoring_half_dead(state, blkno, opaque))
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("the first child of leftmost target page is not leftmost of its level in index \"%s\"",