Skip to content

Commit c1afd17

Browse files
Allow amcheck to re-find tuples using new search.
Teach contrib/amcheck's bt_index_parent_check() function to take advantage of the uniqueness property of heapkeyspace indexes in support of a new verification option: non-pivot tuples (non-highkey tuples on the leaf level) can optionally be re-found using a new search for each, that starts from the root page. If a tuple cannot be re-found, report that the index is corrupt. The new "rootdescend" verification option is exhaustive, and can therefore make a call to bt_index_parent_check() take a lot longer. Re-finding tuples during verification is mostly intended as an option for backend developers, since the corruption scenarios that it alone is uniquely capable of detecting seem fairly far-fetched. For example, "rootdescend" verification is much more likely to detect corruption of the least significant byte of a key from a pivot tuple in the root page of a B-Tree that already has at least three levels. Typically, only a few tuples on a cousin leaf page are at risk of "getting overlooked" by index scans in this scenario. The corrupt key in the root page is only slightly corrupt: corrupt enough to give wrong answers to some queries, and yet not corrupt enough to allow the problem to be detected without verifying agreement between the leaf page and the root page, skipping at least one internal page level. The existing bt_index_parent_check() checks never cross more than a single level. Author: Peter Geoghegan Reviewed-By: Heikki Linnakangas Discussion: https://postgr.es/m/CAH2-Wz=yTWnVu+HeHGKb2AGiADL9eprn-cKYAto4MkKOuiGtRQ@mail.gmail.com
1 parent fab2502 commit c1afd17

File tree

7 files changed

+160
-16
lines changed

7 files changed

+160
-16
lines changed

contrib/amcheck/Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ MODULE_big = amcheck
44
OBJS = verify_nbtree.o $(WIN32RES)
55

66
EXTENSION = amcheck
7-
DATA = amcheck--1.0--1.1.sql amcheck--1.0.sql
7+
DATA = amcheck--1.1--1.2.sql amcheck--1.0--1.1.sql amcheck--1.0.sql
88
PGFILEDESC = "amcheck - function for verifying relation integrity"
99

1010
REGRESS = check check_btree

contrib/amcheck/amcheck--1.1--1.2.sql

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
/* contrib/amcheck/amcheck--1.1--1.2.sql */
2+
3+
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
4+
\echo Use "ALTER EXTENSION amcheck UPDATE TO '1.2'" to load this file. \quit
5+
6+
-- In order to avoid issues with dependencies when updating amcheck to 1.2,
7+
-- create new, overloaded version of the 1.1 function signature
8+
9+
--
10+
-- bt_index_parent_check()
11+
--
12+
CREATE FUNCTION bt_index_parent_check(index regclass,
13+
heapallindexed boolean, rootdescend boolean)
14+
RETURNS VOID
15+
AS 'MODULE_PATHNAME', 'bt_index_parent_check'
16+
LANGUAGE C STRICT PARALLEL RESTRICTED;
17+
18+
-- Don't want this to be available to public
19+
REVOKE ALL ON FUNCTION bt_index_parent_check(regclass, boolean, boolean) FROM PUBLIC;

contrib/amcheck/amcheck.control

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# amcheck extension
22
comment = 'functions for verifying relation integrity'
3-
default_version = '1.1'
3+
default_version = '1.2'
44
module_pathname = '$libdir/amcheck'
55
relocatable = true

contrib/amcheck/expected/check_btree.out

+3-2
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,8 @@ SELECT bt_index_parent_check('bttest_multi_idx', true);
126126
(1 row)
127127

128128
--
129-
-- Test for multilevel page deletion/downlink present checks
129+
-- Test for multilevel page deletion/downlink present checks, and rootdescend
130+
-- checks
130131
--
131132
INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,80000) i;
132133
ALTER TABLE delete_test_table ADD PRIMARY KEY (a,b,c,d);
@@ -137,7 +138,7 @@ VACUUM delete_test_table;
137138
-- root"
138139
DELETE FROM delete_test_table WHERE a < 79990;
139140
VACUUM delete_test_table;
140-
SELECT bt_index_parent_check('delete_test_table_pkey', true);
141+
SELECT bt_index_parent_check('delete_test_table_pkey', true, true);
141142
bt_index_parent_check
142143
-----------------------
143144

contrib/amcheck/sql/check_btree.sql

+3-2
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,8 @@ INSERT INTO bttest_multi SELECT i, i%2 FROM generate_series(1, 100000) as i;
7878
SELECT bt_index_parent_check('bttest_multi_idx', true);
7979

8080
--
81-
-- Test for multilevel page deletion/downlink present checks
81+
-- Test for multilevel page deletion/downlink present checks, and rootdescend
82+
-- checks
8283
--
8384
INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,80000) i;
8485
ALTER TABLE delete_test_table ADD PRIMARY KEY (a,b,c,d);
@@ -89,7 +90,7 @@ VACUUM delete_test_table;
8990
-- root"
9091
DELETE FROM delete_test_table WHERE a < 79990;
9192
VACUUM delete_test_table;
92-
SELECT bt_index_parent_check('delete_test_table_pkey', true);
93+
SELECT bt_index_parent_check('delete_test_table_pkey', true, true);
9394

9495
--
9596
-- BUG #15597: must not assume consistent input toasting state when forming

contrib/amcheck/verify_nbtree.c

+128-8
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ typedef struct BtreeCheckState
7575
bool readonly;
7676
/* Also verifying heap has no unindexed tuples? */
7777
bool heapallindexed;
78+
/* Also making sure non-pivot tuples can be found by new search? */
79+
bool rootdescend;
7880
/* Per-page context */
7981
MemoryContext targetcontext;
8082
/* Buffer access strategy */
@@ -124,10 +126,11 @@ PG_FUNCTION_INFO_V1(bt_index_check);
124126
PG_FUNCTION_INFO_V1(bt_index_parent_check);
125127

126128
static void bt_index_check_internal(Oid indrelid, bool parentcheck,
127-
bool heapallindexed);
129+
bool heapallindexed, bool rootdescend);
128130
static inline void btree_index_checkable(Relation rel);
129131
static void bt_check_every_level(Relation rel, Relation heaprel,
130-
bool heapkeyspace, bool readonly, bool heapallindexed);
132+
bool heapkeyspace, bool readonly, bool heapallindexed,
133+
bool rootdescend);
131134
static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state,
132135
BtreeLevel level);
133136
static void bt_target_page_check(BtreeCheckState *state);
@@ -140,6 +143,7 @@ static void bt_tuple_present_callback(Relation index, HeapTuple htup,
140143
bool tupleIsAlive, void *checkstate);
141144
static IndexTuple bt_normalize_tuple(BtreeCheckState *state,
142145
IndexTuple itup);
146+
static bool bt_rootdescend(BtreeCheckState *state, IndexTuple itup);
143147
static inline bool offset_is_negative_infinity(BTPageOpaque opaque,
144148
OffsetNumber offset);
145149
static inline bool invariant_l_offset(BtreeCheckState *state, BTScanInsert key,
@@ -177,7 +181,7 @@ bt_index_check(PG_FUNCTION_ARGS)
177181
if (PG_NARGS() == 2)
178182
heapallindexed = PG_GETARG_BOOL(1);
179183

180-
bt_index_check_internal(indrelid, false, heapallindexed);
184+
bt_index_check_internal(indrelid, false, heapallindexed, false);
181185

182186
PG_RETURN_VOID();
183187
}
@@ -196,11 +200,14 @@ bt_index_parent_check(PG_FUNCTION_ARGS)
196200
{
197201
Oid indrelid = PG_GETARG_OID(0);
198202
bool heapallindexed = false;
203+
bool rootdescend = false;
199204

200-
if (PG_NARGS() == 2)
205+
if (PG_NARGS() >= 2)
201206
heapallindexed = PG_GETARG_BOOL(1);
207+
if (PG_NARGS() == 3)
208+
rootdescend = PG_GETARG_BOOL(2);
202209

203-
bt_index_check_internal(indrelid, true, heapallindexed);
210+
bt_index_check_internal(indrelid, true, heapallindexed, rootdescend);
204211

205212
PG_RETURN_VOID();
206213
}
@@ -209,7 +216,8 @@ bt_index_parent_check(PG_FUNCTION_ARGS)
209216
* Helper for bt_index_[parent_]check, coordinating the bulk of the work.
210217
*/
211218
static void
212-
bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed)
219+
bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed,
220+
bool rootdescend)
213221
{
214222
Oid heapid;
215223
Relation indrel;
@@ -267,7 +275,7 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed)
267275
/* Check index, possibly against table it is an index on */
268276
heapkeyspace = _bt_heapkeyspace(indrel);
269277
bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck,
270-
heapallindexed);
278+
heapallindexed, rootdescend);
271279

272280
/*
273281
* Release locks early. That's ok here because nothing in the called
@@ -338,7 +346,7 @@ btree_index_checkable(Relation rel)
338346
*/
339347
static void
340348
bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
341-
bool readonly, bool heapallindexed)
349+
bool readonly, bool heapallindexed, bool rootdescend)
342350
{
343351
BtreeCheckState *state;
344352
Page metapage;
@@ -362,6 +370,7 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
362370
state->heapkeyspace = heapkeyspace;
363371
state->readonly = readonly;
364372
state->heapallindexed = heapallindexed;
373+
state->rootdescend = rootdescend;
365374

366375
if (state->heapallindexed)
367376
{
@@ -430,6 +439,14 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
430439
}
431440
}
432441

442+
Assert(!state->rootdescend || state->readonly);
443+
if (state->rootdescend && !state->heapkeyspace)
444+
ereport(ERROR,
445+
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
446+
errmsg("cannot verify that tuples from index \"%s\" can each be found by an independent index search",
447+
RelationGetRelationName(rel)),
448+
errhint("Only B-Tree version 4 indexes support rootdescend verification.")));
449+
433450
/* Create context for page */
434451
state->targetcontext = AllocSetContextCreate(CurrentMemoryContext,
435452
"amcheck context",
@@ -922,6 +939,31 @@ bt_target_page_check(BtreeCheckState *state)
922939
if (offset_is_negative_infinity(topaque, offset))
923940
continue;
924941

942+
/*
943+
* Readonly callers may optionally verify that non-pivot tuples can
944+
* each be found by an independent search that starts from the root
945+
*/
946+
if (state->rootdescend && P_ISLEAF(topaque) &&
947+
!bt_rootdescend(state, itup))
948+
{
949+
char *itid,
950+
*htid;
951+
952+
itid = psprintf("(%u,%u)", state->targetblock, offset);
953+
htid = psprintf("(%u,%u)",
954+
ItemPointerGetBlockNumber(&(itup->t_tid)),
955+
ItemPointerGetOffsetNumber(&(itup->t_tid)));
956+
957+
ereport(ERROR,
958+
(errcode(ERRCODE_INDEX_CORRUPTED),
959+
errmsg("could not find tuple using search from root page in index \"%s\"",
960+
RelationGetRelationName(state->rel)),
961+
errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%X.",
962+
itid, htid,
963+
(uint32) (state->targetlsn >> 32),
964+
(uint32) state->targetlsn)));
965+
}
966+
925967
/* Build insertion scankey for current page offset */
926968
skey = bt_mkscankey_pivotsearch(state->rel, itup);
927969

@@ -1526,6 +1568,9 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
15261568
* internal pages. In more general terms, a negative infinity item is
15271569
* only negative infinity with respect to the subtree that the page is
15281570
* at the root of.
1571+
*
1572+
* See also: bt_rootdescend(), which can even detect transitive
1573+
* inconsistencies on cousin leaf pages.
15291574
*/
15301575
if (offset_is_negative_infinity(copaque, offset))
15311576
continue;
@@ -1926,6 +1971,81 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup)
19261971
return reformed;
19271972
}
19281973

1974+
/*
1975+
* Search for itup in index, starting from fast root page. itup must be a
1976+
* non-pivot tuple. This is only supported with heapkeyspace indexes, since
1977+
* we rely on having fully unique keys to find a match with only a signle
1978+
* visit to a leaf page, barring an interrupted page split, where we may have
1979+
* to move right. (A concurrent page split is impossible because caller must
1980+
* be readonly caller.)
1981+
*
1982+
* This routine can detect very subtle transitive consistency issues across
1983+
* more than one level of the tree. Leaf pages all have a high key (even the
1984+
* rightmost page has a conceptual positive infinity high key), but not a low
1985+
* key. Their downlink in parent is a lower bound, which along with the high
1986+
* key is almost enough to detect every possible inconsistency. A downlink
1987+
* separator key value won't always be available from parent, though, because
1988+
* the first items of internal pages are negative infinity items, truncated
1989+
* down to zero attributes during internal page splits. While it's true that
1990+
* bt_downlink_check() and the high key check can detect most imaginable key
1991+
* space problems, there are remaining problems it won't detect with non-pivot
1992+
* tuples in cousin leaf pages. Starting a search from the root for every
1993+
* existing leaf tuple detects small inconsistencies in upper levels of the
1994+
* tree that cannot be detected any other way. (Besides all this, this is
1995+
* probably also useful as a direct test of the code used by index scans
1996+
* themselves.)
1997+
*/
1998+
static bool
1999+
bt_rootdescend(BtreeCheckState *state, IndexTuple itup)
2000+
{
2001+
BTScanInsert key;
2002+
BTStack stack;
2003+
Buffer lbuf;
2004+
bool exists;
2005+
2006+
key = _bt_mkscankey(state->rel, itup);
2007+
Assert(key->heapkeyspace && key->scantid != NULL);
2008+
2009+
/*
2010+
* Search from root.
2011+
*
2012+
* Ideally, we would arrange to only move right within _bt_search() when
2013+
* an interrupted page split is detected (i.e. when the incomplete split
2014+
* bit is found to be set), but for now we accept the possibility that
2015+
* that could conceal an inconsistency.
2016+
*/
2017+
Assert(state->readonly && state->rootdescend);
2018+
exists = false;
2019+
stack = _bt_search(state->rel, key, &lbuf, BT_READ, NULL);
2020+
2021+
if (BufferIsValid(lbuf))
2022+
{
2023+
BTInsertStateData insertstate;
2024+
OffsetNumber offnum;
2025+
Page page;
2026+
2027+
insertstate.itup = itup;
2028+
insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
2029+
insertstate.itup_key = key;
2030+
insertstate.bounds_valid = false;
2031+
insertstate.buf = lbuf;
2032+
2033+
/* Get matching tuple on leaf page */
2034+
offnum = _bt_binsrch_insert(state->rel, &insertstate);
2035+
/* Compare first >= matching item on leaf page, if any */
2036+
page = BufferGetPage(lbuf);
2037+
if (offnum <= PageGetMaxOffsetNumber(page) &&
2038+
_bt_compare(state->rel, key, page, offnum) == 0)
2039+
exists = true;
2040+
_bt_relbuf(state->rel, lbuf);
2041+
}
2042+
2043+
_bt_freestack(stack);
2044+
pfree(key);
2045+
2046+
return exists;
2047+
}
2048+
19292049
/*
19302050
* Is particular offset within page (whose special state is passed by caller)
19312051
* the page negative-infinity item?

doc/src/sgml/amcheck.sgml

+5-2
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ ORDER BY c.relpages DESC LIMIT 10;
112112

113113
<varlistentry>
114114
<term>
115-
<function>bt_index_parent_check(index regclass, heapallindexed boolean) returns void</function>
115+
<function>bt_index_parent_check(index regclass, heapallindexed boolean, rootdescend boolean) returns void</function>
116116
<indexterm>
117117
<primary>bt_index_parent_check</primary>
118118
</indexterm>
@@ -126,7 +126,10 @@ ORDER BY c.relpages DESC LIMIT 10;
126126
argument is <literal>true</literal>, the function verifies the
127127
presence of all heap tuples that should be found within the
128128
index, and that there are no missing downlinks in the index
129-
structure. The checks that can be performed by
129+
structure. When the optional <parameter>rootdescend</parameter>
130+
argument is <literal>true</literal>, verification re-finds
131+
tuples on the leaf level by performing a new search from the
132+
root page for each tuple. The checks that can be performed by
130133
<function>bt_index_parent_check</function> are a superset of the
131134
checks that can be performed by <function>bt_index_check</function>.
132135
<function>bt_index_parent_check</function> can be thought of as

0 commit comments

Comments
 (0)