@@ -75,6 +75,8 @@ typedef struct BtreeCheckState
 	bool		readonly;
 	/* Also verifying heap has no unindexed tuples? */
 	bool		heapallindexed;
+	/* Also making sure non-pivot tuples can be found by new search? */
+	bool		rootdescend;
 	/* Per-page context */
 	MemoryContext targetcontext;
 	/* Buffer access strategy */
@@ -124,10 +126,11 @@ PG_FUNCTION_INFO_V1(bt_index_check);
 PG_FUNCTION_INFO_V1(bt_index_parent_check);
 
 static void bt_index_check_internal(Oid indrelid, bool parentcheck,
-					bool heapallindexed);
+					bool heapallindexed, bool rootdescend);
 static inline void btree_index_checkable(Relation rel);
 static void bt_check_every_level(Relation rel, Relation heaprel,
-					 bool heapkeyspace, bool readonly, bool heapallindexed);
+					 bool heapkeyspace, bool readonly, bool heapallindexed,
+					 bool rootdescend);
 static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state,
 					 BtreeLevel level);
 static void bt_target_page_check(BtreeCheckState *state);
@@ -140,6 +143,7 @@ static void bt_tuple_present_callback(Relation index, HeapTuple htup,
 					 bool tupleIsAlive, void *checkstate);
 static IndexTuple bt_normalize_tuple(BtreeCheckState *state,
 					 IndexTuple itup);
+static bool bt_rootdescend(BtreeCheckState *state, IndexTuple itup);
 static inline bool offset_is_negative_infinity(BTPageOpaque opaque,
 					 OffsetNumber offset);
 static inline bool invariant_l_offset(BtreeCheckState *state, BTScanInsert key,
@@ -177,7 +181,7 @@ bt_index_check(PG_FUNCTION_ARGS)
 	if (PG_NARGS() == 2)
 		heapallindexed = PG_GETARG_BOOL(1);
 
-	bt_index_check_internal(indrelid, false, heapallindexed);
+	bt_index_check_internal(indrelid, false, heapallindexed, false);
 
 	PG_RETURN_VOID();
 }
@@ -196,11 +200,14 @@ bt_index_parent_check(PG_FUNCTION_ARGS)
 {
 	Oid			indrelid = PG_GETARG_OID(0);
 	bool		heapallindexed = false;
+	bool		rootdescend = false;
 
-	if (PG_NARGS() == 2)
+	if (PG_NARGS() >= 2)
 		heapallindexed = PG_GETARG_BOOL(1);
+	if (PG_NARGS() == 3)
+		rootdescend = PG_GETARG_BOOL(2);
 
-	bt_index_check_internal(indrelid, true, heapallindexed);
+	bt_index_check_internal(indrelid, true, heapallindexed, rootdescend);
 
 	PG_RETURN_VOID();
 }
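
(Note on the hunk above: PG_NARGS() reports the arity of whichever SQL-level declaration was matched, so the test relaxes to >= 2 and rootdescend stays false unless a three-argument form is invoked; existing one- and two-argument calls behave exactly as before. Assuming the commit's accompanying extension-upgrade SQL, which this excerpt does not show, declares the wider signature, usage would look like SELECT bt_index_parent_check('some_index'::regclass, true, true); where some_index is a placeholder name.)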
@@ -209,7 +216,8 @@ bt_index_parent_check(PG_FUNCTION_ARGS)
  * Helper for bt_index_[parent_]check, coordinating the bulk of the work.
  */
 static void
-bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed)
+bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed,
+						bool rootdescend)
 {
 	Oid			heapid;
 	Relation	indrel;
@@ -267,7 +275,7 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed)
 	/* Check index, possibly against table it is an index on */
 	heapkeyspace = _bt_heapkeyspace(indrel);
 	bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck,
-						 heapallindexed);
+						 heapallindexed, rootdescend);
 
 	/*
 	 * Release locks early.  That's ok here because nothing in the called
@@ -338,7 +346,7 @@ btree_index_checkable(Relation rel)
  */
 static void
 bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
-					 bool readonly, bool heapallindexed)
+					 bool readonly, bool heapallindexed, bool rootdescend)
 {
 	BtreeCheckState *state;
 	Page		metapage;
@@ -362,6 +370,7 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
 	state->heapkeyspace = heapkeyspace;
 	state->readonly = readonly;
 	state->heapallindexed = heapallindexed;
+	state->rootdescend = rootdescend;
 
 	if (state->heapallindexed)
 	{
@@ -430,6 +439,14 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
 		}
 	}
 
+	Assert(!state->rootdescend || state->readonly);
+	if (state->rootdescend && !state->heapkeyspace)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("cannot verify that tuples from index \"%s\" can each be found by an independent index search",
+						RelationGetRelationName(rel)),
+				 errhint("Only B-Tree version 4 indexes support rootdescend verification.")));
+
 	/* Create context for page */
 	state->targetcontext = AllocSetContextCreate(CurrentMemoryContext,
 												 "amcheck context",
@@ -922,6 +939,31 @@ bt_target_page_check(BtreeCheckState *state)
 		if (offset_is_negative_infinity(topaque, offset))
 			continue;
 
+		/*
+		 * Readonly callers may optionally verify that non-pivot tuples can
+		 * each be found by an independent search that starts from the root
+		 */
+		if (state->rootdescend && P_ISLEAF(topaque) &&
+			!bt_rootdescend(state, itup))
+		{
+			char	   *itid,
+					   *htid;
+
+			itid = psprintf("(%u,%u)", state->targetblock, offset);
+			htid = psprintf("(%u,%u)",
+							ItemPointerGetBlockNumber(&(itup->t_tid)),
+							ItemPointerGetOffsetNumber(&(itup->t_tid)));
+
+			ereport(ERROR,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg("could not find tuple using search from root page in index \"%s\"",
+							RelationGetRelationName(state->rel)),
+					 errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%X.",
+										itid, htid,
+										(uint32) (state->targetlsn >> 32),
+										(uint32) state->targetlsn)));
+		}
+
 		/* Build insertion scankey for current page offset */
 		skey = bt_mkscankey_pivotsearch(state->rel, itup);
@@ -1526,6 +1568,9 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
 		 * internal pages.  In more general terms, a negative infinity item is
 		 * only negative infinity with respect to the subtree that the page is
 		 * at the root of.
+		 *
+		 * See also: bt_rootdescend(), which can even detect transitive
+		 * inconsistencies on cousin leaf pages.
 		 */
 		if (offset_is_negative_infinity(copaque, offset))
 			continue;
@@ -1926,6 +1971,81 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup)
 	return reformed;
 }
 
+/*
+ * Search for itup in index, starting from fast root page.  itup must be a
+ * non-pivot tuple.  This is only supported with heapkeyspace indexes, since
+ * we rely on having fully unique keys to find a match with only a single
+ * visit to a leaf page, barring an interrupted page split, where we may have
+ * to move right.  (A concurrent page split is impossible because our caller
+ * must be readonly.)
+ *
+ * This routine can detect very subtle transitive consistency issues across
+ * more than one level of the tree.  Leaf pages all have a high key (even the
+ * rightmost page has a conceptual positive infinity high key), but not a low
+ * key.  Their downlink in parent is a lower bound, which along with the high
+ * key is almost enough to detect every possible inconsistency.  A downlink
+ * separator key value won't always be available from parent, though, because
+ * the first items of internal pages are negative infinity items, truncated
+ * down to zero attributes during internal page splits.  While it's true that
+ * bt_downlink_check() and the high key check can detect most imaginable key
+ * space problems, there are remaining problems they won't detect with
+ * non-pivot tuples in cousin leaf pages.  Starting a search from the root
+ * for every existing leaf tuple detects small inconsistencies in upper
+ * levels of the tree that cannot be detected any other way.  (Besides all
+ * this, this is probably also useful as a direct test of the code used by
+ * index scans themselves.)
+ */
+static bool
+bt_rootdescend(BtreeCheckState *state, IndexTuple itup)
+{
+	BTScanInsert key;
+	BTStack		stack;
+	Buffer		lbuf;
+	bool		exists;
+
+	key = _bt_mkscankey(state->rel, itup);
+	Assert(key->heapkeyspace && key->scantid != NULL);
+
+	/*
+	 * Search from root.
+	 *
+	 * Ideally, we would arrange to only move right within _bt_search() when
+	 * an interrupted page split is detected (i.e. when the incomplete split
+	 * bit is found to be set), but for now we accept the possibility that
+	 * that could conceal an inconsistency.
+	 */
+	Assert(state->readonly && state->rootdescend);
+	exists = false;
+	stack = _bt_search(state->rel, key, &lbuf, BT_READ, NULL);
+
+	if (BufferIsValid(lbuf))
+	{
+		BTInsertStateData insertstate;
+		OffsetNumber offnum;
+		Page		page;
+
+		insertstate.itup = itup;
+		insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
+		insertstate.itup_key = key;
+		insertstate.bounds_valid = false;
+		insertstate.buf = lbuf;
+
+		/* Get matching tuple on leaf page */
+		offnum = _bt_binsrch_insert(state->rel, &insertstate);
+		/* Compare first >= matching item on leaf page, if any */
+		page = BufferGetPage(lbuf);
+		if (offnum <= PageGetMaxOffsetNumber(page) &&
+			_bt_compare(state->rel, key, page, offnum) == 0)
+			exists = true;
+		_bt_relbuf(state->rel, lbuf);
+	}
+
+	_bt_freestack(stack);
+	pfree(key);
+
+	return exists;
+}
+
 /*
  * Is particular offset within page (whose special state is passed by caller)
  * the page negative-infinity item?
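
To see the invariant bt_rootdescend() enforces outside PostgreSQL: physical presence of a key in a leaf must imply reachability of that key by a fresh search from the root. The toy program below is a minimal sketch of that sweep over a hypothetical two-level tree; the names and structure are invented for illustration, this is not PostgreSQL code, and it cannot reproduce the truncated-downlink subtlety the header comment describes.

/*
 * toy_rootdescend.c -- illustrative only, not PostgreSQL code.
 *
 * Models the invariant behind bt_rootdescend(): every key physically
 * stored in a leaf must also be found by a fresh search that starts
 * from the root.  The "tree" is two levels: a root of separator keys
 * over an array of fixed-size leaves.
 */
#include <stdio.h>
#include <stdbool.h>

#define NLEAVES 3
#define LEAFCAP 3

static const int leaves[NLEAVES][LEAFCAP] = {
	{10, 20, 30},
	{40, 50, 60},
	{70, 80, 90},
};

/*
 * separators[i] is the root's lower bound for leaves[i + 1].  The second
 * entry is deliberately corrupt: the true low key of the last leaf is 70.
 */
static const int separators[NLEAVES - 1] = {40, 75};

/* Descend from the root: pick the rightmost child whose lower bound <= key */
static int
descend(int key)
{
	int			child = 0;

	for (int i = 0; i < NLEAVES - 1; i++)
		if (key >= separators[i])
			child = i + 1;
	return child;
}

static bool
leaf_contains(int leaf, int key)
{
	for (int i = 0; i < LEAFCAP; i++)
		if (leaves[leaf][i] == key)
			return true;
	return false;
}

int
main(void)
{
	/* The verification sweep: re-find every leaf key from the root */
	for (int leaf = 0; leaf < NLEAVES; leaf++)
		for (int i = 0; i < LEAFCAP; i++)
		{
			int			key = leaves[leaf][i];

			if (!leaf_contains(descend(key), key))
				printf("key %d in leaf %d not found by root descent\n",
					   key, leaf);
		}
	return 0;
}

The corrupt separator misroutes searches for key 70 into the middle leaf, so the sweep reports it even though every leaf is internally well formed. This particular corruption could also be caught by a parent-bound check; the real routine earns its keep in the truncated-downlink cases described above, where it turns the failure into an ERRCODE_INDEX_CORRUPTED report.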