Skip to content

Commit 9dc7251

Browse files
Refactor btvacuumpage().
Remove one of the arguments to btvacuumpage(), and give up on the idea that it's a recursive function. We now use the term "backtracking" to refer to the case where an earlier block must be visited to make sure no tuples that need to be removed were missed. Advertising btvacuumpage() as a recursive function was unhelpful. In reality the function always simulates recursion with a loop (it doesn't actually call itself). This wasn't just necessary as a precaution (per the comments mentioning tail recursion), though. There is no reliable natural limit on the number of times we can backtrack. There are important behavioral difference when "recursing"/backtracking, mostly related to page deletion. We don't perform page deletion when backtracking due to the extra complexity. And when we recurse, we're not performing a physical order scan anymore, so we expect fairly different conditions to hold for the page. Structuring the code like this makes it clearer how _bt_pagedel() cooperates with btvacuumpage() and btvacuumscan() (as established in commit b0229f2 and commit 73a076b). Author: Peter Geoghegan Reviewed-By: Masahiko Sawada Discussion: https://postgr.es/m/CAH2-WzmRGMDWiLMcb+zagG9652PboNN4Gfcq1Gc_wJL6A716MA@mail.gmail.com
1 parent b68a560 commit 9dc7251

File tree

2 files changed

+92
-58
lines changed

2 files changed

+92
-58
lines changed

src/backend/access/nbtree/nbtinsert.c

+3
Original file line numberDiff line numberDiff line change
@@ -1703,6 +1703,9 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
17031703
* Finish off remaining leftpage special area fields. They cannot be set
17041704
* before both origpage (leftpage) and rightpage buffers are acquired and
17051705
* locked.
1706+
*
1707+
* btpo_cycleid is only used with leaf pages, though we set it here in all
1708+
* cases just to be consistent.
17061709
*/
17071710
lopaque->btpo_next = rightpagenumber;
17081711
lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);

src/backend/access/nbtree/nbtree.c

+89-58
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,7 @@ typedef struct BTParallelScanDescData *BTParallelScanDesc;
9393
static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
9494
IndexBulkDeleteCallback callback, void *callback_state,
9595
BTCycleId cycleid);
96-
static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
97-
BlockNumber orig_blkno);
96+
static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
9897
static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
9998
IndexTuple posting,
10099
OffsetNumber updatedoffset,
@@ -959,7 +958,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
959958
Relation rel = info->index;
960959
BTVacState vstate;
961960
BlockNumber num_pages;
962-
BlockNumber blkno;
961+
BlockNumber scanblkno;
963962
bool needLock;
964963

965964
/*
@@ -1009,7 +1008,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
10091008
*/
10101009
needLock = !RELATION_IS_LOCAL(rel);
10111010

1012-
blkno = BTREE_METAPAGE + 1;
1011+
scanblkno = BTREE_METAPAGE + 1;
10131012
for (;;)
10141013
{
10151014
/* Get the current relation length */
@@ -1024,15 +1023,15 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
10241023
num_pages);
10251024

10261025
/* Quit if we've scanned the whole relation */
1027-
if (blkno >= num_pages)
1026+
if (scanblkno >= num_pages)
10281027
break;
10291028
/* Iterate over pages, then loop back to recheck length */
1030-
for (; blkno < num_pages; blkno++)
1029+
for (; scanblkno < num_pages; scanblkno++)
10311030
{
1032-
btvacuumpage(&vstate, blkno, blkno);
1031+
btvacuumpage(&vstate, scanblkno);
10331032
if (info->report_progress)
10341033
pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1035-
blkno);
1034+
scanblkno);
10361035
}
10371036
}
10381037

@@ -1076,31 +1075,33 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
10761075
/*
10771076
* btvacuumpage --- VACUUM one page
10781077
*
1079-
* This processes a single page for btvacuumscan(). In some cases we
1080-
* must go back and re-examine previously-scanned pages; this routine
1081-
* recurses when necessary to handle that case.
1082-
*
1083-
* blkno is the page to process. orig_blkno is the highest block number
1084-
* reached by the outer btvacuumscan loop (the same as blkno, unless we
1085-
* are recursing to re-examine a previous page).
1078+
* This processes a single page for btvacuumscan(). In some cases we must
1079+
* backtrack to re-examine and VACUUM pages that were the scanblkno during
1080+
* a previous call here. This is how we handle page splits (that happened
1081+
* after our cycleid was acquired) whose right half page happened to reuse
1082+
* a block that we might have processed at some point before it was
1083+
* recycled (i.e. before the page split).
10861084
*/
10871085
static void
1088-
btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
1086+
btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
10891087
{
10901088
IndexVacuumInfo *info = vstate->info;
10911089
IndexBulkDeleteResult *stats = vstate->stats;
10921090
IndexBulkDeleteCallback callback = vstate->callback;
10931091
void *callback_state = vstate->callback_state;
10941092
Relation rel = info->index;
1095-
bool delete_now;
1096-
BlockNumber recurse_to;
1093+
bool attempt_pagedel;
1094+
BlockNumber blkno, backtrack_to;
10971095
Buffer buf;
10981096
Page page;
1099-
BTPageOpaque opaque = NULL;
1097+
BTPageOpaque opaque;
1098+
1099+
blkno = scanblkno;
1100+
1101+
backtrack:
11001102

1101-
restart:
1102-
delete_now = false;
1103-
recurse_to = P_NONE;
1103+
attempt_pagedel = false;
1104+
backtrack_to = P_NONE;
11041105

11051106
/* call vacuum_delay_point while not holding any buffer lock */
11061107
vacuum_delay_point();
@@ -1115,24 +1116,59 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
11151116
info->strategy);
11161117
LockBuffer(buf, BT_READ);
11171118
page = BufferGetPage(buf);
1119+
opaque = NULL;
11181120
if (!PageIsNew(page))
11191121
{
11201122
_bt_checkpage(rel, buf);
11211123
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
11221124
}
11231125

1124-
/*
1125-
* If we are recursing, the only case we want to do anything with is a
1126-
* live leaf page having the current vacuum cycle ID. Any other state
1127-
* implies we already saw the page (eg, deleted it as being empty).
1128-
*/
1129-
if (blkno != orig_blkno)
1126+
Assert(blkno <= scanblkno);
1127+
if (blkno != scanblkno)
11301128
{
1131-
if (_bt_page_recyclable(page) ||
1132-
P_IGNORE(opaque) ||
1133-
!P_ISLEAF(opaque) ||
1134-
opaque->btpo_cycleid != vstate->cycleid)
1129+
/*
1130+
* We're backtracking.
1131+
*
1132+
* We followed a right link to a sibling leaf page (a page that
1133+
* happens to be from a block located before scanblkno). The only
1134+
* case we want to do anything with is a live leaf page having the
1135+
* current vacuum cycle ID.
1136+
*
1137+
* The page had better be in a state that's consistent with what we
1138+
* expect. Check for conditions that imply corruption in passing. It
1139+
* can't be half-dead because only an interrupted VACUUM process can
1140+
* leave pages in that state, so we'd definitely have dealt with it
1141+
* back when the page was the scanblkno page (half-dead pages are
1142+
* always marked fully deleted by _bt_pagedel()). This assumes that
1143+
* there can be only one vacuum process running at a time.
1144+
*/
1145+
if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
11351146
{
1147+
Assert(false);
1148+
ereport(LOG,
1149+
(errcode(ERRCODE_INDEX_CORRUPTED),
1150+
errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1151+
blkno, scanblkno, RelationGetRelationName(rel))));
1152+
_bt_relbuf(rel, buf);
1153+
return;
1154+
}
1155+
1156+
/*
1157+
* We may have already processed the page in an earlier call, when the
1158+
* page was scanblkno. This happens when the leaf page split occurred
1159+
* after the scan began, but before the right sibling page became the
1160+
* scanblkno.
1161+
*
1162+
* Page may also have been deleted by current btvacuumpage() call,
1163+
* since _bt_pagedel() sometimes deletes the right sibling page of
1164+
* scanblkno in passing (it does so after we decided where to
1165+
* backtrack to). We don't need to process this page as a deleted
1166+
* page a second time now (in fact, it would be wrong to count it as a
1167+
* deleted page in the bulk delete statistics a second time).
1168+
*/
1169+
if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1170+
{
1171+
/* Done with current scanblkno (and all lower split pages) */
11361172
_bt_relbuf(rel, buf);
11371173
return;
11381174
}
@@ -1165,7 +1201,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
11651201
* Half-dead leaf page. Try to delete now. Might update
11661202
* oldestBtpoXact and pages_deleted below.
11671203
*/
1168-
delete_now = true;
1204+
attempt_pagedel = true;
11691205
}
11701206
else if (P_ISLEAF(opaque))
11711207
{
@@ -1189,18 +1225,20 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
11891225
LockBufferForCleanup(buf);
11901226

11911227
/*
1192-
* Check whether we need to recurse back to earlier pages. What we
1193-
* are concerned about is a page split that happened since we started
1194-
* the vacuum scan. If the split moved some tuples to a lower page
1195-
* then we might have missed 'em. If so, set up for tail recursion.
1196-
* (Must do this before possibly clearing btpo_cycleid below!)
1228+
* Check whether we need to backtrack to earlier pages. What we are
1229+
* concerned about is a page split that happened since we started the
1230+
* vacuum scan. If the split moved tuples on the right half of the
1231+
* split (i.e. the tuples that sort high) to a block that we already
1232+
* passed over, then we might have missed the tuples. We need to
1233+
* backtrack now. (Must do this before possibly clearing btpo_cycleid
1234+
* or deleting scanblkno page below!)
11971235
*/
11981236
if (vstate->cycleid != 0 &&
11991237
opaque->btpo_cycleid == vstate->cycleid &&
12001238
!(opaque->btpo_flags & BTP_SPLIT_END) &&
12011239
!P_RIGHTMOST(opaque) &&
1202-
opaque->btpo_next < orig_blkno)
1203-
recurse_to = opaque->btpo_next;
1240+
opaque->btpo_next < scanblkno)
1241+
backtrack_to = opaque->btpo_next;
12041242

12051243
/*
12061244
* When each VACUUM begins, it determines an OldestXmin cutoff value.
@@ -1311,7 +1349,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
13111349
*/
13121350
if (ndeletable > 0 || nupdatable > 0)
13131351
{
1314-
Assert(nhtidsdead >= Max(ndeletable, 1));
1352+
Assert(nhtidsdead >= ndeletable + nupdatable);
13151353
_bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
13161354
nupdatable);
13171355

@@ -1347,19 +1385,19 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
13471385
/*
13481386
* If the leaf page is now empty, try to delete it; else count the
13491387
* live tuples (live table TIDs in posting lists are counted as
1350-
* separate live tuples). We don't delete when recursing, though, to
1351-
* avoid putting entries into freePages out-of-order (doesn't seem
1352-
* worth any extra code to handle the case).
1388+
* separate live tuples). We don't delete when backtracking, though,
1389+
* since that would require teaching _bt_pagedel() about backtracking
1390+
* (doesn't seem worth adding more complexity to deal with that).
13531391
*/
13541392
if (minoff > maxoff)
1355-
delete_now = (blkno == orig_blkno);
1393+
attempt_pagedel = (blkno == scanblkno);
13561394
else
13571395
stats->num_index_tuples += nhtidslive;
13581396

1359-
Assert(!delete_now || nhtidslive == 0);
1397+
Assert(!attempt_pagedel || nhtidslive == 0);
13601398
}
13611399

1362-
if (delete_now)
1400+
if (attempt_pagedel)
13631401
{
13641402
MemoryContext oldcontext;
13651403

@@ -1372,6 +1410,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
13721410
* any page that a future call here from btvacuumscan is expected to
13731411
* count. There will be no double-counting.
13741412
*/
1413+
Assert(blkno == scanblkno);
13751414
stats->pages_deleted += _bt_pagedel(rel, buf, &vstate->oldestBtpoXact);
13761415

13771416
MemoryContextSwitchTo(oldcontext);
@@ -1380,18 +1419,10 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
13801419
else
13811420
_bt_relbuf(rel, buf);
13821421

1383-
/*
1384-
* This is really tail recursion, but if the compiler is too stupid to
1385-
* optimize it as such, we'd eat an uncomfortably large amount of stack
1386-
* space per recursion level (due to the arrays used to track details of
1387-
* deletable/updatable items). A failure is improbable since the number
1388-
* of levels isn't likely to be large ... but just in case, let's
1389-
* hand-optimize into a loop.
1390-
*/
1391-
if (recurse_to != P_NONE)
1422+
if (backtrack_to != P_NONE)
13921423
{
1393-
blkno = recurse_to;
1394-
goto restart;
1424+
blkno = backtrack_to;
1425+
goto backtrack;
13951426
}
13961427
}
13971428

0 commit comments

Comments
 (0)