@@ -93,8 +93,7 @@ typedef struct BTParallelScanDescData *BTParallelScanDesc;
93
93
static void btvacuumscan (IndexVacuumInfo * info , IndexBulkDeleteResult * stats ,
94
94
IndexBulkDeleteCallback callback , void * callback_state ,
95
95
BTCycleId cycleid );
96
- static void btvacuumpage (BTVacState * vstate , BlockNumber blkno ,
97
- BlockNumber orig_blkno );
96
+ static void btvacuumpage (BTVacState * vstate , BlockNumber scanblkno );
98
97
static BTVacuumPosting btreevacuumposting (BTVacState * vstate ,
99
98
IndexTuple posting ,
100
99
OffsetNumber updatedoffset ,
@@ -959,7 +958,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
959
958
Relation rel = info -> index ;
960
959
BTVacState vstate ;
961
960
BlockNumber num_pages ;
962
- BlockNumber blkno ;
961
+ BlockNumber scanblkno ;
963
962
bool needLock ;
964
963
965
964
/*
@@ -1009,7 +1008,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1009
1008
*/
1010
1009
needLock = !RELATION_IS_LOCAL (rel );
1011
1010
1012
- blkno = BTREE_METAPAGE + 1 ;
1011
+ scanblkno = BTREE_METAPAGE + 1 ;
1013
1012
for (;;)
1014
1013
{
1015
1014
/* Get the current relation length */
@@ -1024,15 +1023,15 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1024
1023
num_pages );
1025
1024
1026
1025
/* Quit if we've scanned the whole relation */
1027
- if (blkno >= num_pages )
1026
+ if (scanblkno >= num_pages )
1028
1027
break ;
1029
1028
/* Iterate over pages, then loop back to recheck length */
1030
- for (; blkno < num_pages ; blkno ++ )
1029
+ for (; scanblkno < num_pages ; scanblkno ++ )
1031
1030
{
1032
- btvacuumpage (& vstate , blkno , blkno );
1031
+ btvacuumpage (& vstate , scanblkno );
1033
1032
if (info -> report_progress )
1034
1033
pgstat_progress_update_param (PROGRESS_SCAN_BLOCKS_DONE ,
1035
- blkno );
1034
+ scanblkno );
1036
1035
}
1037
1036
}
1038
1037
@@ -1076,31 +1075,33 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1076
1075
/*
1077
1076
* btvacuumpage --- VACUUM one page
1078
1077
*
1079
- * This processes a single page for btvacuumscan(). In some cases we
1080
- * must go back and re-examine previously-scanned pages; this routine
1081
- * recurses when necessary to handle that case.
1082
- *
1083
- * blkno is the page to process. orig_blkno is the highest block number
1084
- * reached by the outer btvacuumscan loop (the same as blkno, unless we
1085
- * are recursing to re-examine a previous page).
1078
+ * This processes a single page for btvacuumscan(). In some cases we must
1079
+ * backtrack to re-examine and VACUUM pages that were the scanblkno during
1080
+ * a previous call here. This is how we handle page splits (that happened
1081
+ * after our cycleid was acquired) whose right half page happened to reuse
1082
+ * a block that we might have processed at some point before it was
1083
+ * recycled (i.e. before the page split).
1086
1084
*/
1087
1085
static void
1088
- btvacuumpage (BTVacState * vstate , BlockNumber blkno , BlockNumber orig_blkno )
1086
+ btvacuumpage (BTVacState * vstate , BlockNumber scanblkno )
1089
1087
{
1090
1088
IndexVacuumInfo * info = vstate -> info ;
1091
1089
IndexBulkDeleteResult * stats = vstate -> stats ;
1092
1090
IndexBulkDeleteCallback callback = vstate -> callback ;
1093
1091
void * callback_state = vstate -> callback_state ;
1094
1092
Relation rel = info -> index ;
1095
- bool delete_now ;
1096
- BlockNumber recurse_to ;
1093
+ bool attempt_pagedel ;
1094
+ BlockNumber blkno , backtrack_to ;
1097
1095
Buffer buf ;
1098
1096
Page page ;
1099
- BTPageOpaque opaque = NULL ;
1097
+ BTPageOpaque opaque ;
1098
+
1099
+ blkno = scanblkno ;
1100
+
1101
+ backtrack :
1100
1102
1101
- restart :
1102
- delete_now = false;
1103
- recurse_to = P_NONE ;
1103
+ attempt_pagedel = false;
1104
+ backtrack_to = P_NONE ;
1104
1105
1105
1106
/* call vacuum_delay_point while not holding any buffer lock */
1106
1107
vacuum_delay_point ();
@@ -1115,24 +1116,59 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
1115
1116
info -> strategy );
1116
1117
LockBuffer (buf , BT_READ );
1117
1118
page = BufferGetPage (buf );
1119
+ opaque = NULL ;
1118
1120
if (!PageIsNew (page ))
1119
1121
{
1120
1122
_bt_checkpage (rel , buf );
1121
1123
opaque = (BTPageOpaque ) PageGetSpecialPointer (page );
1122
1124
}
1123
1125
1124
- /*
1125
- * If we are recursing, the only case we want to do anything with is a
1126
- * live leaf page having the current vacuum cycle ID. Any other state
1127
- * implies we already saw the page (eg, deleted it as being empty).
1128
- */
1129
- if (blkno != orig_blkno )
1126
+ Assert (blkno <= scanblkno );
1127
+ if (blkno != scanblkno )
1130
1128
{
1131
- if (_bt_page_recyclable (page ) ||
1132
- P_IGNORE (opaque ) ||
1133
- !P_ISLEAF (opaque ) ||
1134
- opaque -> btpo_cycleid != vstate -> cycleid )
1129
+ /*
1130
+ * We're backtracking.
1131
+ *
1132
+ * We followed a right link to a sibling leaf page (a page that
1133
+ * happens to be from a block located before scanblkno). The only
1134
+ * case we want to do anything with is a live leaf page having the
1135
+ * current vacuum cycle ID.
1136
+ *
1137
+ * The page had better be in a state that's consistent with what we
1138
+ * expect. Check for conditions that imply corruption in passing. It
1139
+ * can't be half-dead because only an interrupted VACUUM process can
1140
+ * leave pages in that state, so we'd definitely have dealt with it
1141
+ * back when the page was the scanblkno page (half-dead pages are
1142
+ * always marked fully deleted by _bt_pagedel()). This assumes that
1143
+ * there can be only one vacuum process running at a time.
1144
+ */
1145
+ if (!opaque || !P_ISLEAF (opaque ) || P_ISHALFDEAD (opaque ))
1135
1146
{
1147
+ Assert (false);
1148
+ ereport (LOG ,
1149
+ (errcode (ERRCODE_INDEX_CORRUPTED ),
1150
+ errmsg_internal ("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"" ,
1151
+ blkno , scanblkno , RelationGetRelationName (rel ))));
1152
+ _bt_relbuf (rel , buf );
1153
+ return ;
1154
+ }
1155
+
1156
+ /*
1157
+ * We may have already processed the page in an earlier call, when the
1158
+ * page was scanblkno. This happens when the leaf page split occurred
1159
+ * after the scan began, but before the right sibling page became the
1160
+ * scanblkno.
1161
+ *
1162
+ * Page may also have been deleted by current btvacuumpage() call,
1163
+ * since _bt_pagedel() sometimes deletes the right sibling page of
1164
+ * scanblkno in passing (it does so after we decided where to
1165
+ * backtrack to). We don't need to process this page as a deleted
1166
+ * page a second time now (in fact, it would be wrong to count it as a
1167
+ * deleted page in the bulk delete statistics a second time).
1168
+ */
1169
+ if (opaque -> btpo_cycleid != vstate -> cycleid || P_ISDELETED (opaque ))
1170
+ {
1171
+ /* Done with current scanblkno (and all lower split pages) */
1136
1172
_bt_relbuf (rel , buf );
1137
1173
return ;
1138
1174
}
@@ -1165,7 +1201,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
1165
1201
* Half-dead leaf page. Try to delete now. Might update
1166
1202
* oldestBtpoXact and pages_deleted below.
1167
1203
*/
1168
- delete_now = true;
1204
+ attempt_pagedel = true;
1169
1205
}
1170
1206
else if (P_ISLEAF (opaque ))
1171
1207
{
@@ -1189,18 +1225,20 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
1189
1225
LockBufferForCleanup (buf );
1190
1226
1191
1227
/*
1192
- * Check whether we need to recurse back to earlier pages. What we
1193
- * are concerned about is a page split that happened since we started
1194
- * the vacuum scan. If the split moved some tuples to a lower page
1195
- * then we might have missed 'em. If so, set up for tail recursion.
1196
- * (Must do this before possibly clearing btpo_cycleid below!)
1228
+ * Check whether we need to backtrack to earlier pages. What we are
1229
+ * concerned about is a page split that happened since we started the
1230
+ * vacuum scan. If the split moved tuples on the right half of the
1231
+ * split (i.e. the tuples that sort high) to a block that we already
1232
+ * passed over, then we might have missed the tuples. We need to
1233
+ * backtrack now. (Must do this before possibly clearing btpo_cycleid
1234
+ * or deleting scanblkno page below!)
1197
1235
*/
1198
1236
if (vstate -> cycleid != 0 &&
1199
1237
opaque -> btpo_cycleid == vstate -> cycleid &&
1200
1238
!(opaque -> btpo_flags & BTP_SPLIT_END ) &&
1201
1239
!P_RIGHTMOST (opaque ) &&
1202
- opaque -> btpo_next < orig_blkno )
1203
- recurse_to = opaque -> btpo_next ;
1240
+ opaque -> btpo_next < scanblkno )
1241
+ backtrack_to = opaque -> btpo_next ;
1204
1242
1205
1243
/*
1206
1244
* When each VACUUM begins, it determines an OldestXmin cutoff value.
@@ -1311,7 +1349,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
1311
1349
*/
1312
1350
if (ndeletable > 0 || nupdatable > 0 )
1313
1351
{
1314
- Assert (nhtidsdead >= Max ( ndeletable , 1 ) );
1352
+ Assert (nhtidsdead >= ndeletable + nupdatable );
1315
1353
_bt_delitems_vacuum (rel , buf , deletable , ndeletable , updatable ,
1316
1354
nupdatable );
1317
1355
@@ -1347,19 +1385,19 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
1347
1385
/*
1348
1386
* If the leaf page is now empty, try to delete it; else count the
1349
1387
* live tuples (live table TIDs in posting lists are counted as
1350
- * separate live tuples). We don't delete when recursing, though, to
1351
- * avoid putting entries into freePages out-of-order (doesn't seem
1352
- * worth any extra code to handle the case).
1388
+ * separate live tuples). We don't delete when backtracking, though,
1389
+ * since that would require teaching _bt_pagedel() about backtracking
1390
+ * (doesn't seem worth adding more complexity to deal with that).
1353
1391
*/
1354
1392
if (minoff > maxoff )
1355
- delete_now = (blkno == orig_blkno );
1393
+ attempt_pagedel = (blkno == scanblkno );
1356
1394
else
1357
1395
stats -> num_index_tuples += nhtidslive ;
1358
1396
1359
- Assert (!delete_now || nhtidslive == 0 );
1397
+ Assert (!attempt_pagedel || nhtidslive == 0 );
1360
1398
}
1361
1399
1362
- if (delete_now )
1400
+ if (attempt_pagedel )
1363
1401
{
1364
1402
MemoryContext oldcontext ;
1365
1403
@@ -1372,6 +1410,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
1372
1410
* any page that a future call here from btvacuumscan is expected to
1373
1411
* count. There will be no double-counting.
1374
1412
*/
1413
+ Assert (blkno == scanblkno );
1375
1414
stats -> pages_deleted += _bt_pagedel (rel , buf , & vstate -> oldestBtpoXact );
1376
1415
1377
1416
MemoryContextSwitchTo (oldcontext );
@@ -1380,18 +1419,10 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
1380
1419
else
1381
1420
_bt_relbuf (rel , buf );
1382
1421
1383
- /*
1384
- * This is really tail recursion, but if the compiler is too stupid to
1385
- * optimize it as such, we'd eat an uncomfortably large amount of stack
1386
- * space per recursion level (due to the arrays used to track details of
1387
- * deletable/updatable items). A failure is improbable since the number
1388
- * of levels isn't likely to be large ... but just in case, let's
1389
- * hand-optimize into a loop.
1390
- */
1391
- if (recurse_to != P_NONE )
1422
+ if (backtrack_to != P_NONE )
1392
1423
{
1393
- blkno = recurse_to ;
1394
- goto restart ;
1424
+ blkno = backtrack_to ;
1425
+ goto backtrack ;
1395
1426
}
1396
1427
}
1397
1428
0 commit comments