From 1bd4bc85cac2b23484a6b568a752de0351c2cc5b Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Fri, 18 Oct 2024 11:25:32 -0400 Subject: [PATCH] Optimize nbtree backwards scans. Make nbtree backwards scans optimistically access the next page to be read to the left by following a prevPage block number that's now stashed in currPos when the leaf page is first read. This approach matches the one taken during forward scans, which follow a symmetric nextPage block number from currPos. We stash both a prevPage and a nextPage, since the scan direction might change (when fetching from a scrollable cursor). Backwards scans will no longer need to lock the same page twice, except in rare cases where the scan detects a concurrent page split (or page deletion). Testing has shown this optimization to be particularly effective during parallel index-only backwards scans: ~12% reductions in query execution time are quite possible. We're much better off being optimistic; concurrent left sibling page splits are rare in general. It's possible that we'll need to lock more pages than the pessimistic approach would have, but only when there are _multiple_ concurrent splits of the left sibling page we now start at. If there's just a single concurrent left sibling page split, the new approach to scanning backwards will at least break even relative to the old one (we'll acquire the same number of leaf page locks as before). The optimization from this commit has long been contemplated by comments added by commit 2ed5b87f96, which changed the rules for locking/pinning during nbtree index scans. The approach that that commit introduced to leaf level link traversal when scanning forwards is now more or less applied all the time, regardless of the direction we're scanning in. Following uniform conventions around sibling link traversal is simpler. 
The only real remaining difference between our forward and backwards handling is that our backwards handling must still detect and recover from any concurrent left sibling splits (and concurrent page deletions), as documented in the nbtree README. That is structured as a single, isolated extra step that takes place in _bt_readnextpage. Also use this opportunity to further simplify the functions that deal with reading pages and traversing sibling links on the leaf level, and to document their preconditions and postconditions (with respect to things like buffer locks, buffer pins, and seizing the parallel scan). This enhancement completely supersedes the one recently added by commit 3f44959f. Author: Matthias van de Meent Author: Peter Geoghegan Discussion: https://postgr.es/m/CAEze2WgpBGRgTTxTWVPXc9+PB6fc1a7t+VyGXHzfnrFXcQVxnA@mail.gmail.com Discussion: https://postgr.es/m/CAH2-WzkBTuFv7W2+84jJT8mWZLXVL0GHq2hMUTn6c9Vw=eYrCw@mail.gmail.com --- src/backend/access/nbtree/nbtree.c | 75 ++- src/backend/access/nbtree/nbtsearch.c | 715 +++++++++++--------------- src/backend/access/nbtree/nbtutils.c | 2 +- src/include/access/nbtree.h | 62 +-- 4 files changed, 388 insertions(+), 466 deletions(-) diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 9b968aa0f29..f4f79f27062 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -66,7 +66,9 @@ typedef enum */ typedef struct BTParallelScanDescData { - BlockNumber btps_scanPage; /* latest or next page to be scanned */ + BlockNumber btps_nextScanPage; /* next page to be scanned */ + BlockNumber btps_lastCurrPage; /* page whose sibling link was copied into + * btps_nextScanPage */ BTPS_State btps_pageStatus; /* indicates whether next page is * available for scan. see above for * possible states of parallel scan. 
*/ @@ -550,7 +552,8 @@ btinitparallelscan(void *target) BTParallelScanDesc bt_target = (BTParallelScanDesc) target; SpinLockInit(&bt_target->btps_mutex); - bt_target->btps_scanPage = InvalidBlockNumber; + bt_target->btps_nextScanPage = InvalidBlockNumber; + bt_target->btps_lastCurrPage = InvalidBlockNumber; bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; ConditionVariableInit(&bt_target->btps_cv); } @@ -575,7 +578,8 @@ btparallelrescan(IndexScanDesc scan) * consistency. */ SpinLockAcquire(&btscan->btps_mutex); - btscan->btps_scanPage = InvalidBlockNumber; + btscan->btps_nextScanPage = InvalidBlockNumber; + btscan->btps_lastCurrPage = InvalidBlockNumber; btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; SpinLockRelease(&btscan->btps_mutex); } @@ -591,18 +595,20 @@ btparallelrescan(IndexScanDesc scan) * start just yet (only backends that call from _bt_first are capable of * starting primitive index scans, which they indicate by passing first=true). * - * If the return value is true, *pageno returns the next or current page - * of the scan (depending on the scan direction). An invalid block number - * means the scan hasn't yet started, or that caller needs to start the next - * primitive index scan (if it's the latter case we'll set so.needPrimScan). - * The first time a participating process reaches the last page, it will return - * true and set *pageno to P_NONE; after that, further attempts to seize the - * scan will return false. + * If the return value is true, *next_scan_page returns the next page of the + * scan, and *last_curr_page returns the page that *next_scan_page came from. + * An invalid *next_scan_page means the scan hasn't yet started, or that + * caller needs to start the next primitive index scan (if it's the latter + * case we'll set so.needPrimScan). 
The first time a participating process + * reaches the last page, it will return true and set *next_scan_page to + * P_NONE; after that, further attempts to seize the scan will return false. * - * Callers should ignore the value of pageno if the return value is false. + * Callers should ignore the value of *next_scan_page and *last_curr_page if + * the return value is false. */ bool -_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first) +_bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, + BlockNumber *last_curr_page, bool first) { BTScanOpaque so = (BTScanOpaque) scan->opaque; bool exit_loop = false; @@ -610,7 +616,17 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first) ParallelIndexScanDesc parallel_scan = scan->parallel_scan; BTParallelScanDesc btscan; - *pageno = P_NONE; + *next_scan_page = P_NONE; + *last_curr_page = InvalidBlockNumber; + + /* + * Reset so->currPos, and initialize moreLeft/moreRight such that the next + * call to _bt_readnextpage treats this backend similarly to a serial + * backend that steps from *last_curr_page to *next_scan_page (unless this + * backend's so->currPos is initialized by _bt_readfirstpage before then). + */ + BTScanPosInvalidate(so->currPos); + so->currPos.moreLeft = so->currPos.moreRight = true; if (first) { @@ -660,7 +676,7 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first) array->cur_elem = btscan->btps_arrElems[i]; skey->sk_argument = array->elem_values[array->cur_elem]; } - *pageno = InvalidBlockNumber; + *next_scan_page = InvalidBlockNumber; exit_loop = true; } else @@ -688,7 +704,8 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first) * of advancing it to a new page! 
*/ btscan->btps_pageStatus = BTPARALLEL_ADVANCING; - *pageno = btscan->btps_scanPage; + *next_scan_page = btscan->btps_nextScanPage; + *last_curr_page = btscan->btps_lastCurrPage; exit_loop = true; } SpinLockRelease(&btscan->btps_mutex); @@ -703,17 +720,21 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first) /* * _bt_parallel_release() -- Complete the process of advancing the scan to a - * new page. We now have the new value btps_scanPage; some other backend + * new page. We now have the new value btps_nextScanPage; another backend * can now begin advancing the scan. * - * Callers whose scan uses array keys must save their scan_page argument so + * Callers whose scan uses array keys must save their curr_page argument so * that it can be passed to _bt_parallel_primscan_schedule, should caller - * determine that another primitive index scan is required. If that happens, - * scan_page won't be scanned by any backend (unless the next primitive index - * scan lands on scan_page). + * determine that another primitive index scan is required. + * + * Note: unlike the serial case, parallel scans don't need to remember both + * sibling links. next_scan_page is whichever link is next given the scan's + * direction. That's all we'll ever need, since the direction of a parallel + * scan can never change. 
*/ void -_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page) +_bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page, + BlockNumber curr_page) { ParallelIndexScanDesc parallel_scan = scan->parallel_scan; BTParallelScanDesc btscan; @@ -722,7 +743,8 @@ _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page) parallel_scan->ps_offset); SpinLockAcquire(&btscan->btps_mutex); - btscan->btps_scanPage = scan_page; + btscan->btps_nextScanPage = next_scan_page; + btscan->btps_lastCurrPage = curr_page; btscan->btps_pageStatus = BTPARALLEL_IDLE; SpinLockRelease(&btscan->btps_mutex); ConditionVariableSignal(&btscan->btps_cv); @@ -778,13 +800,13 @@ _bt_parallel_done(IndexScanDesc scan) /* * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan. * - * Caller passes the block number most recently passed to _bt_parallel_release + * Caller passes the curr_page most recently passed to _bt_parallel_release * by its backend. Caller successfully schedules the next primitive index scan * if the shared parallel state hasn't been seized since caller's backend last * advanced the scan. 
*/ void -_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page) +_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page) { BTScanOpaque so = (BTScanOpaque) scan->opaque; ParallelIndexScanDesc parallel_scan = scan->parallel_scan; @@ -796,10 +818,11 @@ _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page) parallel_scan->ps_offset); SpinLockAcquire(&btscan->btps_mutex); - if (btscan->btps_scanPage == prev_scan_page && + if (btscan->btps_lastCurrPage == curr_page && btscan->btps_pageStatus == BTPARALLEL_IDLE) { - btscan->btps_scanPage = InvalidBlockNumber; + btscan->btps_nextScanPage = InvalidBlockNumber; + btscan->btps_lastCurrPage = InvalidBlockNumber; btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN; /* Serialize scan's current array keys */ diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index a662665b55f..2275553be78 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -43,12 +43,13 @@ static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, int tupleOffset); static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); -static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir); -static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, - ScanDirection dir); -static Buffer _bt_walk_left(Relation rel, Buffer buf); +static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, + ScanDirection dir); +static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, + BlockNumber lastcurrblkno, ScanDirection dir); +static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno, + BlockNumber lastcurrblkno); static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); -static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir); /* @@ -880,7 +881,6 @@ 
_bt_first(IndexScanDesc scan, ScanDirection dir) { Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; - Buffer buf; BTStack stack; OffsetNumber offnum; StrategyNumber strat; @@ -889,10 +889,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) ScanKeyData notnullkeys[INDEX_MAX_KEYS]; int keysz = 0; int i; - bool status; StrategyNumber strat_total; BTScanPosItem *currItem; - BlockNumber blkno; Assert(!BTScanPosIsValid(so->currPos)); @@ -924,7 +922,11 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) */ if (scan->parallel_scan != NULL) { - status = _bt_parallel_seize(scan, &blkno, true); + BlockNumber blkno, + lastcurrblkno; + bool status; + + status = _bt_parallel_seize(scan, &blkno, &lastcurrblkno, true); /* * Initialize arrays (when _bt_parallel_seize didn't already set up @@ -942,7 +944,13 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } else if (blkno != InvalidBlockNumber) { - if (!_bt_parallel_readpage(scan, blkno, dir)) + Assert(!so->needPrimScan); + + /* + * We anticipated starting another primitive scan, but some other + * worker beat us to it + */ + if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir)) return false; goto readcomplete; } @@ -1392,12 +1400,12 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * position ourselves on the target leaf page. */ Assert(ScanDirectionIsBackward(dir) == inskey.backward); - stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ); + stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ); /* don't need to keep the stack around... */ _bt_freestack(stack); - if (!BufferIsValid(buf)) + if (!BufferIsValid(so->currPos.buf)) { /* * We only get here if the index is completely empty. 
Lock relation @@ -1411,11 +1419,11 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) if (IsolationIsSerializable()) { PredicateLockRelation(rel, scan->xs_snapshot); - stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ); + stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ); _bt_freestack(stack); } - if (!BufferIsValid(buf)) + if (!BufferIsValid(so->currPos.buf)) { /* * Mark parallel scan as done, so that all the workers can finish @@ -1427,17 +1435,12 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } } - PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot); - - _bt_initialize_more_data(so, dir); - /* position to the precise item on the page */ - offnum = _bt_binsrch(rel, &inskey, buf); - Assert(!BTScanPosIsValid(so->currPos)); - so->currPos.buf = buf; + offnum = _bt_binsrch(rel, &inskey, so->currPos.buf); /* - * Now load data from the first page of the scan. + * Now load data from the first page of the scan (usually the page + * currently in so->currPos.buf). * * If inskey.nextkey = false and inskey.backward = false, offnum is * positioned at the first non-pivot tuple >= inskey.scankeys. @@ -1455,24 +1458,12 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * for the page. For example, when inskey is both < the leaf page's high * key and > all of its non-pivot tuples, offnum will be "maxoff + 1". */ - if (!_bt_readpage(scan, dir, offnum, true)) - { - /* - * There's no actually-matching data on this page. Try to advance to - * the next page. Return false if there's no matching data at all. 
- */ - _bt_unlockbuf(scan->indexRelation, so->currPos.buf); - if (!_bt_steppage(scan, dir)) - return false; - } - else - { - /* We have at least one item to return as scan's first item */ - _bt_drop_lock_and_maybe_pin(scan, &so->currPos); - } + if (!_bt_readfirstpage(scan, offnum, dir)) + return false; readcomplete: /* OK, itemIndex says what to return */ + Assert(BTScanPosIsValid(so->currPos)); currItem = &so->currPos.items[so->currPos.itemIndex]; scan->xs_heaptid = currItem->heapTid; if (scan->xs_want_itup) @@ -1492,8 +1483,9 @@ readcomplete: * heap tuple, and if requested, scan->xs_itup points to a copy of the * index tuple. so->currPos is updated as needed. * - * On failure exit (no more tuples), we release pin and set - * so->currPos.buf to InvalidBuffer. + * On failure exit (no more tuples), we invalidate so->currPos. It'll + * still be possible for the scan to return tuples by changing direction, + * though we'll need to call _bt_first anew in that other direction. */ bool _bt_next(IndexScanDesc scan, ScanDirection dir) @@ -1523,6 +1515,7 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) } /* OK, itemIndex says what to return */ + Assert(BTScanPosIsValid(so->currPos)); currItem = &so->currPos.items[so->currPos.itemIndex]; scan->xs_heaptid = currItem->heapTid; if (scan->xs_want_itup) @@ -1545,14 +1538,6 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) * that there can be no more matching tuples in the current scan direction * (could just be for the current primitive index scan when scan has arrays). * - * _bt_first caller passes us an offnum returned by _bt_binsrch, which might - * be an out of bounds offnum such as "maxoff + 1" in certain corner cases. - * _bt_checkkeys will stop the scan as soon as an equality qual fails (when - * its scan key was marked required), so _bt_first _must_ pass us an offnum - * exactly at the beginning of where equal tuples are to be found. 
When we're - * passed an offnum past the end of the page, we might still manage to stop - * the scan on this page by calling _bt_checkkeys against the high key. - * * In the case of a parallel scan, caller must have called _bt_parallel_seize * prior to calling this function; this function will invoke * _bt_parallel_release before returning. @@ -1563,6 +1548,7 @@ static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, bool firstPage) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; Page page; BTPageOpaque opaque; @@ -1573,27 +1559,36 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, int itemIndex, indnatts; - /* - * We must have the buffer pinned and locked, but the usual macro can't be - * used here; this function is what makes it good for currPos. - */ - Assert(BufferIsValid(so->currPos.buf)); - + /* save the page/buffer block number, along with its sibling links */ page = BufferGetPage(so->currPos.buf); opaque = BTPageGetOpaque(page); + so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf); + so->currPos.prevPage = opaque->btpo_prev; + so->currPos.nextPage = opaque->btpo_next; + + Assert(!P_IGNORE(opaque)); + Assert(BTScanPosIsPinned(so->currPos)); - /* allow next page be processed by parallel worker */ if (scan->parallel_scan) { + /* allow next/prev page to be read by other worker without delay */ if (ScanDirectionIsForward(dir)) - pstate.prev_scan_page = opaque->btpo_next; + _bt_parallel_release(scan, so->currPos.nextPage, + so->currPos.currPage); else - pstate.prev_scan_page = BufferGetBlockNumber(so->currPos.buf); - - _bt_parallel_release(scan, pstate.prev_scan_page); + _bt_parallel_release(scan, so->currPos.prevPage, + so->currPos.currPage); } - indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation); + /* initialize remaining currPos fields (before moreLeft/moreright) */ + so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf); + so->currPos.dir = 
dir; + so->currPos.nextTupleOffset = 0; + + PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot); + + /* initialize local variables */ + indnatts = IndexRelationGetNumberOfAttributes(rel); arrayKeys = so->numArrayKeys != 0; minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); @@ -1612,35 +1607,6 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, pstate.rechecks = 0; pstate.targetdistance = 0; - /* - * We note the buffer's block number so that we can release the pin later. - * This allows us to re-read the buffer if it is needed again for hinting. - */ - so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf); - - /* - * We save the LSN of the page as we read it, so that we know whether it - * safe to apply LP_DEAD hints to the page later. This allows us to drop - * the pin for MVCC scans, which allows vacuum to avoid blocking. - */ - so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf); - - /* - * we must save the page's right-link while scanning it; this tells us - * where to step right to after we're done with these items. There is no - * corresponding need for the left-link, since splits always go right. - */ - so->currPos.nextPage = opaque->btpo_next; - - /* initialize tuple workspace to empty */ - so->currPos.nextTupleOffset = 0; - - /* - * Now that the current page has been made consistent, the macro should be - * good. - */ - Assert(BTScanPosIsPinned(so->currPos)); - /* * Prechecking the value of the continuescan flag for the last item on the * page (for backwards scan it will be the first item on a page). 
If we @@ -1829,7 +1795,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, IndexTuple itup = (IndexTuple) PageGetItem(page, iid); int truncatt; - truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation); + truncatt = BTreeTupleGetNAtts(itup, rel); pstate.prechecked = false; /* precheck didn't cover HIKEY */ _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt); } @@ -1959,7 +1925,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, * We don't need to visit page to the left when no more matches will * be found there */ - if (!pstate.continuescan || P_LEFTMOST(opaque)) + if (!pstate.continuescan) so->currPos.moreLeft = false; Assert(itemIndex >= 0); @@ -2060,20 +2026,25 @@ _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, /* * _bt_steppage() -- Step to next page containing valid data for scan * - * On entry, if so->currPos.buf is valid the buffer is pinned but not locked; - * if pinned, we'll drop the pin before moving to next page. The buffer is - * not locked on entry. + * On entry, if so->currPos.buf is valid the buffer is pinned but not locked. + * If there's no pin held, it's because _bt_drop_lock_and_maybe_pin dropped + * the pin eagerly earlier on. The scan must have so->currPos.currPage set to + * a valid block, in any case. * - * For success on a scan using a non-MVCC snapshot we hold a pin, but not a - * read lock, on that page. If we do not hold the pin, we set so->currPos.buf - * to InvalidBuffer. We return true to indicate success. + * This is a wrapper on _bt_readnextpage that performs final steps for the + * current page. It sets up the _bt_readnextpage call using either local + * state saved in so->currPos by the most recent _bt_readpage call, or using + * shared parallel scan state (obtained by seizing the parallel scan here). + * + * Parallel scan callers that have already seized the scan should directly + * call _bt_readnextpage, rather than calling here. 
*/ static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - BlockNumber blkno = InvalidBlockNumber; - bool status; + BlockNumber blkno, + lastcurrblkno; Assert(BTScanPosIsValid(so->currPos)); @@ -2115,7 +2086,6 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) * In effect, btrestpos leaves advancing the arrays up to the first * _bt_readpage call (that takes place after it has restored markPos). */ - Assert(so->markPos.dir == dir); if (so->needPrimScan) { if (ScanDirectionIsForward(dir)) @@ -2123,319 +2093,288 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) else so->markPos.moreLeft = true; } + + /* mark/restore not supported by parallel scans */ + Assert(!scan->parallel_scan); } - if (ScanDirectionIsForward(dir)) + BTScanPosUnpinIfPinned(so->currPos); + + /* Walk to the next page with data */ + if (!scan->parallel_scan) { - /* Walk right to the next page with data */ - if (scan->parallel_scan != NULL) - { - /* - * Seize the scan to get the next block number; if the scan has - * ended already, bail out. - */ - status = _bt_parallel_seize(scan, &blkno, false); - if (!status) - { - /* release the previous buffer, if pinned */ - BTScanPosUnpinIfPinned(so->currPos); - BTScanPosInvalidate(so->currPos); - return false; - } - } - else - { - /* Not parallel, so use the previously-saved nextPage link. 
*/ + /* Not parallel, so use local state set by the last _bt_readpage */ + if (ScanDirectionIsForward(dir)) blkno = so->currPos.nextPage; - } + else + blkno = so->currPos.prevPage; + lastcurrblkno = so->currPos.currPage; + } + else + { + /* + * Seize the scan to get the nextPage and currPage from shared + * parallel state (saved from parallel scan's last _bt_readpage) + */ + if (!_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false)) + return false; + } - /* Remember we left a page with data */ - so->currPos.moreLeft = true; + return _bt_readnextpage(scan, blkno, lastcurrblkno, dir); +} + +/* + * _bt_readfirstpage() -- Read first page containing valid data for _bt_first + * + * _bt_first caller passes us an offnum returned by _bt_binsrch, which might + * be an out of bounds offnum such as "maxoff + 1" in certain corner cases. + * _bt_checkkeys will stop the scan as soon as an equality qual fails (when + * its scan key was marked required), so _bt_first _must_ pass us an offnum + * exactly at the beginning of where equal tuples are to be found. When we're + * passed an offnum past the end of the page, we might still manage to stop + * the scan on this page by calling _bt_checkkeys against the high key. See + * _bt_readpage for full details. + * + * On entry, so->currPos must be pinned and locked (so offnum stays valid). + * Parallel scan callers must have seized the scan before calling here. + * + * On exit, we'll have updated so->currPos and retained locks and pins + * according to the same rules as those laid out for _bt_readnextpage exit. + * Like _bt_readnextpage, our return value indicates if there are any matching + * records in the given direction. + * + * We always release the scan for a parallel scan caller, regardless of + * success or failure; we'll call _bt_parallel_release as soon as possible. 
+ */ +static bool +_bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; - /* release the previous buffer, if pinned */ - BTScanPosUnpinIfPinned(so->currPos); + so->numKilled = 0; /* just paranoia */ + so->markItemIndex = -1; /* ditto */ + + /* Initialize so->currPos for the first page (page in so->currPos.buf) */ + if (so->needPrimScan) + { + Assert(so->numArrayKeys); + + so->currPos.moreLeft = true; + so->currPos.moreRight = true; + so->needPrimScan = false; } - else + else if (ScanDirectionIsForward(dir)) { - /* Remember we left a page with data */ + so->currPos.moreLeft = false; so->currPos.moreRight = true; + } + else + { + so->currPos.moreLeft = true; + so->currPos.moreRight = false; + } - if (scan->parallel_scan != NULL) - { - /* - * Seize the scan to get the current block number; if the scan has - * ended already, bail out. - */ - status = _bt_parallel_seize(scan, &blkno, false); - BTScanPosUnpinIfPinned(so->currPos); - if (!status) - { - BTScanPosInvalidate(so->currPos); - return false; - } - } - else - { - /* Not parallel, so just use our own notion of the current page */ - blkno = so->currPos.currPage; - } + /* + * Attempt to load matching tuples from the page in so->currPos.buf. + * + * Note that _bt_readpage will finish initializing the so->currPos fields. + * _bt_readpage also releases parallel scan (even when it returns false). + */ + if (_bt_readpage(scan, dir, offnum, true)) + { + /* + * _bt_readpage succeeded. Drop the lock (and maybe the pin) on + * so->currPos.buf in preparation for btgettuple returning tuples. 
+ */ + Assert(BTScanPosIsPinned(so->currPos)); + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + return true; } - if (!_bt_readnextpage(scan, blkno, dir)) - return false; + /* There's no actually-matching data on the page in so->currPos.buf */ + _bt_unlockbuf(scan->indexRelation, so->currPos.buf); - /* We have at least one item to return as scan's next item */ - _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + /* Call _bt_readnextpage using its _bt_steppage wrapper function */ + if (!_bt_steppage(scan, dir)) + return false; + /* _bt_readpage for a later page (now in so->currPos) succeeded */ return true; } /* - * _bt_readnextpage() -- Read next page containing valid data for scan + * _bt_readnextpage() -- Read next page containing valid data for _bt_next + * + * Caller's blkno is the next interesting page's link, taken from either the + * previously-saved right link or left link. lastcurrblkno is the page that + * was current at the point where the blkno link was saved, which we use to + * reason about concurrent page splits/page deletions during backwards scans + * (_bt_parallel_seize also requires it, regardless of scan direction). + * + * On entry, caller shouldn't hold any locks or pins on any page (we work + * directly off of blkno and lastcurrblkno instead). Parallel scan callers + * must have seized the scan before calling here (blkno and lastcurrblkno + * arguments should come from the seized scan). * * On success exit, so->currPos is updated to contain data from the next - * interesting page, and we return true. Caller must release the lock (and - * maybe the pin) on the buffer on success exit. + * interesting page, and we return true (parallel scan callers should not use + * so->currPos to determine which page to scan next, though). We hold a pin + * on the buffer on success exit, except when _bt_drop_lock_and_maybe_pin + * decided it was safe to eagerly drop the pin (to avoid blocking VACUUM). 
* * If there are no more matching records in the given direction, we drop all - * locks and pins, set so->currPos.buf to InvalidBuffer, and return false. + * locks and pins, invalidate so->currPos, and return false. + * + * We always release the scan for a parallel scan caller, regardless of + * success or failure; we'll call _bt_parallel_release as soon as possible. */ static bool -_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) +_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, + BlockNumber lastcurrblkno, ScanDirection dir) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; - Relation rel; Page page; BTPageOpaque opaque; - bool status; - rel = scan->indexRelation; + Assert(so->currPos.currPage == lastcurrblkno || scan->parallel_scan != NULL); + Assert(!BTScanPosIsPinned(so->currPos)); + /* + * Remember that the scan already read lastcurrblkno, a page to the left + * of blkno (or remember reading a page to the right, for backwards scans) + */ if (ScanDirectionIsForward(dir)) - { - for (;;) - { - /* - * if we're at end of scan, give up and mark parallel scan as - * done, so that all the workers can finish their scan - */ - if (blkno == P_NONE || !so->currPos.moreRight) - { - _bt_parallel_done(scan); - BTScanPosInvalidate(so->currPos); - return false; - } - /* check for interrupts while we're not holding any buffer lock */ - CHECK_FOR_INTERRUPTS(); - /* step right one page */ - so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ); - page = BufferGetPage(so->currPos.buf); - opaque = BTPageGetOpaque(page); - /* check for deleted page */ - if (!P_IGNORE(opaque)) - { - PredicateLockPage(rel, blkno, scan->xs_snapshot); - /* see if there are any matches on this page */ - /* note that this will clear moreRight if we can stop */ - if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), false)) - break; - } - else if (scan->parallel_scan != NULL) - { - /* allow next page be processed by parallel worker */ - 
_bt_parallel_release(scan, opaque->btpo_next); - } - - /* nope, keep going */ - if (scan->parallel_scan != NULL) - { - _bt_relbuf(rel, so->currPos.buf); - status = _bt_parallel_seize(scan, &blkno, false); - if (!status) - { - BTScanPosInvalidate(so->currPos); - return false; - } - } - else - { - blkno = opaque->btpo_next; - _bt_relbuf(rel, so->currPos.buf); - } - } - } + so->currPos.moreLeft = true; else + so->currPos.moreRight = true; + + for (;;) { /* - * Should only happen in parallel cases, when some other backend - * advanced the scan. + * if we're at end of scan, give up and mark parallel scan as done, so + * that all the workers can finish their scan */ - if (so->currPos.currPage != blkno) - { - BTScanPosUnpinIfPinned(so->currPos); - so->currPos.currPage = blkno; - } - - /* Done if we know that the left sibling link isn't of interest */ - if (!so->currPos.moreLeft) + if (blkno == P_NONE || + (ScanDirectionIsForward(dir) ? + !so->currPos.moreRight : !so->currPos.moreLeft)) { - BTScanPosUnpinIfPinned(so->currPos); _bt_parallel_done(scan); BTScanPosInvalidate(so->currPos); return false; } - /* - * Walk left to the next page with data. This is much more complex - * than the walk-right case because of the possibility that the page - * to our left splits while we are in flight to it, plus the - * possibility that the page we were on gets deleted after we leave - * it. See nbtree/README for details. - * - * It might be possible to rearrange this code to have less overhead - * in pinning and locking, but that would require capturing the left - * sibling block number when the page is initially read, and then - * optimistically starting there (rather than pinning the page twice). - * It is not clear that this would be worth the complexity. 
- */ - if (BTScanPosIsPinned(so->currPos)) - _bt_lockbuf(rel, so->currPos.buf, BT_READ); + if (ScanDirectionIsForward(dir)) + { + /* read blkno, but check for interrupts first */ + CHECK_FOR_INTERRUPTS(); + so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ); + } else - so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ); - - for (;;) { - /* Done if we know that the left sibling link isn't of interest */ - if (!so->currPos.moreLeft) + /* read blkno, avoiding race (also checks for interrupts) */ + so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno, + lastcurrblkno); + if (so->currPos.buf == InvalidBuffer) { - _bt_relbuf(rel, so->currPos.buf); + /* must have been a concurrent deletion of leftmost page */ _bt_parallel_done(scan); BTScanPosInvalidate(so->currPos); return false; } + } - /* Step to next physical page */ - so->currPos.buf = _bt_walk_left(rel, so->currPos.buf); - - /* if we're physically at end of index, return failure */ - if (so->currPos.buf == InvalidBuffer) + page = BufferGetPage(so->currPos.buf); + opaque = BTPageGetOpaque(page); + lastcurrblkno = blkno; + if (likely(!P_IGNORE(opaque))) + { + /* see if there are any matches on this page */ + if (ScanDirectionIsForward(dir)) { - _bt_parallel_done(scan); - BTScanPosInvalidate(so->currPos); - return false; + /* note that this will clear moreRight if we can stop */ + if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), false)) + break; + blkno = so->currPos.nextPage; } - - /* - * Okay, we managed to move left to a non-deleted page. Done if - * it's not half-dead and contains matching tuples. Else loop back - * and do it all again. 
- */ - page = BufferGetPage(so->currPos.buf); - opaque = BTPageGetOpaque(page); - if (!P_IGNORE(opaque)) + else { - PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot); - /* see if there are any matches on this page */ /* note that this will clear moreLeft if we can stop */ if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), false)) break; + blkno = so->currPos.prevPage; } - else if (scan->parallel_scan != NULL) - { - /* allow next page be processed by parallel worker */ - _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf)); - } - - /* - * For parallel scans, get the last page scanned as it is quite - * possible that by the time we try to seize the scan, some other - * worker has already advanced the scan to a different page. We - * must continue based on the latest page scanned by any worker. - */ + } + else + { + /* _bt_readpage not called, so do all this for ourselves */ + if (ScanDirectionIsForward(dir)) + blkno = opaque->btpo_next; + else + blkno = opaque->btpo_prev; if (scan->parallel_scan != NULL) - { - _bt_relbuf(rel, so->currPos.buf); - status = _bt_parallel_seize(scan, &blkno, false); - if (!status) - { - BTScanPosInvalidate(so->currPos); - return false; - } - so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ); - } + _bt_parallel_release(scan, blkno, lastcurrblkno); } - } - return true; -} + /* no matching tuples on this page */ + _bt_relbuf(rel, so->currPos.buf); -/* - * _bt_parallel_readpage() -- Read current page containing valid data for scan - * - * On success, release lock and maybe pin on buffer. We return true to - * indicate success. 
- */ -static bool -_bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) -{ - BTScanOpaque so = (BTScanOpaque) scan->opaque; - - Assert(!so->needPrimScan); - - _bt_initialize_more_data(so, dir); - - if (!_bt_readnextpage(scan, blkno, dir)) - return false; + /* parallel scan seizes another page (won't use so->currPos blkno) */ + if (scan->parallel_scan != NULL && + !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false)) + { + BTScanPosInvalidate(so->currPos); + return false; + } + } - /* We have at least one item to return as scan's next item */ + /* + * _bt_readpage succeeded. Drop the lock (and maybe the pin) on + * so->currPos.buf in preparation for btgettuple returning tuples. + */ + Assert(so->currPos.currPage == blkno); + Assert(BTScanPosIsPinned(so->currPos)); _bt_drop_lock_and_maybe_pin(scan, &so->currPos); return true; } /* - * _bt_walk_left() -- step left one page, if possible + * _bt_lock_and_validate_left() -- lock caller's left sibling blkno, + * recovering from concurrent page splits/page deletions when necessary + * + * Called during backwards scans, to deal with their unique concurrency rules. * - * The given buffer must be pinned and read-locked. This will be dropped - * before stepping left. On return, we have pin and read lock on the - * returned page, instead. + * blkno points to the block number of the page that we expect to move the + * scan to. We'll successfully move the scan there when we find that its + * right sibling link still points to lastcurrblkno (the page we just read). + * Otherwise, we have to figure out which page is the correct one for the scan + * to now read the hard way, reasoning about concurrent splits and deletions. + * See nbtree/README. * - * Returns InvalidBuffer if there is no page to the left (no lock is held - * in that case). + * On return, we have both a pin and a read lock on the returned page, whose + * block number will be set in *blkno. 
Returns InvalidBuffer if there is no + * page to the left (no lock or pin is held in that case). * * It is possible for the returned leaf page to be half-dead; caller must * check that condition and step left again when required. */ static Buffer -_bt_walk_left(Relation rel, Buffer buf) +_bt_lock_and_validate_left(Relation rel, BlockNumber *blkno, + BlockNumber lastcurrblkno) { - Page page; - BTPageOpaque opaque; - - page = BufferGetPage(buf); - opaque = BTPageGetOpaque(page); + BlockNumber origblkno = *blkno; /* detects circular links */ for (;;) { - BlockNumber obknum; - BlockNumber lblkno; - BlockNumber blkno; + Buffer buf; + Page page; + BTPageOpaque opaque; int tries; - /* if we're at end of tree, release buf and return failure */ - if (P_LEFTMOST(opaque)) - { - _bt_relbuf(rel, buf); - break; - } - /* remember original page we are stepping left from */ - obknum = BufferGetBlockNumber(buf); - /* step left */ - blkno = lblkno = opaque->btpo_prev; - _bt_relbuf(rel, buf); /* check for interrupts while we're not holding any buffer lock */ CHECK_FOR_INTERRUPTS(); - buf = _bt_getbuf(rel, blkno, BT_READ); + buf = _bt_getbuf(rel, *blkno, BT_READ); page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); @@ -2443,7 +2382,7 @@ _bt_walk_left(Relation rel, Buffer buf) * If this isn't the page we want, walk right till we find what we * want --- but go no more than four hops (an arbitrary limit). If we * don't find the correct page by then, the most likely bet is that - * the original page got deleted and isn't in the sibling chain at all + * lastcurrblkno got deleted and isn't in the sibling chain at all * anymore, not that its left sibling got split more than four times. 
* * Note that it is correct to test P_ISDELETED not P_IGNORE here, @@ -2452,21 +2391,27 @@ _bt_walk_left(Relation rel, Buffer buf) tries = 0; for (;;) { - if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum) + if (likely(!P_ISDELETED(opaque) && + opaque->btpo_next == lastcurrblkno)) { /* Found desired page, return it */ return buf; } if (P_RIGHTMOST(opaque) || ++tries > 4) break; - blkno = opaque->btpo_next; - buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ); + /* step right */ + *blkno = opaque->btpo_next; + buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ); page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); } - /* Return to the original page to see what's up */ - buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ); + /* + * Return to the original page (usually the page most recently read by + * _bt_readpage, which is passed by caller as lastcurrblkno) to see + * what's up with its prev sibling link + */ + buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ); page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); if (P_ISDELETED(opaque)) @@ -2482,31 +2427,42 @@ _bt_walk_left(Relation rel, Buffer buf) if (P_RIGHTMOST(opaque)) elog(ERROR, "fell off the end of index \"%s\"", RelationGetRelationName(rel)); - blkno = opaque->btpo_next; - buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ); + lastcurrblkno = opaque->btpo_next; + buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ); page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); if (!P_ISDELETED(opaque)) break; } - - /* - * Now return to top of loop, resetting obknum to point to this - * nondeleted page, and try again. - */ } else { /* - * It wasn't deleted; the explanation had better be that the page - * to the left got split or deleted. Without this check, we'd go - * into an infinite loop if there's anything wrong. + * Original lastcurrblkno wasn't deleted; the explanation had + * better be that the page to the left got split or deleted. 
+ * Without this check, we risk going into an infinite loop. */ - if (opaque->btpo_prev == lblkno) + if (opaque->btpo_prev == origblkno) elog(ERROR, "could not find left sibling of block %u in index \"%s\"", - obknum, RelationGetRelationName(rel)); - /* Okay to try again with new lblkno value */ + lastcurrblkno, RelationGetRelationName(rel)); + /* Okay to try again, since left sibling link changed */ } + + /* + * Original lastcurrblkno from caller was concurrently deleted (could + * also have been a great many concurrent left sibling page splits). + * Found a non-deleted page that should now act as our lastcurrblkno. + */ + if (P_LEFTMOST(opaque)) + { + /* New lastcurrblkno has no left sibling (concurrently deleted) */ + _bt_relbuf(rel, buf); + break; + } + + /* Start from scratch with new lastcurrblkno's blkno/prev link */ + *blkno = origblkno = opaque->btpo_prev; + _bt_relbuf(rel, buf); } return InvalidBuffer; @@ -2606,7 +2562,6 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) { Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; - Buffer buf; Page page; BTPageOpaque opaque; OffsetNumber start; @@ -2617,9 +2572,9 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) * version of _bt_search(). We don't maintain a stack since we know we * won't need it. */ - buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir)); + so->currPos.buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir)); - if (!BufferIsValid(buf)) + if (!BufferIsValid(so->currPos.buf)) { /* * Empty index. 
Lock the whole relation, as nothing finer to lock @@ -2630,8 +2585,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) return false; } - PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot); - page = BufferGetPage(buf); + page = BufferGetPage(so->currPos.buf); opaque = BTPageGetOpaque(page); Assert(P_ISLEAF(opaque)); @@ -2654,31 +2608,14 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) start = 0; /* keep compiler quiet */ } - /* remember which buffer we have pinned */ - so->currPos.buf = buf; - - _bt_initialize_more_data(so, dir); - /* * Now load data from the first page of the scan. */ - if (!_bt_readpage(scan, dir, start, true)) - { - /* - * There's no actually-matching data on this page. Try to advance to - * the next page. Return false if there's no matching data at all. - */ - _bt_unlockbuf(scan->indexRelation, so->currPos.buf); - if (!_bt_steppage(scan, dir)) - return false; - } - else - { - /* We have at least one item to return as scan's first item */ - _bt_drop_lock_and_maybe_pin(scan, &so->currPos); - } + if (!_bt_readfirstpage(scan, start, dir)) + return false; /* OK, itemIndex says what to return */ + Assert(BTScanPosIsValid(so->currPos)); currItem = &so->currPos.items[so->currPos.itemIndex]; scan->xs_heaptid = currItem->heapTid; if (scan->xs_want_itup) @@ -2686,33 +2623,3 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) return true; } - -/* - * _bt_initialize_more_data() -- initialize moreLeft, moreRight and scan dir - * from currPos - */ -static inline void -_bt_initialize_more_data(BTScanOpaque so, ScanDirection dir) -{ - so->currPos.dir = dir; - if (so->needPrimScan) - { - Assert(so->numArrayKeys); - - so->currPos.moreLeft = true; - so->currPos.moreRight = true; - so->needPrimScan = false; - } - else if (ScanDirectionIsForward(dir)) - { - so->currPos.moreLeft = false; - so->currPos.moreRight = true; - } - else - { - so->currPos.moreLeft = true; - so->currPos.moreRight = false; - } - so->numKilled = 0; /* just 
paranoia */ - so->markItemIndex = -1; /* ditto */ -} diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 61ded00ca24..76be65123c8 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -2407,7 +2407,7 @@ new_prim_scan: so->needPrimScan = true; /* ...but call _bt_first again */ if (scan->parallel_scan) - _bt_parallel_primscan_schedule(scan, pstate->prev_scan_page); + _bt_parallel_primscan_schedule(scan, so->currPos.currPage); /* Caller's tuple doesn't match the new qual */ return false; diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 90a68375a2e..5fb523ecec3 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -925,13 +925,13 @@ typedef BTVacuumPostingData *BTVacuumPosting; * Index scans work a page at a time: we pin and read-lock the page, identify * all the matching items on the page and save them in BTScanPosData, then * release the read-lock while returning the items to the caller for - * processing. This approach minimizes lock/unlock traffic. Note that we - * keep the pin on the index page until the caller is done with all the items - * (this is needed for VACUUM synchronization, see nbtree/README). When we - * are ready to step to the next page, if the caller has told us any of the - * items were killed, we re-lock the page to mark them killed, then unlock. - * Finally we drop the pin and step to the next page in the appropriate - * direction. + * processing. This approach minimizes lock/unlock traffic. We must always + * drop the lock to make it okay for caller to process the returned items. + * Whether or not we can also release the pin during this window will vary. + * We drop the pin eagerly (when safe) to avoid blocking progress by VACUUM + * (see nbtree/README section about making concurrent TID recycling safe). 
+ * We'll always release both the lock and the pin on the current page before + * moving on to its sibling page. * * If we are doing an index-only scan, we save the entire IndexTuple for each * matched item, otherwise only its heap TID and offset. The IndexTuples go @@ -950,28 +950,15 @@ typedef struct BTScanPosItem /* what we remember about each match */ typedef struct BTScanPosData { - Buffer buf; /* if valid, the buffer is pinned */ + Buffer buf; /* currPage buf (invalid means unpinned) */ - XLogRecPtr lsn; /* pos in the WAL stream when page was read */ + /* page details as of the saved position's call to _bt_readpage */ BlockNumber currPage; /* page referenced by items array */ - BlockNumber nextPage; /* page's right link when we scanned it */ + BlockNumber prevPage; /* currPage's left link */ + BlockNumber nextPage; /* currPage's right link */ + XLogRecPtr lsn; /* currPage's LSN */ - /* - * moreLeft and moreRight track whether we think there may be matching - * index entries to the left and right of the current page, respectively. - * We can clear the appropriate one of these flags when _bt_checkkeys() - * sets BTReadPageState.continuescan = false. - */ - bool moreLeft; - bool moreRight; - - /* - * Direction of the scan at the time that _bt_readpage was called. - * - * Used by btrestrpos to "restore" the scan's array keys by resetting each - * array to its first element's value (first in this scan direction). This - * avoids the need to directly track the array keys in btmarkpos. - */ + /* scan direction for the saved position's call to _bt_readpage */ ScanDirection dir; /* @@ -980,6 +967,13 @@ typedef struct BTScanPosData */ int nextTupleOffset; + /* + * moreLeft and moreRight track whether we think there may be matching + * index entries to the left and right of the current page, respectively. + */ + bool moreLeft; + bool moreRight; + /* * The items array is always ordered in index order (ie, increasing * indexoffset). 
When scanning backwards it is convenient to fill the @@ -1021,11 +1015,8 @@ typedef BTScanPosData *BTScanPos; ) #define BTScanPosInvalidate(scanpos) \ do { \ - (scanpos).currPage = InvalidBlockNumber; \ - (scanpos).nextPage = InvalidBlockNumber; \ (scanpos).buf = InvalidBuffer; \ - (scanpos).lsn = InvalidXLogRecPtr; \ - (scanpos).nextTupleOffset = 0; \ + (scanpos).currPage = InvalidBlockNumber; \ } while (0) /* We need one of these for each equality-type SK_SEARCHARRAY scan key */ @@ -1091,7 +1082,6 @@ typedef struct BTReadPageState OffsetNumber minoff; /* Lowest non-pivot tuple's offset */ OffsetNumber maxoff; /* Highest non-pivot tuple's offset */ IndexTuple finaltup; /* Needed by scans with array keys */ - BlockNumber prev_scan_page; /* previous _bt_parallel_release block */ Page page; /* Page being read */ /* Per-tuple input parameters, set by _bt_readpage for _bt_checkkeys */ @@ -1192,12 +1182,14 @@ extern int btgettreeheight(Relation rel); /* * prototypes for internal functions in nbtree.c */ -extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, - bool first); -extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page); +extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, + BlockNumber *last_curr_page, bool first); +extern void _bt_parallel_release(IndexScanDesc scan, + BlockNumber next_scan_page, + BlockNumber curr_page); extern void _bt_parallel_done(IndexScanDesc scan); extern void _bt_parallel_primscan_schedule(IndexScanDesc scan, - BlockNumber prev_scan_page); + BlockNumber curr_page); /* * prototypes for functions in nbtdedup.c -- 2.30.2