diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c index 387d99723453..16fda28d4ad0 100644 --- a/src/backend/access/gist/gistget.c +++ b/src/backend/access/gist/gistget.c @@ -17,6 +17,7 @@ #include "access/genam.h" #include "access/gist_private.h" #include "access/relscan.h" +#include "access/tableam.h" #include "lib/pairingheap.h" #include "miscadmin.h" #include "pgstat.h" @@ -394,10 +395,14 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, return; } - so->nPageData = so->curPageData = 0; + if (scan->numberOfOrderBys) + so->os.nsortData = 0; + else + so->nos.nPageData = so->nos.curPageData = 0; + scan->xs_hitup = NULL; /* might point into pageDataCxt */ - if (so->pageDataCxt) - MemoryContextReset(so->pageDataCxt); + if (so->nos.pageDataCxt) + MemoryContextReset(so->nos.pageDataCxt); /* * We save the LSN of the page as we read it, so that we know whether it @@ -457,9 +462,9 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, /* * Non-ordered scan, so report tuples in so->pageData[] */ - so->pageData[so->nPageData].heapPtr = it->t_tid; - so->pageData[so->nPageData].recheck = recheck; - so->pageData[so->nPageData].offnum = i; + so->nos.pageData[so->nos.nPageData].heapPtr = it->t_tid; + so->nos.pageData[so->nos.nPageData].recheck = recheck; + so->nos.pageData[so->nos.nPageData].offnum = i; /* * In an index-only scan, also fetch the data from the tuple. The @@ -467,12 +472,12 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, */ if (scan->xs_want_itup) { - oldcxt = MemoryContextSwitchTo(so->pageDataCxt); - so->pageData[so->nPageData].recontup = + oldcxt = MemoryContextSwitchTo(so->nos.pageDataCxt); + so->nos.pageData[so->nos.nPageData].recontup = gistFetchTuple(giststate, r, it); MemoryContextSwitchTo(oldcxt); } - so->nPageData++; + so->nos.nPageData++; } else { @@ -501,7 +506,11 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, * In an index-only scan, also fetch the data from the tuple. */ if (scan->xs_want_itup) + { item->data.heap.recontup = gistFetchTuple(giststate, r, it); + so->os.sortData[so->os.nsortData] = &item->data.heap; + so->os.nsortData += 1; + } } else { @@ -526,7 +535,101 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, } } - UnlockReleaseBuffer(buffer); + /* Allow writes to the buffer, but don't yet allow VACUUM */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + /* + * If we're in an index-only scan, we need to do visibility checks before + * we release the pin, so that VACUUM can't clean up dead tuples from this + * index page and mark the page ALL_VISIBLE before the tuple was returned. + * + * See also docs section "Index Locking Considerations". 
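+ *
+ * Doing the checks while we still hold the index page pin is what makes
+ * this safe: gistvacuumpage() now takes a cleanup lock, so VACUUM cannot
+ * remove the dead tuples referenced by this page (and subsequently allow
+ * their heap pages to be marked all-visible) until the pin is dropped.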
+ */ + if (scan->xs_want_itup) + { + TM_IndexVisibilityCheckOp op; + op.vmbuf = &so->vmbuf; + + if (scan->numberOfOrderBys > 0) + { + op.checkntids = so->os.nsortData; + + if (op.checkntids > 0) + { + op.checktids = palloc(op.checkntids * sizeof(TM_VisCheck)); + + for (int off = 0; off < op.checkntids; off++) + { + Assert(ItemPointerIsValid(&so->os.sortData[off]->heapPtr)); + + PopulateTMVischeck(&op.checktids[off], + &so->os.sortData[off]->heapPtr, + off); + } + } + } + else + { + op.checkntids = so->nos.nPageData; + + if (op.checkntids > 0) + { + op.checktids = palloc_array(TM_VisCheck, op.checkntids); + + for (int off = 0; off < op.checkntids; off++) + { + Assert(ItemPointerIsValid(&so->nos.pageData[off].heapPtr)); + + PopulateTMVischeck(&op.checktids[off], + &so->nos.pageData[off].heapPtr, + off); + } + } + } + + if (op.checkntids > 0) + { + table_index_vischeck_tuples(scan->heapRelation, &op); + + if (scan->numberOfOrderBys > 0) + { + for (int off = 0; off < op.checkntids; off++) + { + TM_VisCheck *check = &op.checktids[off]; + GISTSearchHeapItem *item = so->os.sortData[check->idxoffnum]; + + /* sanity checks */ + Assert(check->idxoffnum < op.checkntids); + Assert(check->tidblkno == ItemPointerGetBlockNumberNoCheck(&item->heapPtr)); + Assert(check->tidoffset == ItemPointerGetOffsetNumberNoCheck(&item->heapPtr)); + + item->visrecheck = check->vischeckresult; + } + /* reset state */ + so->os.nsortData = 0; + } + else + { + for (int off = 0; off < op.checkntids; off++) + { + TM_VisCheck *check = &op.checktids[off]; + GISTSearchHeapItem *item = &so->nos.pageData[check->idxoffnum]; + + Assert(check->idxoffnum < op.checkntids); + Assert(check->tidblkno == ItemPointerGetBlockNumberNoCheck(&item->heapPtr)); + Assert(check->tidoffset == ItemPointerGetOffsetNumberNoCheck(&item->heapPtr)); + + item->visrecheck = check->vischeckresult; + } + } + + /* clean up the used resources */ + pfree(op.checktids); + } + } + + /* Allow VACUUM to process the buffer again */ + ReleaseBuffer(buffer); } /* @@ -588,7 +691,10 @@ getNextNearest(IndexScanDesc scan) /* in an index-only scan, also return the reconstructed tuple. 
*/ if (scan->xs_want_itup) + { scan->xs_hitup = item->data.heap.recontup; + scan->xs_visrecheck = item->data.heap.visrecheck; + } res = true; } else @@ -629,10 +735,10 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir) scan->instrument->nsearches++; so->firstCall = false; - so->curPageData = so->nPageData = 0; + so->nos.curPageData = so->nos.nPageData = 0; scan->xs_hitup = NULL; - if (so->pageDataCxt) - MemoryContextReset(so->pageDataCxt); + if (so->nos.pageDataCxt) + MemoryContextReset(so->nos.pageDataCxt); fakeItem.blkno = GIST_ROOT_BLKNO; memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN)); @@ -649,9 +755,9 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir) /* Fetch tuples index-page-at-a-time */ for (;;) { - if (so->curPageData < so->nPageData) + if (so->nos.curPageData < so->nos.nPageData) { - if (scan->kill_prior_tuple && so->curPageData > 0) + if (scan->kill_prior_tuple && so->nos.curPageData > 0) { if (so->killedItems == NULL) @@ -667,17 +773,20 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir) } if (so->numKilled < MaxIndexTuplesPerPage) so->killedItems[so->numKilled++] = - so->pageData[so->curPageData - 1].offnum; + so->nos.pageData[so->nos.curPageData - 1].offnum; } /* continuing to return tuples from a leaf page */ - scan->xs_heaptid = so->pageData[so->curPageData].heapPtr; - scan->xs_recheck = so->pageData[so->curPageData].recheck; + scan->xs_heaptid = so->nos.pageData[so->nos.curPageData].heapPtr; + scan->xs_recheck = so->nos.pageData[so->nos.curPageData].recheck; /* in an index-only scan, also return the reconstructed tuple */ if (scan->xs_want_itup) - scan->xs_hitup = so->pageData[so->curPageData].recontup; + { + scan->xs_hitup = so->nos.pageData[so->nos.curPageData].recontup; + scan->xs_visrecheck = so->nos.pageData[so->nos.curPageData].visrecheck; + } - so->curPageData++; + so->nos.curPageData++; return true; } @@ -687,8 +796,8 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir) * necessary */ if (scan->kill_prior_tuple - && so->curPageData > 0 - && so->curPageData == so->nPageData) + && so->nos.curPageData > 0 + && so->nos.curPageData == so->nos.nPageData) { if (so->killedItems == NULL) @@ -704,7 +813,7 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir) } if (so->numKilled < MaxIndexTuplesPerPage) so->killedItems[so->numKilled++] = - so->pageData[so->curPageData - 1].offnum; + so->nos.pageData[so->nos.curPageData - 1].offnum; } /* find and process the next index page */ do @@ -733,7 +842,7 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir) gistScanPage(scan, item, item->distances, NULL, NULL); pfree(item); - } while (so->nPageData == 0); + } while (so->nos.nPageData == 0); } } } @@ -756,10 +865,10 @@ gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) scan->instrument->nsearches++; /* Begin the scan by processing the root page */ - so->curPageData = so->nPageData = 0; + so->nos.curPageData = so->nos.nPageData = 0; scan->xs_hitup = NULL; - if (so->pageDataCxt) - MemoryContextReset(so->pageDataCxt); + if (so->nos.pageDataCxt) + MemoryContextReset(so->nos.pageDataCxt); fakeItem.blkno = GIST_ROOT_BLKNO; memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN)); diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c index 700fa959d03d..a59672ce9791 100644 --- a/src/backend/access/gist/gistscan.c +++ b/src/backend/access/gist/gistscan.c @@ -204,9 +204,9 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys, scan->xs_hitupdesc = so->giststate->fetchTupdesc; /* Also create a memory context that will hold the returned tuples */ - 
so->pageDataCxt = AllocSetContextCreate(so->giststate->scanCxt, - "GiST page data context", - ALLOCSET_DEFAULT_SIZES); + so->nos.pageDataCxt = AllocSetContextCreate(so->giststate->scanCxt, + "GiST page data context", + ALLOCSET_DEFAULT_SIZES); } /* create new, empty pairing heap for search queue */ @@ -347,6 +347,11 @@ void gistendscan(IndexScanDesc scan) { GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + if (BufferIsValid(so->vmbuf)) + { + ReleaseBuffer(so->vmbuf); + so->vmbuf = InvalidBuffer; + } /* * freeGISTstate is enough to clean up everything made by gistbeginscan, diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index dca236b6e573..dfe4acb64187 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -326,10 +326,10 @@ gistvacuumpage(GistVacState *vstate, Buffer buffer) recurse_to = InvalidBlockNumber; /* - * We are not going to stay here for a long time, aggressively grab an - * exclusive lock. + * We are not going to stay here for a long time, aggressively grab a + * cleanup lock. */ - LockBuffer(buffer, GIST_EXCLUSIVE); + LockBufferForCleanup(buffer); page = (Page) BufferGetPage(buffer); if (gistPageRecyclable(page)) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index c1a4de14a59e..34acd2c06c0e 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -101,11 +101,37 @@ static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status uint16 infomask, Relation rel, int *remaining, bool logLockFailure); static void index_delete_sort(TM_IndexDeleteOp *delstate); +static inline int heap_ivc_process_block(Relation rel, Buffer *vmbuf, + TM_VisCheck *checks, int nchecks); +static void heap_ivc_process_all(Relation rel, Buffer *vmbuf, + TM_VisCheck *checks, int nchecks); static int bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate); static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup); static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required, bool *copy); +/* sort template definitions for index */ +#define ST_SORT heap_ivc_sortby_tidheapblk +#define ST_ELEMENT_TYPE TM_VisCheck +#define ST_DECLARE +#define ST_DEFINE +#define ST_SCOPE static inline +#define ST_COMPARE(a, b) ( \ + a->tidblkno < b->tidblkno ? -1 : ( \ + a->tidblkno > b->tidblkno ? 1 : 0 \ + ) \ +) + +#include "lib/sort_template.h" + +#define ST_SORT heap_ivc_sortby_idx +#define ST_ELEMENT_TYPE TM_VisCheck +#define ST_DECLARE +#define ST_DEFINE +#define ST_SCOPE static inline +#define ST_COMPARE(a, b) (((int) a->idxoffnum) - ((int) b->idxoffnum)) +#include "lib/sort_template.h" + /* * Each tuple lock mode has a corresponding heavyweight lock, and one or two @@ -8750,6 +8776,157 @@ bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate) return nblocksfavorable; } +/* + * heapam implementation of tableam's index_vischeck_tuples interface. + * + * This helper function is called by index AMs during index-only scans, + * to do VM-based visibility checks on individual tuples, so that the AM + * can hold the tuple in memory for e.g. reordering for extended periods of + * time while without holding thousands of pins to conflict with VACUUM. + * + * It's possible for this to generate a fair amount of I/O, since we may be + * checking hundreds of tuples from a single index block, but that is + * preferred over holding thousands of pins. + * + * We use heuristics to balance the costs of sorting TIDs with VM page + * lookups. 
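+ *
+ * Concretely: TIDs that fall on the VM page we already have pinned are
+ * handled up front.  For the remainder, if stepping through them in the
+ * given (index) order would switch VM pages no more than about
+ * ceil(log2(ntids)) times, we just probe the VM in that order; otherwise
+ * we sort the checks by heap block number so each VM page is visited at
+ * most once, and sort them back into index order afterwards.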
+ */ +void +heap_index_vischeck_tuples(Relation rel, TM_IndexVisibilityCheckOp *checkop) +{ + Buffer vmbuf = *checkop->vmbuf; + Buffer storvmbuf = vmbuf; + TM_VisCheck *checks = checkop->checktids; + int checkntids = checkop->checkntids; + int upcomingvmbufchanges = 0; + + /* + * The first index scan will have to pin the VM buffer, and that first + * change in the vm buffer shouldn't put us into the expensive VM page & + * sort path; so we special-case this operation. + */ + if (!BufferIsValid(vmbuf)) + { + int processed; + processed = heap_ivc_process_block(rel, &vmbuf, checks,checkntids); + checkntids -= processed; + checks += processed; + storvmbuf = vmbuf; + Assert(processed > 0); + } + + while (vmbuf == storvmbuf && checkntids > 0) + { + int processed; + + processed = heap_ivc_process_block(rel, &vmbuf, checks,checkntids); + + Assert(processed <= checkntids); + + checkntids -= processed; + checks += processed; + } + + *checkop->vmbuf = vmbuf; + + if (checkntids == 0) + { + return; + } + + upcomingvmbufchanges = 0; + + for (int i = 1; i < checkntids; i++) + { + /* + * Instead of storing the previous iteration's result, we only match + * the block numbers + */ + BlockNumber lastblkno = checks[i - 1].tidblkno; + BlockNumber newblkno = checks[i].tidblkno; + /* + * divide-by-constant can be faster than BufferGetBlockNumber() + */ + BlockNumber lastvmblkno = HEAPBLK_TO_VMBLOCK(lastblkno); + BlockNumber newvmblkno = HEAPBLK_TO_VMBLOCK(newblkno); + + if (lastvmblkno != newvmblkno) + upcomingvmbufchanges++; + } + + if (upcomingvmbufchanges <= pg_ceil_log2_32(checkntids)) + { + /* + * No big amount of VM buf changes, so do all visibility checks + * without sorting. + */ + heap_ivc_process_all(rel, checkop->vmbuf, checks, checkntids); + + return; + } + + /* + * Order the TIDs to heap order, so that we will only need to visit every + * VM page at most once. + */ + heap_ivc_sortby_tidheapblk(checks, checkntids); + + /* do all visibility checks */ + heap_ivc_process_all(rel, checkop->vmbuf, checks, checkntids); + + /* put the checks back in index order */ + heap_ivc_sortby_idx(checks, checkntids); +} + + +static inline int +heap_ivc_process_block(Relation rel, Buffer *vmbuf, TM_VisCheck *checks, + int nchecks) +{ + BlockNumber blkno; + BlockNumber prevblkno = blkno = checks->tidblkno; + TMVC_Result result; + int processed = 0; + + if (VM_ALL_VISIBLE(rel, blkno, vmbuf)) + result = TMVC_Visible; + else + result = TMVC_MaybeVisible; + + do + { + checks->vischeckresult = result; + + nchecks--; + processed++; + checks++; + + if (nchecks <= 0) + return processed; + + blkno = checks->tidblkno; + } while (blkno == prevblkno); + + return processed; +} + +static void +heap_ivc_process_all(Relation rel, Buffer *vmbuf, + TM_VisCheck *checks, int nchecks) +{ + while (nchecks > 0) + { + int processed; + + processed = heap_ivc_process_block(rel, vmbuf, checks, nchecks); + + Assert(processed <= nchecks); + + nchecks -= processed; + checks += processed; + } +} + /* * Perform XLogInsert for a heap-visible operation. 
'block' is the block * being marked all-visible, and vm_buffer is the buffer containing the diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index ac082fefa77a..fe4b0b39da7f 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -2648,6 +2648,7 @@ static const TableAmRoutine heapam_methods = { .tuple_tid_valid = heapam_tuple_tid_valid, .tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot, .index_delete_tuples = heap_index_delete_tuples, + .index_vischeck_tuples = heap_index_vischeck_tuples, .relation_set_new_filelocator = heapam_relation_set_new_filelocator, .relation_nontransactional_truncate = heapam_relation_nontransactional_truncate, diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 745a04ef26e2..ae71c0a6d6eb 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -107,17 +107,6 @@ */ #define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData)) -/* Number of heap blocks we can represent in one byte */ -#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK) - -/* Number of heap blocks we can represent in one visibility map page. */ -#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE) - -/* Mapping from heap block number to the right bit in the visibility map */ -#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE) -#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE) -#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK) - /* Masks for counting subsets of bits in the visibility map. */ #define VISIBLE_MASK8 (0x55) /* The lower bit of each bit pair */ #define FROZEN_MASK8 (0xaa) /* The upper bit of each bit pair */ @@ -137,9 +126,9 @@ static Buffer vm_extend(Relation rel, BlockNumber vm_nblocks); bool visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer vmbuf, uint8 flags) { - BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); - int mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); - int mapOffset = HEAPBLK_TO_OFFSET(heapBlk); + BlockNumber mapBlock = HEAPBLK_TO_VMBLOCK(heapBlk); + int mapByte = HEAPBLK_TO_VMBYTE(heapBlk); + int mapOffset = HEAPBLK_TO_VMOFFSET(heapBlk); uint8 mask = flags << mapOffset; char *map; bool cleared = false; @@ -190,7 +179,7 @@ visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer vmbuf, uint8 flags void visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *vmbuf) { - BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); + BlockNumber mapBlock = HEAPBLK_TO_VMBLOCK(heapBlk); /* Reuse the old pinned buffer if possible */ if (BufferIsValid(*vmbuf)) @@ -214,7 +203,7 @@ visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *vmbuf) bool visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf) { - BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); + BlockNumber mapBlock = HEAPBLK_TO_VMBLOCK(heapBlk); return BufferIsValid(vmbuf) && BufferGetBlockNumber(vmbuf) == mapBlock; } @@ -247,9 +236,9 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid, uint8 flags) { - BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); - uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); - uint8 mapOffset = HEAPBLK_TO_OFFSET(heapBlk); + BlockNumber mapBlock = HEAPBLK_TO_VMBLOCK(heapBlk); + uint32 mapByte = HEAPBLK_TO_VMBYTE(heapBlk); + uint8 mapOffset = HEAPBLK_TO_VMOFFSET(heapBlk); Page page; uint8 *map; uint8 status; @@ -340,9 
+329,9 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, uint8 visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf) { - BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); - uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); - uint8 mapOffset = HEAPBLK_TO_OFFSET(heapBlk); + BlockNumber mapBlock = HEAPBLK_TO_VMBLOCK(heapBlk); + uint32 mapByte = HEAPBLK_TO_VMBYTE(heapBlk); + uint8 mapOffset = HEAPBLK_TO_VMOFFSET(heapBlk); char *map; uint8 result; @@ -445,9 +434,9 @@ visibilitymap_prepare_truncate(Relation rel, BlockNumber nheapblocks) BlockNumber newnblocks; /* last remaining block, byte, and bit */ - BlockNumber truncBlock = HEAPBLK_TO_MAPBLOCK(nheapblocks); - uint32 truncByte = HEAPBLK_TO_MAPBYTE(nheapblocks); - uint8 truncOffset = HEAPBLK_TO_OFFSET(nheapblocks); + BlockNumber truncBlock = HEAPBLK_TO_VMBLOCK(nheapblocks); + uint32 truncByte = HEAPBLK_TO_VMBYTE(nheapblocks); + uint8 truncOffset = HEAPBLK_TO_VMOFFSET(nheapblocks); #ifdef TRACE_VISIBILITYMAP elog(DEBUG1, "vm_truncate %s %d", RelationGetRelationName(rel), nheapblocks); diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 219df1971da6..61d1f08220de 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -628,6 +628,12 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) /* XXX: we should assert that a snapshot is pushed or registered */ Assert(TransactionIdIsValid(RecentXmin)); + /* + * Reset xs_visrecheck, so we don't confuse the next tuple's visibility + * state with that of the previous. + */ + scan->xs_visrecheck = TMVC_Unchecked; + /* * The AM's amgettuple proc finds the next index entry matching the scan * keys, and puts the TID into scan->xs_heaptid. It should also set diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 765659887af7..96217d7baf04 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -360,6 +360,10 @@ btbeginscan(Relation rel, int nkeys, int norderbys) so->killedItems = NULL; /* until needed */ so->numKilled = 0; + so->vmbuf = InvalidBuffer; + so->vischeckcap = 0; + so->vischecksbuf = NULL; + /* * We don't know yet whether the scan will be index-only, so we do not * allocate the tuple workspace arrays until btrescan. However, we set up @@ -400,6 +404,12 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, BTScanPosUnpinIfPinned(so->markPos); BTScanPosInvalidate(so->markPos); + if (BufferIsValid(so->vmbuf)) + { + ReleaseBuffer(so->vmbuf); + so->vmbuf = InvalidBuffer; + } + /* * Allocate tuple workspace arrays, if needed for an index-only scan and * not already done in a previous rescan call. To save on palloc @@ -451,6 +461,17 @@ btendscan(IndexScanDesc scan) so->markItemIndex = -1; BTScanPosUnpinIfPinned(so->markPos); + if (so->vischecksbuf) + pfree(so->vischecksbuf); + so->vischecksbuf = NULL; + so->vischeckcap = 0; + + if (BufferIsValid(so->vmbuf)) + { + ReleaseBuffer(so->vmbuf); + so->vmbuf = InvalidBuffer; + } + /* No need to invalidate positions, the RAM is about to be freed. 
*/ /* Release storage */ diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 77264ddeecb5..ed173bf72462 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -25,7 +25,7 @@ #include "utils/rel.h" -static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); +static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp, BTScanOpaque so); static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access); @@ -54,6 +54,12 @@ static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno, static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); +/* + * Execute vischecks at the index level? + * Enabled by default. + */ +#define DEBUG_IOS_VISCHECKS_ENABLED true + /* * _bt_drop_lock_and_maybe_pin() * @@ -64,13 +70,109 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); * See nbtree/README section on making concurrent TID recycling safe. */ static void -_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp) +_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp, BTScanOpaque so) { _bt_unlockbuf(scan->indexRelation, sp->buf); + /* + * Do some visibility checks if this is an index-only scan; allowing us to + * drop the pin on this page before we have returned all tuples from this + * IOS to the executor. + */ + if (scan->xs_want_itup && DEBUG_IOS_VISCHECKS_ENABLED) + { + int initOffset = sp->firstItem; + int ntids = 1 + sp->lastItem - initOffset; + + if (ntids > 0) + { + TM_IndexVisibilityCheckOp visCheck; + Relation heaprel = scan->heapRelation; + TM_VisCheck *check; + BTScanPosItem *item; + + visCheck.checkntids = ntids; + + if (so->vischeckcap == 0) + { + so->vischecksbuf = palloc_array(TM_VisCheck, ntids); + so->vischeckcap = ntids; + } + else if (so->vischeckcap < visCheck.checkntids) + { + so->vischecksbuf = repalloc_array(so->vischecksbuf, + TM_VisCheck, ntids); + so->vischeckcap = ntids; + } + + visCheck.checktids = so->vischecksbuf; + visCheck.vmbuf = &so->vmbuf; + + check = so->vischecksbuf; + item = &so->currPos.items[initOffset]; + + for (int i = 0; i < visCheck.checkntids; i++) + { + Assert(item->visrecheck == TMVC_Unchecked); + Assert(ItemPointerIsValid(&item->heapTid)); + + PopulateTMVischeck(check, &item->heapTid, initOffset + i); + + item++; + check++; + } + + table_index_vischeck_tuples(heaprel, &visCheck); + check = so->vischecksbuf; + + for (int i = 0; i < visCheck.checkntids; i++) + { + item = &so->currPos.items[check->idxoffnum]; + /* We must have a valid visibility check result */ + Assert(check->vischeckresult != TMVC_Unchecked); + /* The offset number should still indicate the right item */ + Assert(check->tidblkno == ItemPointerGetBlockNumberNoCheck(&item->heapTid)); + Assert(check->tidoffset == ItemPointerGetOffsetNumberNoCheck(&item->heapTid)); + + /* Store the visibility check result */ + item->visrecheck = check->vischeckresult; + check++; + } + } + } + + /* + * We may need to hold a pin on the page for one of several reasons: + * + * 1.) To safely apply kill_prior_tuple, we need to know that the tuples + * were not removed from the page (and subsequently re-inserted). 
+ * A page's LSN can also allow us to detect modifications on the page, + * which then allows us to bail out of setting the hint bits, but that + * requires the index to be WAL-logged; so unless the index is WAL-logged + * we must hold a pin on the page to apply the kill_prior_tuple + * optimization. + * + * 2.) Non-MVCC scans need pin coupling to make sure the scan covers + * exactly the whole index keyspace. + * + * 3.) For Index-Only Scans, the scan needs to check the visibility of the + * table tuple while the relevant index tuple is guaranteed to still be + * contained in the index (so that vacuum hasn't yet marked any pages that + * could contain the value as ALL_VISIBLE after reclaiming a dead tuple + * that might be buffered in the scan). A pin must therefore be held + * at least while the basic visibility of the page's tuples is being + * checked. + * + * For cases 1 and 2, we must hold the pin after we've finished processing + * the index page. + * + * For case 3, we can release the pin if we first do the visibility checks + * of to-be-returned tuples using table_index_vischeck_tuples, which we've + * done just above. + */ if (IsMVCCSnapshot(scan->xs_snapshot) && RelationNeedsWAL(scan->indexRelation) && - !scan->xs_want_itup) + (!scan->xs_want_itup || DEBUG_IOS_VISCHECKS_ENABLED)) { ReleaseBuffer(sp->buf); sp->buf = InvalidBuffer; @@ -2001,6 +2103,8 @@ _bt_saveitem(BTScanOpaque so, int itemIndex, currItem->heapTid = itup->t_tid; currItem->indexOffset = offnum; + currItem->visrecheck = TMVC_Unchecked; + if (so->currTuples) { Size itupsz = IndexTupleSize(itup); @@ -2031,6 +2135,8 @@ _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, currItem->heapTid = *heapTid; currItem->indexOffset = offnum; + currItem->visrecheck = TMVC_Unchecked; + if (so->currTuples) { /* Save base IndexTuple (truncate posting list) */ @@ -2067,6 +2173,7 @@ _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, currItem->heapTid = *heapTid; currItem->indexOffset = offnum; + currItem->visrecheck = TMVC_Unchecked; /* * Have index-only scans return the same base IndexTuple for every TID @@ -2092,6 +2199,14 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so) /* Return next item, per amgettuple contract */ scan->xs_heaptid = currItem->heapTid; + + if (scan->xs_want_itup) + { + scan->xs_visrecheck = currItem->visrecheck; + Assert(currItem->visrecheck != TMVC_Unchecked || + BufferIsValid(so->currPos.buf)); + } + if (so->currTuples) scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); } @@ -2250,7 +2365,7 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) * so->currPos.buf in preparation for btgettuple returning tuples. 
*/ Assert(BTScanPosIsPinned(so->currPos)); - _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + _bt_drop_lock_and_maybe_pin(scan, &so->currPos, so); return true; } @@ -2407,7 +2522,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, */ Assert(so->currPos.currPage == blkno); Assert(BTScanPosIsPinned(so->currPos)); - _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + _bt_drop_lock_and_maybe_pin(scan, &so->currPos, so); return true; } diff --git a/src/backend/access/spgist/spgscan.c b/src/backend/access/spgist/spgscan.c index 25893050c587..42784fdc4cac 100644 --- a/src/backend/access/spgist/spgscan.c +++ b/src/backend/access/spgist/spgscan.c @@ -30,7 +30,8 @@ typedef void (*storeRes_func) (SpGistScanOpaque so, ItemPointer heapPtr, Datum leafValue, bool isNull, SpGistLeafTuple leafTuple, bool recheck, - bool recheckDistances, double *distances); + bool recheckDistances, double *distances, + TMVC_Result visrecheck); /* * Pairing heap comparison function for the SpGistSearchItem queue. @@ -142,6 +143,7 @@ spgAddStartItem(SpGistScanOpaque so, bool isnull) startEntry->traversalValue = NULL; startEntry->recheck = false; startEntry->recheckDistances = false; + startEntry->visrecheck = TMVC_Unchecked; spgAddSearchItemToQueue(so, startEntry); } @@ -386,6 +388,19 @@ spgrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, if (scankey && scan->numberOfKeys > 0) memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData)); + /* prepare index-only scan requirements */ + so->nReorderThisPage = 0; + if (scan->xs_want_itup) + { + if (so->visrecheck == NULL) + so->visrecheck = palloc(MaxIndexTuplesPerPage); + + if (scan->numberOfOrderBys > 0 && so->items == NULL) + { + so->items = palloc_array(SpGistSearchItem *, MaxIndexTuplesPerPage); + } + } + /* initialize order-by data if needed */ if (orderbys && scan->numberOfOrderBys > 0) { @@ -453,6 +468,9 @@ spgendscan(IndexScanDesc scan) pfree(scan->xs_orderbynulls); } + if (BufferIsValid(so->vmbuf)) + ReleaseBuffer(so->vmbuf); + pfree(so); } @@ -502,6 +520,7 @@ spgNewHeapItem(SpGistScanOpaque so, int level, SpGistLeafTuple leafTuple, item->isLeaf = true; item->recheck = recheck; item->recheckDistances = recheckDistances; + item->visrecheck = TMVC_Unchecked; return item; } @@ -584,6 +603,14 @@ spgLeafTest(SpGistScanOpaque so, SpGistSearchItem *item, isnull, distances); + if (so->want_itup) + { + Assert(PointerIsValid(so->items)); + + so->items[so->nReorderThisPage] = heapItem; + so->nReorderThisPage++; + } + spgAddSearchItemToQueue(so, heapItem); MemoryContextSwitchTo(oldCxt); @@ -593,7 +620,7 @@ spgLeafTest(SpGistScanOpaque so, SpGistSearchItem *item, /* non-ordered scan, so report the item right away */ Assert(!recheckDistances); storeRes(so, &leafTuple->heapPtr, leafValue, isnull, - leafTuple, recheck, false, NULL); + leafTuple, recheck, false, NULL, TMVC_Unchecked); *reportedSome = true; } } @@ -806,6 +833,88 @@ spgTestLeafTuple(SpGistScanOpaque so, return SGLT_GET_NEXTOFFSET(leafTuple); } +/* + * Pupulate so->visrecheck based on tuples which are cached for a currently + * pinned page. 
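+ *
+ * This must happen before the leaf page's pin is dropped: spgvacuumpage()
+ * now takes a cleanup lock, so holding the pin prevents VACUUM from
+ * removing the referenced index tuples (and subsequently letting their
+ * heap pages become all-visible) while these cached TIDs still await
+ * their visibility checks.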
+ */ +static void +spgPopulateUnorderedVischecks(IndexScanDesc scan, SpGistScanOpaqueData *so) +{ + TM_IndexVisibilityCheckOp op; + + Assert(scan->numberOfOrderBys == 0); + + if (so->nPtrs == 0) + return; + + op.checkntids = so->nPtrs; + op.checktids = palloc_array(TM_VisCheck, so->nPtrs); + op.vmbuf = &so->vmbuf; + + for (int i = 0; i < op.checkntids; i++) + { + Assert(ItemPointerIsValid(&so->heapPtrs[i])); + + PopulateTMVischeck(&op.checktids[i], &so->heapPtrs[i], i); + } + + table_index_vischeck_tuples(scan->heapRelation, &op); + + for (int i = 0; i < op.checkntids; i++) + { + TM_VisCheck *check = &op.checktids[i]; + + Assert(check->tidblkno == ItemPointerGetBlockNumberNoCheck(&so->heapPtrs[check->idxoffnum])); + Assert(check->tidoffset == ItemPointerGetOffsetNumberNoCheck(&so->heapPtrs[check->idxoffnum])); + Assert(check->idxoffnum < op.checkntids); + + so->visrecheck[check->idxoffnum] = check->vischeckresult; + } + + pfree(op.checktids); +} + +/* pupulate so->visrecheck based on current cached tuples */ +static void +spgPopulateOrderedVisChecks(IndexScanDesc scan, SpGistScanOpaqueData *so) +{ + TM_IndexVisibilityCheckOp op; + + if (so->nReorderThisPage == 0) + return; + + Assert(so->nReorderThisPage > 0); + Assert(scan->numberOfOrderBys > 0); + Assert(PointerIsValid(so->items)); + + op.checkntids = so->nReorderThisPage; + op.checktids = palloc_array(TM_VisCheck, so->nReorderThisPage); + op.vmbuf = &so->vmbuf; + + for (int i = 0; i < op.checkntids; i++) + { + PopulateTMVischeck(&op.checktids[i], &so->items[i]->heapPtr, i); + Assert(ItemPointerIsValid(&so->items[i]->heapPtr)); + Assert(so->items[i]->isLeaf); + } + + table_index_vischeck_tuples(scan->heapRelation, &op); + + for (int i = 0; i < op.checkntids; i++) + { + TM_VisCheck *check = &op.checktids[i]; + + Assert(check->idxoffnum < op.checkntids); + Assert(check->tidblkno == ItemPointerGetBlockNumberNoCheck(&so->items[check->idxoffnum]->heapPtr)); + Assert(check->tidoffset == ItemPointerGetOffsetNumberNoCheck(&so->items[check->idxoffnum]->heapPtr)); + + so->items[check->idxoffnum]->visrecheck = check->vischeckresult; + } + + pfree(op.checktids); + so->nReorderThisPage = 0; +} + /* * Walk the tree and report all tuples passing the scan quals to the storeRes * subroutine. @@ -814,8 +923,8 @@ spgTestLeafTuple(SpGistScanOpaque so, * next page boundary once we have reported at least one tuple. */ static void -spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex, - storeRes_func storeRes) +spgWalk(IndexScanDesc scan, Relation index, SpGistScanOpaque so, + bool scanWholeIndex, storeRes_func storeRes) { Buffer buffer = InvalidBuffer; bool reportedSome = false; @@ -835,9 +944,23 @@ spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex, { /* We store heap items in the queue only in case of ordered search */ Assert(so->numberOfNonNullOrderBys > 0); + + /* + * If an item we found on a page is retrieved immediately after + * processing that page, we won't yet have released the page pin, + * and thus won't yet have processed the visibility data of the + * page's (now) ordered tuples. + * Do that now, so that all tuples on the page we're about to + * unpin were checked for visibility before we returned any. 
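+ * (When we move on to a different leaf page, or finish the walk, the same
+ * work is done for both the ordered and unordered paths before the buffer
+ * is released; see below.)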
+ */ + if (so->want_itup && so->nReorderThisPage) + spgPopulateOrderedVisChecks(scan, so); + + Assert(!so->want_itup || item->visrecheck != TMVC_Unchecked); storeRes(so, &item->heapPtr, item->value, item->isNull, item->leafTuple, item->recheck, - item->recheckDistances, item->distances); + item->recheckDistances, item->distances, + item->visrecheck); reportedSome = true; } else @@ -854,7 +977,15 @@ spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex, } else if (blkno != BufferGetBlockNumber(buffer)) { - UnlockReleaseBuffer(buffer); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + Assert(so->numberOfOrderBys >= 0); + if (so->numberOfOrderBys == 0) + spgPopulateUnorderedVischecks(scan, so); + else + spgPopulateOrderedVisChecks(scan, so); + + ReleaseBuffer(buffer); buffer = ReadBuffer(index, blkno); LockBuffer(buffer, BUFFER_LOCK_SHARE); } @@ -922,16 +1053,36 @@ spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex, } if (buffer != InvalidBuffer) - UnlockReleaseBuffer(buffer); -} + { + /* Unlock the buffer for concurrent accesses except VACUUM */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + /* + * If we're in an index-only scan, pre-check visibility of the tuples, + * so we can drop the pin without causing visibility bugs. + */ + if (so->want_itup) + { + Assert(scan->numberOfOrderBys >= 0); + + if (scan->numberOfOrderBys == 0) + spgPopulateUnorderedVischecks(scan, so); + else + spgPopulateOrderedVisChecks(scan, so); + } + + /* Release the page */ + ReleaseBuffer(buffer); + } +} /* storeRes subroutine for getbitmap case */ static void storeBitmap(SpGistScanOpaque so, ItemPointer heapPtr, Datum leafValue, bool isnull, SpGistLeafTuple leafTuple, bool recheck, - bool recheckDistances, double *distances) + bool recheckDistances, double *distances, + TMVC_Result visres) { Assert(!recheckDistances && !distances); tbm_add_tuples(so->tbm, heapPtr, 1, recheck); @@ -949,7 +1100,7 @@ spggetbitmap(IndexScanDesc scan, TIDBitmap *tbm) so->tbm = tbm; so->ntids = 0; - spgWalk(scan->indexRelation, so, true, storeBitmap); + spgWalk(scan, scan->indexRelation, so, true, storeBitmap); return so->ntids; } @@ -959,12 +1110,15 @@ static void storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr, Datum leafValue, bool isnull, SpGistLeafTuple leafTuple, bool recheck, - bool recheckDistances, double *nonNullDistances) + bool recheckDistances, double *nonNullDistances, + TMVC_Result visres) { Assert(so->nPtrs < MaxIndexTuplesPerPage); so->heapPtrs[so->nPtrs] = *heapPtr; so->recheck[so->nPtrs] = recheck; so->recheckDistances[so->nPtrs] = recheckDistances; + if (so->want_itup) + so->visrecheck[so->nPtrs] = visres; if (so->numberOfOrderBys > 0) { @@ -1041,6 +1195,10 @@ spggettuple(IndexScanDesc scan, ScanDirection dir) scan->xs_heaptid = so->heapPtrs[so->iPtr]; scan->xs_recheck = so->recheck[so->iPtr]; scan->xs_hitup = so->reconTups[so->iPtr]; + if (so->want_itup) + scan->xs_visrecheck = so->visrecheck[so->iPtr]; + + Assert(!scan->xs_want_itup || scan->xs_visrecheck != TMVC_Unchecked); if (so->numberOfOrderBys > 0) index_store_float8_orderby_distances(scan, so->orderByTypes, @@ -1070,7 +1228,7 @@ spggettuple(IndexScanDesc scan, ScanDirection dir) } so->iPtr = so->nPtrs = 0; - spgWalk(scan->indexRelation, so, false, storeGettuple); + spgWalk(scan, scan->indexRelation, so, false, storeGettuple); if (so->nPtrs == 0) break; /* must have completed scan */ diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index 2678f7ab7829..a29c3d798699 100644 --- 
a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -625,7 +625,7 @@ spgvacuumpage(spgBulkDeleteState *bds, Buffer buffer) BlockNumber blkno = BufferGetBlockNumber(buffer); Page page; - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + LockBufferForCleanup(buffer); page = (Page) BufferGetPage(buffer); if (PageIsNew(page)) diff --git a/src/backend/access/table/tableamapi.c b/src/backend/access/table/tableamapi.c index 476663b66aad..b3ce90ceaeaa 100644 --- a/src/backend/access/table/tableamapi.c +++ b/src/backend/access/table/tableamapi.c @@ -61,6 +61,7 @@ GetTableAmRoutine(Oid amhandler) Assert(routine->tuple_get_latest_tid != NULL); Assert(routine->tuple_satisfies_snapshot != NULL); Assert(routine->index_delete_tuples != NULL); + Assert(routine->index_vischeck_tuples != NULL); Assert(routine->tuple_insert != NULL); diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index f464cca9507a..e02fc1652ffd 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -121,6 +121,7 @@ IndexOnlyNext(IndexOnlyScanState *node) while ((tid = index_getnext_tid(scandesc, direction)) != NULL) { bool tuple_from_heap = false; + TMVC_Result vischeck = scandesc->xs_visrecheck; CHECK_FOR_INTERRUPTS(); @@ -128,6 +129,9 @@ IndexOnlyNext(IndexOnlyScanState *node) * We can skip the heap fetch if the TID references a heap page on * which all tuples are known visible to everybody. In any case, * we'll use the index tuple not the heap tuple as the data source. + * The index may have already pre-checked the visibility of the tuple + * for us, and stored the result in xs_visrecheck, in which case we + * can skip the call. * * Note on Memory Ordering Effects: visibilitymap_get_status does not * lock the visibility map buffer, and therefore the result we read @@ -157,37 +161,60 @@ IndexOnlyNext(IndexOnlyScanState *node) * * It's worth going through this complexity to avoid needing to lock * the VM buffer, which could cause significant contention. + * + * The index doing these checks for us doesn't materially change these + * considerations. */ - if (!VM_ALL_VISIBLE(scandesc->heapRelation, - ItemPointerGetBlockNumber(tid), - &node->ioss_VMBuffer)) - { - /* - * Rats, we have to visit the heap to check visibility. - */ - InstrCountTuples2(node, 1); - if (!index_fetch_heap(scandesc, node->ioss_TableSlot)) - continue; /* no visible tuple, try next index entry */ + if (vischeck == TMVC_Unchecked) + vischeck = table_index_vischeck_tuple(scandesc->heapRelation, + &node->ioss_VMBuffer, + tid); - ExecClearTuple(node->ioss_TableSlot); - - /* - * Only MVCC snapshots are supported here, so there should be no - * need to keep following the HOT chain once a visible entry has - * been found. If we did want to allow that, we'd need to keep - * more state to remember not to call index_getnext_tid next time. - */ - if (scandesc->xs_heap_continue) - elog(ERROR, "non-MVCC snapshots are not supported in index-only scans"); + Assert(vischeck != TMVC_Unchecked); - /* - * Note: at this point we are holding a pin on the heap page, as - * recorded in scandesc->xs_cbuf. We could release that pin now, - * but it's not clear whether it's a win to do so. The next index - * entry might require a visit to the same heap page. 
- */ - - tuple_from_heap = true; + switch (vischeck) + { + case TMVC_Unchecked: + elog(ERROR, "Failed to check visibility for tuple"); + /* + * In case of compilers that don't undertand that elog(ERROR) + * doens't exit, and which have -Wimplicit-fallthrough: + */ + /* fallthrough */ + case TMVC_MaybeVisible: + { + /* + * Rats, we have to visit the heap to check visibility. + */ + InstrCountTuples2(node, 1); + if (!index_fetch_heap(scandesc, node->ioss_TableSlot)) + continue; /* no visible tuple, try next index entry */ + + ExecClearTuple(node->ioss_TableSlot); + + /* + * Only MVCC snapshots are supported here, so there should be + * no need to keep following the HOT chain once a visible + * entry has been found. If we did want to allow that, we'd + * need to keep more state to remember not to call + * index_getnext_tid next time. + */ + if (scandesc->xs_heap_continue) + elog(ERROR, "non-MVCC snapshots are not supported in index-only scans"); + + /* + * Note: at this point we are holding a pin on the heap page, + * as recorded in scandesc->xs_cbuf. We could release that + * pin now, but it's not clear whether it's a win to do so. + * The next index entry might require a visit to the same heap + * page. + */ + + tuple_from_heap = true; + break; + } + case TMVC_Visible: + break; } /* diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index a96b1b9c0bc6..035bd7a82be0 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -6730,44 +6730,62 @@ get_actual_variable_endpoint(Relation heapRel, while ((tid = index_getnext_tid(index_scan, indexscandir)) != NULL) { BlockNumber block = ItemPointerGetBlockNumber(tid); + TMVC_Result visres = index_scan->xs_visrecheck; - if (!VM_ALL_VISIBLE(heapRel, - block, - &vmbuffer)) + if (visres == TMVC_Unchecked) + visres = table_index_vischeck_tuple(heapRel, &vmbuffer, tid); + + Assert(visres != TMVC_Unchecked); + + switch (visres) { - /* Rats, we have to visit the heap to check visibility */ - if (!index_fetch_heap(index_scan, tableslot)) - { + case TMVC_Unchecked: + elog(ERROR, "Failed to check visibility for tuple"); /* - * No visible tuple for this index entry, so we need to - * advance to the next entry. Before doing so, count heap - * page fetches and give up if we've done too many. - * - * We don't charge a page fetch if this is the same heap page - * as the previous tuple. This is on the conservative side, - * since other recently-accessed pages are probably still in - * buffers too; but it's good enough for this heuristic. + * In case of compilers that don't undertand that elog(ERROR) + * doens't exit, and which have -Wimplicit-fallthrough: */ + /* fallthrough */ + case TMVC_MaybeVisible: + { + /* Rats, we have to visit the heap to check visibility */ + if (!index_fetch_heap(index_scan, tableslot)) + { + /* + * No visible tuple for this index entry, so we need to + * advance to the next entry. Before doing so, count heap + * page fetches and give up if we've done too many. + * + * We don't charge a page fetch if this is the same heap + * page as the previous tuple. This is on the + * conservative side, since other recently-accessed pages + * are probably still in buffers too; but it's good enough + * for this heuristic. 
+ */ #define VISITED_PAGES_LIMIT 100 - if (block != last_heap_block) - { - last_heap_block = block; - n_visited_heap_pages++; - if (n_visited_heap_pages > VISITED_PAGES_LIMIT) - break; - } + if (block != last_heap_block) + { + last_heap_block = block; + n_visited_heap_pages++; + if (n_visited_heap_pages > VISITED_PAGES_LIMIT) + break; + } - continue; /* no visible tuple, try next index entry */ - } + continue; /* no visible tuple, try next index entry */ + } - /* We don't actually need the heap tuple for anything */ - ExecClearTuple(tableslot); + /* We don't actually need the heap tuple for anything */ + ExecClearTuple(tableslot); - /* - * We don't care whether there's more than one visible tuple in - * the HOT chain; if any are visible, that's good enough. - */ + /* + * We don't care whether there's more than one visible tuple in + * the HOT chain; if any are visible, that's good enough. + */ + break; + } + case TMVC_Visible: + break; } /* diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 39404ec7cdb6..a4bc344381c7 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -22,6 +22,7 @@ #include "storage/buffile.h" #include "utils/hsearch.h" #include "access/genam.h" +#include "tableam.h" /* * Maximum number of "halves" a page can be split into in one operation. @@ -124,6 +125,8 @@ typedef struct GISTSearchHeapItem * index-only scans */ OffsetNumber offnum; /* track offset in page to mark tuple as * LP_DEAD */ + uint8 visrecheck; /* Cached visibility check result for this + * heap pointer. */ } GISTSearchHeapItem; /* Unvisited item, either index page or heap tuple */ @@ -170,12 +173,24 @@ typedef struct GISTScanOpaqueData BlockNumber curBlkno; /* current number of block */ GistNSN curPageLSN; /* pos in the WAL stream when page was read */ - /* In a non-ordered search, returnable heap items are stored here: */ - GISTSearchHeapItem pageData[BLCKSZ / sizeof(IndexTupleData)]; - OffsetNumber nPageData; /* number of valid items in array */ - OffsetNumber curPageData; /* next item to return */ - MemoryContext pageDataCxt; /* context holding the fetched tuples, for - * index-only scans */ + /* info used by Index-Only Scans */ + Buffer vmbuf; /* reusable buffer for IOS' vm lookups */ + + union { + struct { + /* In a non-ordered search, returnable heap items are stored here: */ + GISTSearchHeapItem pageData[BLCKSZ / sizeof(IndexTupleData)]; + OffsetNumber nPageData; /* number of valid items in array */ + OffsetNumber curPageData; /* next item to return */ + MemoryContext pageDataCxt; /* context holding the fetched tuples, + * for index-only scans */ + } nos; + struct { + /* In an ordered search, we use this as scratch space */ + GISTSearchHeapItem *sortData[BLCKSZ / sizeof(IndexTupleData)]; + OffsetNumber nsortData; /* number of items in sortData */ + } os; + }; } GISTScanOpaqueData; typedef GISTScanOpaqueData *GISTScanOpaque; diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index e48fe434cd39..1b66aa0bacc6 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -368,6 +368,8 @@ extern void simple_heap_update(Relation relation, ItemPointer otid, extern TransactionId heap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate); +extern void heap_index_vischeck_tuples(Relation rel, + TM_IndexVisibilityCheckOp *checkop); /* in heap/pruneheap.c */ struct GlobalVisState; diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index ebca02588d3e..f9ced4a1f0bc 100644 --- 
a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -957,6 +957,7 @@ typedef struct BTScanPosItem /* what we remember about each match */ ItemPointerData heapTid; /* TID of referenced heap item */ OffsetNumber indexOffset; /* index item's location within page */ LocationIndex tupleOffset; /* IndexTuple's offset in workspace, if any */ + uint8 visrecheck; /* visibility recheck status, if any */ } BTScanPosItem; typedef struct BTScanPosData @@ -1071,6 +1072,11 @@ typedef struct BTScanOpaqueData int *killedItems; /* currPos.items indexes of killed items */ int numKilled; /* number of currently stored items */ + /* used for index-only scan visibility prechecks */ + Buffer vmbuf; /* vm buffer */ + int vischeckcap; /* capacity of vischeckbuf */ + TM_VisCheck *vischecksbuf; /* single allocation to save on alloc overhead */ + /* * If we are doing an index-only scan, these are the tuple storage * workspaces for the currPos and markPos respectively. Each is of size diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index b5e0fb386c0a..93a6f65ab0e3 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -26,6 +26,9 @@ struct ParallelTableScanDescData; +enum TMVC_Result; + + /* * Generic descriptor for table scans. This is the base-class for table scans, * which needs to be embedded in the scans of individual AMs. @@ -176,6 +179,8 @@ typedef struct IndexScanDescData bool xs_recheck; /* T means scan keys must be rechecked */ + int xs_visrecheck; /* TM_VisCheckResult from tableam.h */ + /* * When fetching with an ordering operator, the values of the ORDER BY * expressions of the last returned tuple, according to the index. If diff --git a/src/include/access/spgist_private.h b/src/include/access/spgist_private.h index cb43a278f466..63e970468c7d 100644 --- a/src/include/access/spgist_private.h +++ b/src/include/access/spgist_private.h @@ -21,6 +21,7 @@ #include "storage/buf.h" #include "utils/geo_decls.h" #include "utils/relcache.h" +#include "tableam.h" typedef struct SpGistOptions @@ -175,7 +176,7 @@ typedef struct SpGistSearchItem bool isLeaf; /* SearchItem is heap item */ bool recheck; /* qual recheck is needed */ bool recheckDistances; /* distance recheck is needed */ - + uint8 visrecheck; /* IOS: TMVC_Result of contained heap tuple */ /* array with numberOfOrderBys entries */ double distances[FLEXIBLE_ARRAY_MEMBER]; } SpGistSearchItem; @@ -223,6 +224,7 @@ typedef struct SpGistScanOpaqueData /* These fields are only used in amgettuple scans: */ bool want_itup; /* are we reconstructing tuples? 
*/ + Buffer vmbuf; /* IOS: used for table_index_vischeck_tuples */ TupleDesc reconTupDesc; /* if so, descriptor for reconstructed tuples */ int nPtrs; /* number of TIDs found on current page */ int iPtr; /* index for scanning through same */ @@ -235,6 +237,11 @@ typedef struct SpGistScanOpaqueData /* distances (for recheck) */ IndexOrderByDistance *distances[MaxIndexTuplesPerPage]; + /* support for IOS */ + int nReorderThisPage; + uint8 *visrecheck; /* IOS vis check results, counted by nPtrs */ + SpGistSearchItem **items; /* counted by nReorderThisPage */ + /* * Note: using MaxIndexTuplesPerPage above is a bit hokey since * SpGistLeafTuples aren't exactly IndexTuples; however, they are larger, diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 8713e12cbfb9..47666cf96eaa 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -248,6 +248,63 @@ typedef struct TM_IndexDeleteOp TM_IndexStatus *status; } TM_IndexDeleteOp; +/* + * State used when calling table_index_vischeck_tuples() + * + * Index-only scans need to know the visibility of the associated table tuples + * before they can return the index tuple. If the index tuple is known to be + * visible with a cheap check, we can return it directly without requesting + * the visibility info from the table AM. + * + * This table AM API exposes cheap visibility checks to index AMs, letting + * them look up visibility info for multiple tuples at once and store the + * results. That improves the pinning ergonomics of index AMs: a scan can + * cache index tuples in memory without holding pins on those tuples' pages + * until the tuples have been returned. + * + * The AM is called with a list of TIDs, and its output will indicate the + * visibility state of each tuple: Unchecked, Dead, MaybeVisible, or Visible. + * + * HeapAM's implementation of visibility maps only allows for cheap checks of + * *definitely visible*; all other results are *maybe visible*. A result for + * *definitely not visible* (i.e. dead) is currently not provided, for lack of + * table AMs that could support such visibility lookups cheaply.
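+ *
+ * A minimal usage sketch, patterned on the nbtree/GiST/SP-GiST callers in
+ * this patch ("so->vmbuf", "tids", and "heapRelation" stand in for the
+ * index AM's own scan state):
+ *
+ *     TM_IndexVisibilityCheckOp op;
+ *
+ *     op.checkntids = ntids;
+ *     op.checktids = palloc_array(TM_VisCheck, ntids);
+ *     op.vmbuf = &so->vmbuf;
+ *     for (int i = 0; i < ntids; i++)
+ *         PopulateTMVischeck(&op.checktids[i], &tids[i], i);
+ *     table_index_vischeck_tuples(heapRelation, &op);
+ *
+ * On return, op.checktids[i].vischeckresult holds each tuple's visibility
+ * state, and *op.vmbuf retains a pin on the last-used VM page for reuse by
+ * later calls (the index AM releases it at endscan).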
+ */ +typedef enum TMVC_Result +{ + TMVC_Unchecked, + TMVC_Visible, + TMVC_MaybeVisible, +} TMVC_Result; + +typedef struct TM_VisCheck +{ + /* table TID from index tuple */ + BlockNumber tidblkno; + uint16 tidoffset; + /* identifier for the TID in this visibility check operation context */ + OffsetNumber idxoffnum; + /* the result of the visibility check operation */ + TMVC_Result vischeckresult; +} TM_VisCheck; + +static inline void +PopulateTMVischeck(TM_VisCheck *check, ItemPointer tid, OffsetNumber idxoff) +{ + Assert(ItemPointerIsValid(tid)); + check->tidblkno = ItemPointerGetBlockNumberNoCheck(tid); + check->tidoffset = ItemPointerGetOffsetNumberNoCheck(tid); + check->idxoffnum = idxoff; + check->vischeckresult = TMVC_Unchecked; +} + +typedef struct TM_IndexVisibilityCheckOp +{ + int checkntids; /* number of TIDs to check */ + Buffer *vmbuf; /* pointer to VM buffer to reuse across calls */ + TM_VisCheck *checktids; /* the checks to execute */ +} TM_IndexVisibilityCheckOp; + /* "options" flag bits for table_tuple_insert */ /* TABLE_INSERT_SKIP_WAL was 0x0001; RelationNeedsWAL() now governs */ #define TABLE_INSERT_SKIP_FSM 0x0002 @@ -494,6 +551,10 @@ typedef struct TableAmRoutine TransactionId (*index_delete_tuples) (Relation rel, TM_IndexDeleteOp *delstate); + /* see table_index_vischeck_tuples() */ + void (*index_vischeck_tuples) (Relation rel, + TM_IndexVisibilityCheckOp *checkop); + /* ------------------------------------------------------------------------ * Manipulations of physical tuples. @@ -1318,6 +1379,48 @@ table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate) return rel->rd_tableam->index_delete_tuples(rel, delstate); } +/* + * Determine rough visibility information of index tuples based on each TID. + * + * Determines which entries from index AM caller's TM_IndexVisibilityCheckOp + * state point to TMVC_VISIBLE or TMVC_MAYBE_VISIBLE table tuples, at low IO + * overhead. For the heap AM, the implementation is effectively a wrapper + * around VM_ALL_FROZEN. + * + * On return, all TM_VisChecks indicated by checkop->checktids will have been + * updated with the correct visibility status. + * + * Note that there is no value for "definitely dead" tuples, as the Heap AM + * doesn't have an efficient method to determine that a tuple is dead to all + * users, as it would have to go into the heap. If and when AMs are built + * that would support VM checks with an equivalent to VM_ALL_DEAD this + * decision can be reconsidered. + */ +static inline void +table_index_vischeck_tuples(Relation rel, TM_IndexVisibilityCheckOp *checkop) +{ + return rel->rd_tableam->index_vischeck_tuples(rel, checkop); +} + +static inline TMVC_Result +table_index_vischeck_tuple(Relation rel, Buffer *vmbuffer, ItemPointer tid) +{ + TM_IndexVisibilityCheckOp checkOp; + TM_VisCheck op; + + PopulateTMVischeck(&op, tid, 0); + + checkOp.checktids = &op; + checkOp.checkntids = 1; + checkOp.vmbuf = vmbuffer; + + rel->rd_tableam->index_vischeck_tuples(rel, &checkOp); + + Assert(op.vischeckresult != TMVC_Unchecked); + + return op.vischeckresult; +} + /* ---------------------------------------------------------------------------- * Functions for manipulations of physical tuples. 
diff --git a/src/include/access/visibilitymapdefs.h b/src/include/access/visibilitymapdefs.h index 5ad5c0208779..c75303f63fd4 100644 --- a/src/include/access/visibilitymapdefs.h +++ b/src/include/access/visibilitymapdefs.h @@ -12,6 +12,7 @@ */ #ifndef VISIBILITYMAPDEFS_H #define VISIBILITYMAPDEFS_H +#include "storage/bufpage.h" /* Number of bits for one heap page */ #define BITS_PER_HEAPBLOCK 2 @@ -31,4 +32,22 @@ #define VISIBILITYMAP_XLOG_CATALOG_REL 0x04 #define VISIBILITYMAP_XLOG_VALID_BITS (VISIBILITYMAP_VALID_BITS | VISIBILITYMAP_XLOG_CATALOG_REL) +/* + * Size of the bitmap on each visibility map page, in bytes. There's no + * extra headers, so the whole page minus the standard page header is + * used for the bitmap. + */ +#define VM_MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData)) + +/* Number of heap blocks we can represent in one byte */ +#define VM_HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK) + +/* Number of heap blocks we can represent in one visibility map page. */ +#define VM_HEAPBLOCKS_PER_PAGE (VM_MAPSIZE * VM_HEAPBLOCKS_PER_BYTE) + +/* Mapping from heap block number to the right bit in the visibility map */ +#define HEAPBLK_TO_VMBLOCK(x) ((x) / VM_HEAPBLOCKS_PER_PAGE) +#define HEAPBLK_TO_VMBYTE(x) (((x) % VM_HEAPBLOCKS_PER_PAGE) / VM_HEAPBLOCKS_PER_BYTE) +#define HEAPBLK_TO_VMOFFSET(x) (((x) % VM_HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK) + #endif /* VISIBILITYMAPDEFS_H */ diff --git a/src/test/isolation/expected/index-only-scan-btree-vacuum.out b/src/test/isolation/expected/index-only-scan-btree-vacuum.out new file mode 100644 index 000000000000..9a9d94c86f6f --- /dev/null +++ b/src/test/isolation/expected/index-only-scan-btree-vacuum.out @@ -0,0 +1,59 @@ +Parsed test spec with 2 sessions + +starting permutation: s2_mod s1_begin s1_prepare_sorted_asc s1_fetch_1 s2_vacuum s1_fetch_all s1_commit +step s2_mod: + DELETE FROM ios_needs_cleanup_lock WHERE a BETWEEN 2 AND 9; + +step s1_begin: BEGIN; +step s1_prepare_sorted_asc: + DECLARE foo NO SCROLL CURSOR FOR SELECT a as x FROM ios_needs_cleanup_lock ORDER BY a ASC; + +step s1_fetch_1: + FETCH FROM foo; + +x +- +1 +(1 row) + +step s2_vacuum: + VACUUM (TRUNCATE false) ios_needs_cleanup_lock; + +step s1_fetch_all: + FETCH ALL FROM foo; + + x +-- +10 +(1 row) + +step s1_commit: COMMIT; + +starting permutation: s2_mod s1_begin s1_prepare_sorted_desc s1_fetch_1 s2_vacuum s1_fetch_all s1_commit +step s2_mod: + DELETE FROM ios_needs_cleanup_lock WHERE a BETWEEN 2 AND 9; + +step s1_begin: BEGIN; +step s1_prepare_sorted_desc: + DECLARE foo NO SCROLL CURSOR FOR SELECT a as x FROM ios_needs_cleanup_lock ORDER BY a DESC; + +step s1_fetch_1: + FETCH FROM foo; + + x +-- +10 +(1 row) + +step s2_vacuum: + VACUUM (TRUNCATE false) ios_needs_cleanup_lock; + +step s1_fetch_all: + FETCH ALL FROM foo; + +x +- +1 +(1 row) + +step s1_commit: COMMIT; diff --git a/src/test/isolation/expected/index-only-scan-gist-vacuum.out b/src/test/isolation/expected/index-only-scan-gist-vacuum.out new file mode 100644 index 000000000000..b7c02ee95292 --- /dev/null +++ b/src/test/isolation/expected/index-only-scan-gist-vacuum.out @@ -0,0 +1,53 @@ +Parsed test spec with 2 sessions + +starting permutation: s2_mod s1_begin s1_prepare_sorted s1_fetch_1 s2_vacuum s1_fetch_all s1_commit +step s2_mod: + DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)'; + +step s1_begin: BEGIN; +step s1_prepare_sorted: + DECLARE foo NO SCROLL CURSOR FOR SELECT a <-> point '(0,0)' as x FROM ios_needs_cleanup_lock ORDER BY a <-> point '(0,0)'; + +step s1_fetch_1: + FETCH 
FROM foo; + + x +------------------ +1.4142135623730951 +(1 row) + +step s2_vacuum: VACUUM (TRUNCATE false) ios_needs_cleanup_lock; +step s1_fetch_all: + FETCH ALL FROM foo; + +x +- +(0 rows) + +step s1_commit: COMMIT; + +starting permutation: s2_mod s1_begin s1_prepare_unsorted s1_fetch_1 s2_vacuum s1_fetch_all s1_commit +step s2_mod: + DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)'; + +step s1_begin: BEGIN; +step s1_prepare_unsorted: + DECLARE foo NO SCROLL CURSOR FOR SELECT a FROM ios_needs_cleanup_lock WHERE box '((-100,-100),(100,100))' @> a; + +step s1_fetch_1: + FETCH FROM foo; + +a +----- +(1,1) +(1 row) + +step s2_vacuum: VACUUM (TRUNCATE false) ios_needs_cleanup_lock; +step s1_fetch_all: + FETCH ALL FROM foo; + +a +- +(0 rows) + +step s1_commit: COMMIT; diff --git a/src/test/isolation/expected/index-only-scan-spgist-vacuum.out b/src/test/isolation/expected/index-only-scan-spgist-vacuum.out new file mode 100644 index 000000000000..b7c02ee95292 --- /dev/null +++ b/src/test/isolation/expected/index-only-scan-spgist-vacuum.out @@ -0,0 +1,53 @@ +Parsed test spec with 2 sessions + +starting permutation: s2_mod s1_begin s1_prepare_sorted s1_fetch_1 s2_vacuum s1_fetch_all s1_commit +step s2_mod: + DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)'; + +step s1_begin: BEGIN; +step s1_prepare_sorted: + DECLARE foo NO SCROLL CURSOR FOR SELECT a <-> point '(0,0)' as x FROM ios_needs_cleanup_lock ORDER BY a <-> point '(0,0)'; + +step s1_fetch_1: + FETCH FROM foo; + + x +------------------ +1.4142135623730951 +(1 row) + +step s2_vacuum: VACUUM (TRUNCATE false) ios_needs_cleanup_lock; +step s1_fetch_all: + FETCH ALL FROM foo; + +x +- +(0 rows) + +step s1_commit: COMMIT; + +starting permutation: s2_mod s1_begin s1_prepare_unsorted s1_fetch_1 s2_vacuum s1_fetch_all s1_commit +step s2_mod: + DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)'; + +step s1_begin: BEGIN; +step s1_prepare_unsorted: + DECLARE foo NO SCROLL CURSOR FOR SELECT a FROM ios_needs_cleanup_lock WHERE box '((-100,-100),(100,100))' @> a; + +step s1_fetch_1: + FETCH FROM foo; + +a +----- +(1,1) +(1 row) + +step s2_vacuum: VACUUM (TRUNCATE false) ios_needs_cleanup_lock; +step s1_fetch_all: + FETCH ALL FROM foo; + +a +- +(0 rows) + +step s1_commit: COMMIT; diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule index e3c669a29c7a..69909d4d9118 100644 --- a/src/test/isolation/isolation_schedule +++ b/src/test/isolation/isolation_schedule @@ -18,6 +18,9 @@ test: two-ids test: multiple-row-versions test: index-only-scan test: index-only-bitmapscan +test: index-only-scan-btree-vacuum +test: index-only-scan-gist-vacuum +test: index-only-scan-spgist-vacuum test: predicate-lock-hot-tuple test: update-conflict-out test: deadlock-simple diff --git a/src/test/isolation/specs/index-only-scan-btree-vacuum.spec b/src/test/isolation/specs/index-only-scan-btree-vacuum.spec new file mode 100644 index 000000000000..9a00804c2c57 --- /dev/null +++ b/src/test/isolation/specs/index-only-scan-btree-vacuum.spec @@ -0,0 +1,113 @@ +# index-only-scan test showing correct results with btree even with concurrent +# vacuum + +setup +{ + -- by using a low fillfactor and a wide tuple we can get multiple blocks + -- with just few rows + CREATE TABLE ios_needs_cleanup_lock (a int NOT NULL, pad char(1024) default '') + WITH (AUTOVACUUM_ENABLED = false, FILLFACTOR = 10); + + INSERT INTO ios_needs_cleanup_lock SELECT g.i FROM generate_series(1, 10) g(i); + + CREATE INDEX ios_btree_a ON 
ios_needs_cleanup_lock USING btree (a); +} +setup +{ + VACUUM (ANALYZE) ios_needs_cleanup_lock; +} + +teardown +{ + DROP TABLE ios_needs_cleanup_lock; +} + + +session s1 + +# Force an index-only scan, where possible: +setup { + SET enable_bitmapscan = false; + SET enable_indexonlyscan = true; + SET enable_indexscan = true; +} + +step s1_begin { BEGIN; } +step s1_commit { COMMIT; } + +step s1_prepare_sorted_asc { + DECLARE foo NO SCROLL CURSOR FOR SELECT a as x FROM ios_needs_cleanup_lock ORDER BY a ASC; +} +step s1_prepare_sorted_desc { + DECLARE foo NO SCROLL CURSOR FOR SELECT a as x FROM ios_needs_cleanup_lock ORDER BY a DESC; +} + +step s1_fetch_1 { + FETCH FROM foo; +} + +step s1_fetch_all { + FETCH ALL FROM foo; +} + + +session s2 + +# Don't delete row 1 or row 10, so we have a row for each cursor to "rest" on. +step s2_mod +{ + DELETE FROM ios_needs_cleanup_lock WHERE a BETWEEN 2 AND 9; +} + +# Disable truncation, as otherwise we'll just wait for a timeout while trying +# to acquire the lock +step s2_vacuum +{ + VACUUM (TRUNCATE false) ios_needs_cleanup_lock; +} + +permutation + # delete nearly all rows, to make the issue visible + s2_mod + # create a cursor + s1_begin + s1_prepare_sorted_asc + + # fetch one row from the cursor, which ensures the index scan portion is done + # before the vacuum in the next step + s1_fetch_1 + + # with the bug, this vacuum will mark pages as all-visible that the scan in + # the next step then considers all-visible, despite all rows from those + # pages having been removed. + # Because this should block on buffer-level locks, this won't ever be + # considered "blocked" by the isolation tester, and so we only have a single + # step we can work with concurrently. + s2_vacuum + + # if this returns any rows, we're busted + s1_fetch_all + + s1_commit + +permutation + # delete nearly all rows, to make the issue visible + s2_mod + # create a cursor + s1_begin + s1_prepare_sorted_desc + + # fetch one row from the cursor, which ensures the index scan portion is done + # before the vacuum in the next step + s1_fetch_1 + + # If the index scan doesn't correctly interlock its visibility tests with + # concurrent VACUUM cleanup, then VACUUM will mark pages as all-visible that + # the scan in the next steps may then consider all-visible, despite some of + # those rows having been removed.
+ s2_vacuum + + # if this returns any rows, we're busted + s1_fetch_all + + s1_commit diff --git a/src/test/isolation/specs/index-only-scan-gist-vacuum.spec b/src/test/isolation/specs/index-only-scan-gist-vacuum.spec new file mode 100644 index 000000000000..9d241b259200 --- /dev/null +++ b/src/test/isolation/specs/index-only-scan-gist-vacuum.spec @@ -0,0 +1,112 @@ +# index-only-scan test showing correct results with GiST even with concurrent vacuum +# +setup +{ + -- by using a low fillfactor and a wide tuple we can get multiple blocks + -- with just a few rows + CREATE TABLE ios_needs_cleanup_lock (a point NOT NULL, b int not null, pad char(1024) default '') + WITH (AUTOVACUUM_ENABLED = false, FILLFACTOR = 10); + + INSERT INTO ios_needs_cleanup_lock SELECT point(g.i, g.i), g.i FROM generate_series(1, 10) g(i); + + CREATE INDEX ios_gist_a ON ios_needs_cleanup_lock USING gist(a); +} +setup +{ + VACUUM (ANALYZE) ios_needs_cleanup_lock; +} + +teardown +{ + DROP TABLE ios_needs_cleanup_lock; +} + + +session s1 + +# Force an index-only scan, where possible: +setup { + SET enable_bitmapscan = false; + SET enable_indexonlyscan = true; + SET enable_indexscan = true; +} + +step s1_begin { BEGIN; } +step s1_commit { COMMIT; } + +step s1_prepare_sorted { + DECLARE foo NO SCROLL CURSOR FOR SELECT a <-> point '(0,0)' as x FROM ios_needs_cleanup_lock ORDER BY a <-> point '(0,0)'; +} + +step s1_prepare_unsorted { + DECLARE foo NO SCROLL CURSOR FOR SELECT a FROM ios_needs_cleanup_lock WHERE box '((-100,-100),(100,100))' @> a; +} + +step s1_fetch_1 { + FETCH FROM foo; +} + +step s1_fetch_all { + FETCH ALL FROM foo; +} + + +session s2 + +# Don't delete row 1, so we have a row for the cursor to "rest" on. +step s2_mod +{ + DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)'; +} + +# Disable truncation, as otherwise we'll just wait for a timeout while trying +# to acquire the lock +step s2_vacuum { VACUUM (TRUNCATE false) ios_needs_cleanup_lock; } + +permutation + # delete nearly all rows, to make the issue visible + s2_mod + # create a cursor + s1_begin + s1_prepare_sorted + + # fetch one row from the cursor, which ensures the index scan portion is done + # before the vacuum in the next step + s1_fetch_1 + + # with the bug, this vacuum will mark pages as all-visible that the scan in + # the next step then considers all-visible, despite all rows from those + # pages having been removed. + # Because this should block on buffer-level locks, this won't ever be + # considered "blocked" by the isolation tester, and so we only have a single + # step we can work with concurrently. + s2_vacuum + + # if this returns any rows, we're busted + s1_fetch_all + + s1_commit + +permutation + # delete nearly all rows, to make the issue visible + s2_mod + # create a cursor + s1_begin + s1_prepare_unsorted + + # fetch one row from the cursor, which ensures the index scan portion is done + # before the vacuum in the next step + s1_fetch_1 + + # with the bug, this vacuum will mark pages as all-visible that the scan in + # the next step then considers all-visible, despite all rows from those + # pages having been removed. + # Because this should block on buffer-level locks, this won't ever be + # considered "blocked" by the isolation tester, and so we only have a single + # step we can work with concurrently.
+ s2_vacuum + + # if this returns any rows, we're busted + s1_fetch_all + + s1_commit diff --git a/src/test/isolation/specs/index-only-scan-spgist-vacuum.spec b/src/test/isolation/specs/index-only-scan-spgist-vacuum.spec new file mode 100644 index 000000000000..cd621d4f7f20 --- /dev/null +++ b/src/test/isolation/specs/index-only-scan-spgist-vacuum.spec @@ -0,0 +1,112 @@ +# index-only-scan test showing correct results with SPGiST even with concurrent vacuum +# +setup +{ + -- by using a low fillfactor and a wide tuple we can get multiple blocks + -- with just a few rows + CREATE TABLE ios_needs_cleanup_lock (a point NOT NULL, b int not null, pad char(1024) default '') + WITH (AUTOVACUUM_ENABLED = false, FILLFACTOR = 10); + + INSERT INTO ios_needs_cleanup_lock SELECT point(g.i, g.i), g.i FROM generate_series(1, 10) g(i); + + CREATE INDEX ios_spgist_a ON ios_needs_cleanup_lock USING spgist(a); +} +setup +{ + VACUUM (ANALYZE) ios_needs_cleanup_lock; +} + +teardown +{ + DROP TABLE ios_needs_cleanup_lock; +} + + +session s1 + +# Force an index-only scan, where possible: +setup { + SET enable_bitmapscan = false; + SET enable_indexonlyscan = true; + SET enable_indexscan = true; +} + +step s1_begin { BEGIN; } +step s1_commit { COMMIT; } + +step s1_prepare_sorted { + DECLARE foo NO SCROLL CURSOR FOR SELECT a <-> point '(0,0)' as x FROM ios_needs_cleanup_lock ORDER BY a <-> point '(0,0)'; +} + +step s1_prepare_unsorted { + DECLARE foo NO SCROLL CURSOR FOR SELECT a FROM ios_needs_cleanup_lock WHERE box '((-100,-100),(100,100))' @> a; +} + +step s1_fetch_1 { + FETCH FROM foo; +} + +step s1_fetch_all { + FETCH ALL FROM foo; +} + + +session s2 + +# Don't delete row 1, so we have a row for the cursor to "rest" on. +step s2_mod +{ + DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)'; +} + +# Disable truncation, as otherwise we'll just wait for a timeout while trying +# to acquire the lock +step s2_vacuum { VACUUM (TRUNCATE false) ios_needs_cleanup_lock; } + +permutation + # delete nearly all rows, to make the issue visible + s2_mod + # create a cursor + s1_begin + s1_prepare_sorted + + # fetch one row from the cursor, which ensures the index scan portion is done + # before the vacuum in the next step + s1_fetch_1 + + # with the bug, this vacuum will mark pages as all-visible that the scan in + # the next step then considers all-visible, despite all rows from those + # pages having been removed. + # Because this should block on buffer-level locks, this won't ever be + # considered "blocked" by the isolation tester, and so we only have a single + # step we can work with concurrently. + s2_vacuum + + # if this returns any rows, we're busted + s1_fetch_all + + s1_commit + +permutation + # delete nearly all rows, to make the issue visible + s2_mod + # create a cursor + s1_begin + s1_prepare_unsorted + + # fetch one row from the cursor, which ensures the index scan portion is done + # before the vacuum in the next step + s1_fetch_1 + + # with the bug, this vacuum will mark pages as all-visible that the scan in + # the next step then considers all-visible, despite all rows from those + # pages having been removed. + # Because this should block on buffer-level locks, this won't ever be + # considered "blocked" by the isolation tester, and so we only have a single + # step we can work with concurrently. + s2_vacuum + + # if this returns any rows, we're busted + s1_fetch_all + + s1_commit
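
Editor's aside: a quick, self-contained illustration of the HEAPBLK_TO_VMBLOCK / HEAPBLK_TO_VMBYTE / HEAPBLK_TO_VMOFFSET mapping added to visibilitymapdefs.h above. This sketch is not part of the patch; it hard-codes the default 8 kB BLCKSZ, a 24-byte MAXALIGN'd standard page header, and the existing BITS_PER_BYTE / BITS_PER_HEAPBLOCK values, so the derived constants only hold for a default build. Backend code would simply include "access/visibilitymapdefs.h" and use the macros directly.

/*
 * Standalone illustration (not part of the patch) of the HEAPBLK_TO_*
 * macros added to visibilitymapdefs.h.  The PostgreSQL constants are
 * hard-coded with their default values (8 kB blocks, 24-byte MAXALIGN'd
 * page header), so the numbers below only apply to a default build.
 */
#include <stdio.h>

#define BLCKSZ                  8192    /* default block size */
#define SIZEOF_PAGEHEADERDATA   24      /* MAXALIGN(SizeOfPageHeaderData), assumed */
#define BITS_PER_BYTE           8
#define BITS_PER_HEAPBLOCK      2       /* all-visible + all-frozen bits */

#define VM_MAPSIZE              (BLCKSZ - SIZEOF_PAGEHEADERDATA)
#define VM_HEAPBLOCKS_PER_BYTE  (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
#define VM_HEAPBLOCKS_PER_PAGE  (VM_MAPSIZE * VM_HEAPBLOCKS_PER_BYTE)

#define HEAPBLK_TO_VMBLOCK(x)   ((x) / VM_HEAPBLOCKS_PER_PAGE)
#define HEAPBLK_TO_VMBYTE(x)    (((x) % VM_HEAPBLOCKS_PER_PAGE) / VM_HEAPBLOCKS_PER_BYTE)
#define HEAPBLK_TO_VMOFFSET(x)  (((x) % VM_HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)

int
main(void)
{
	unsigned int blocks[] = {0, 1, 4, 32671, 32672, 100000};
	int			i;

	/* 8168 bitmap bytes per VM page => 32672 heap blocks per VM page */
	printf("VM_MAPSIZE=%d, heap blocks per VM page=%d\n",
		   VM_MAPSIZE, VM_HEAPBLOCKS_PER_PAGE);

	for (i = 0; i < (int) (sizeof(blocks) / sizeof(blocks[0])); i++)
		printf("heap block %6u -> VM page %u, byte %5u, bit offset %u\n",
			   blocks[i],
			   HEAPBLK_TO_VMBLOCK(blocks[i]),
			   HEAPBLK_TO_VMBYTE(blocks[i]),
			   HEAPBLK_TO_VMOFFSET(blocks[i]));
	return 0;
}

With those defaults VM_MAPSIZE comes out to 8168 bytes and VM_HEAPBLOCKS_PER_PAGE to 32672, so for example heap block 100000 maps to VM page 3, byte 496, bit offset 0.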
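
A note on the expected output of the new GiST and SP-GiST "sorted" permutations: after s2_mod only the point (1,1) remains, so the first row fetched by the ordered cursor is its distance from the origin, sqrt((1-0)^2 + (1-0)^2) = sqrt(2), which float8 prints as 1.4142135623730951, exactly the value in the .out files above; the subsequent FETCH ALL then correctly returns zero rows.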