diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c index 387d99723453..8b89629d2320 100644 --- a/src/backend/access/gist/gistget.c +++ b/src/backend/access/gist/gistget.c @@ -63,11 +63,7 @@ gistkillitems(IndexScanDesc scan) * safe. */ if (BufferGetLSNAtomic(buffer) != so->curPageLSN) - { - UnlockReleaseBuffer(buffer); - so->numKilled = 0; /* reset counter */ - return; - } + goto unlock; Assert(GistPageIsLeaf(page)); @@ -77,6 +73,12 @@ gistkillitems(IndexScanDesc scan) */ for (i = 0; i < so->numKilled; i++) { + if (!killedsomething) + { + if (!BufferPrepareToSetHintBits(buffer)) + goto unlock; + } + offnum = so->killedItems[i]; iid = PageGetItemId(page, offnum); ItemIdMarkDead(iid); @@ -86,9 +88,10 @@ gistkillitems(IndexScanDesc scan) if (killedsomething) { GistMarkPageHasGarbage(page); - MarkBufferDirtyHint(buffer, true); + BufferFinishSetHintBits(buffer, true, true); } +unlock: UnlockReleaseBuffer(buffer); /* diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index b8e5bd005e59..dd17eff59d1d 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -1029,7 +1029,7 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) zerobuf.data, true); - PageSetChecksumInplace(page, lastblock); + PageSetChecksum(page, lastblock); smgrextend(RelationGetSmgr(rel), MAIN_FORKNUM, lastblock, zerobuf.data, false); diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c index 66c39f606540..cb004432388b 100644 --- a/src/backend/access/hash/hashutil.c +++ b/src/backend/access/hash/hashutil.c @@ -593,6 +593,9 @@ _hash_kill_items(IndexScanDesc scan) if (ItemPointerEquals(&ituple->t_tid, &currItem->heapTid)) { + if (!BufferPrepareToSetHintBits(so->currPos.buf)) + goto unlock_page; + /* found the item */ ItemIdMarkDead(iid); killedsomething = true; @@ -610,9 +613,10 @@ _hash_kill_items(IndexScanDesc scan) if (killedsomething) { opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES; - MarkBufferDirtyHint(buf, true); + BufferFinishSetHintBits(so->currPos.buf, true, true); } +unlock_page: if (so->hashso_bucket_buf == so->currPos.buf || havePin) LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index cedaa195cb6f..1621f055822f 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -508,42 +508,106 @@ page_collect_tuples(HeapScanDesc scan, Snapshot snapshot, BlockNumber block, int lines, bool all_visible, bool check_serializable) { + Oid relid = RelationGetRelid(scan->rs_base.rs_rd); +#ifdef BATCHMVCC_FEWER_ARGS + BatchMVCCState batchmvcc; +#else + HeapTupleData tuples[MaxHeapTuplesPerPage]; + bool visible[MaxHeapTuplesPerPage]; +#endif int ntup = 0; - OffsetNumber lineoff; + int nvis = 0; + + /* page at a time should have been disabled otherwise */ + Assert(IsMVCCSnapshot(snapshot)); - for (lineoff = FirstOffsetNumber; lineoff <= lines; lineoff++) + /* first find all tuples on the page */ + for (OffsetNumber lineoff = FirstOffsetNumber; lineoff <= lines; lineoff++) { ItemId lpp = PageGetItemId(page, lineoff); - HeapTupleData loctup; - bool valid; + HeapTuple tup; - if (!ItemIdIsNormal(lpp)) + if (unlikely(!ItemIdIsNormal(lpp))) continue; - loctup.t_data = (HeapTupleHeader) PageGetItem(page, lpp); - loctup.t_len = ItemIdGetLength(lpp); - loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd); - ItemPointerSet(&(loctup.t_self), block, lineoff); - - if (all_visible) - 
valid = true; - else - valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); + /* + * If the page is not all-visible or we need to check serializability, + * maintain enough state to be able to refind the tuple efficiently, + * without again needing to extract it from the page. + */ + if (!all_visible || check_serializable) + { +#ifdef BATCHMVCC_FEWER_ARGS + tup = &batchmvcc.tuples[ntup]; +#else + tup = &tuples[ntup]; +#endif - if (check_serializable) - HeapCheckForSerializableConflictOut(valid, scan->rs_base.rs_rd, - &loctup, buffer, snapshot); + tup->t_data = (HeapTupleHeader) PageGetItem(page, lpp); + tup->t_len = ItemIdGetLength(lpp); + tup->t_tableOid = relid; + ItemPointerSet(&(tup->t_self), block, lineoff); + } - if (valid) + /* + * If the page is all visible, these fields won't otherwise be + * populated in the loop below. + */ + if (all_visible) { + if (check_serializable) + { +#ifdef BATCHMVCC_FEWER_ARGS + batchmvcc.visible[ntup] = true; +#else + visible[ntup] = true; +#endif + } scan->rs_vistuples[ntup] = lineoff; - ntup++; } + + ntup++; } Assert(ntup <= MaxHeapTuplesPerPage); - return ntup; + /* unless the page is all visible, test visibility for all tuples in one go */ + if (all_visible) + nvis = ntup; + else + nvis = HeapTupleSatisfiesMVCCBatch(snapshot, buffer, + ntup, +#ifdef BATCHMVCC_FEWER_ARGS + &batchmvcc, +#else + tuples, visible, +#endif + scan->rs_vistuples + ); + + /* + * So far we don't have a batch API for testing serializability, so do so + * one-by-one. + */ + if (check_serializable) + { + for (int i = 0; i < ntup; i++) + { +#ifdef BATCHMVCC_FEWER_ARGS + HeapCheckForSerializableConflictOut(batchmvcc.visible[i], + scan->rs_base.rs_rd, + &batchmvcc.tuples[i], + buffer, snapshot); +#else + HeapCheckForSerializableConflictOut(visible[i], + scan->rs_base.rs_rd, + &tuples[i], + buffer, snapshot); +#endif + } + } + + return nvis; } /* diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 24d3765aa201..d21d7d4cef7f 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -837,7 +837,18 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, tuple = ExecFetchSlotHeapTuple(slot, false, NULL); buf = hslot->buffer; - LockBuffer(buf, BUFFER_LOCK_SHARE); + /* + * To be able to guarantee that we can set the hint bit, acquire an + * exclusive lock on the old buffer. We need the hint bits to be set + * as otherwise reform_and_rewrite_tuple() -> rewrite_heap_tuple() -> + * heap_freeze_tuple() will get confused. + * + * It'd be better if we somehow could avoid setting hint bits on the + * old page. One reason to use VACUUM FULL is very bloated tables - + * rewriting most of the old table during VACUUM FULL doesn't + * exactly help... + */ + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf)) { diff --git a/src/backend/access/heap/heapam_visibility.c b/src/backend/access/heap/heapam_visibility.c index 05f6946fe60d..db5b9c7cb5f0 100644 --- a/src/backend/access/heap/heapam_visibility.c +++ b/src/backend/access/heap/heapam_visibility.c @@ -80,10 +80,35 @@ /* - * SetHintBits() + * To be allowed to set hint bits, SetHintBits() needs to call + * BufferPrepareToSetHintBits(). However, that's not free, and some callsites + * call SetHintBits() on numerous tuples in a row. For those it makes sense to + * amortize the cost of BufferPrepareToSetHintBits().
Additionally it's + * desirable to defer the cost of BufferPrepareToSetHintBits() until a hint + * bit needs to actually be set. This enum serves as the necessary state space + * passed to SetHintBitsExt(). + */ +typedef enum SetHintBitsState +{ + /* not yet checked if hint bits may be set */ + SHB_INITIAL, + /* failed to get permission to set hint bits, don't check again */ + SHB_DISABLED, + /* allowed to set hint bits */ + SHB_ENABLED, +} SetHintBitsState; + +/* + * SetHintBitsExt() * * Set commit/abort hint bits on a tuple, if appropriate at this time. * + * To be allowed to set a hint bit on a tuple, the page must not be undergoing + * IO at this time (otherwise we e.g. could corrupt PG's page checksum or even + * the filesystem's, as is known to happen with btrfs). The right to set a + * hint bit is acquired on a page level with BufferPrepareToSetHintBits(). + * Only a single backend gets the right to set hint bits at a time. + * * It is only safe to set a transaction-committed hint bit if we know the * transaction's commit record is guaranteed to be flushed to disk before the * buffer, or if the table is temporary or unlogged and will be obliterated by @@ -111,9 +136,16 @@ * InvalidTransactionId if no check is needed. */ static inline void -SetHintBits(HeapTupleHeader tuple, Buffer buffer, - uint16 infomask, TransactionId xid) +SetHintBitsExt(HeapTupleHeader tuple, Buffer buffer, + uint16 infomask, TransactionId xid, SetHintBitsState *state) { + /* + * We are in batched mode and previously did not get permission to set + * hint bits. Don't try again; in all likelihood IO is still going on. + */ + if (state && *state == SHB_DISABLED) + return; + if (TransactionIdIsValid(xid)) { /* NB: xid must be known committed here! */ @@ -127,8 +159,50 @@ SetHintBits(HeapTupleHeader tuple, Buffer buffer, } } + if (!state) + { + BufferSetHintBits16(buffer, + &tuple->t_infomask, + tuple->t_infomask | infomask); + return; + } + + /* + * If not batching or this is the first hint that we'd like to set on the + * page, check if we are allowed to do so. + */ + if (*state == SHB_INITIAL) + { + if (BufferPrepareToSetHintBits(buffer)) + { + *state = SHB_ENABLED; + } + else + { + /* + * If we hold an exclusive lock nobody else should be able to + * prevent us from setting hint bits. This is important as there + * are a few hint bit sets that are important for correctness - + * all those happen with the buffer exclusively locked. + */ + Assert(!BufferLockHeldByMe(buffer, BUFFER_LOCK_EXCLUSIVE)); + *state = SHB_DISABLED; + return; + } + } + tuple->t_infomask |= infomask; - MarkBufferDirtyHint(buffer, true); +} + +/* + * Simple wrapper around SetHintBitsExt(); use when operating on a single + * tuple. + */ +static inline void +SetHintBits(HeapTupleHeader tuple, Buffer buffer, + uint16 infomask, TransactionId xid) +{ + SetHintBitsExt(tuple, buffer, infomask, xid, NULL); } /* @@ -141,9 +215,65 @@ void HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer, uint16 infomask, TransactionId xid) { + /* + * The uses from heapam.c rely on being able to perform the hint bit + * updates, which can only be guaranteed if we are holding an exclusive + * lock on the buffer - which all callers are doing. + */ + Assert(BufferIsLocal(buffer) || BufferLockHeldByMe(buffer, BUFFER_LOCK_EXCLUSIVE)); + SetHintBits(tuple, buffer, infomask, xid); } +/* + * If HEAP_MOVED_OFF or HEAP_MOVED_IN are set on the tuple, remove them and + * adjust hint bits. See the comment for SetHintBits() for more background.
+ * + * This helper returns false if the row ought to be invisible, true otherwise. + */ +static inline bool +HeapTupleCleanMoved(HeapTupleHeader tuple, Buffer buffer) +{ + TransactionId xvac; + + /* only used by pre-9.0 binary upgrades */ + if (likely(!(tuple->t_infomask & (HEAP_MOVED_OFF | HEAP_MOVED_IN)))) + return true; + + xvac = HeapTupleHeaderGetXvac(tuple); + + if (TransactionIdIsCurrentTransactionId(xvac)) + elog(ERROR, "encountered tuple with HEAP_MOVED considered current"); + + if (TransactionIdIsInProgress(xvac)) + elog(ERROR, "encountered tuple with HEAP_MOVED considered in-progress"); + + if (tuple->t_infomask & HEAP_MOVED_OFF) + { + if (TransactionIdDidCommit(xvac)) + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + } + else if (tuple->t_infomask & HEAP_MOVED_IN) + { + if (TransactionIdDidCommit(xvac)) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + else + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + } + + return true; +} /* * HeapTupleSatisfiesSelf @@ -179,45 +309,8 @@ HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer) if (HeapTupleHeaderXminInvalid(tuple)) return false; - /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) - return false; - if (!TransactionIdIsInProgress(xvac)) - { - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } - } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (!TransactionIdIsCurrentTransactionId(xvac)) - { - if (TransactionIdIsInProgress(xvac)) - return false; - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - } - } + if (!HeapTupleCleanMoved(tuple, buffer)) + return false; else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) { if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ @@ -372,45 +465,8 @@ HeapTupleSatisfiesToast(HeapTuple htup, Snapshot snapshot, if (HeapTupleHeaderXminInvalid(tuple)) return false; - /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) - return false; - if (!TransactionIdIsInProgress(xvac)) - { - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } - } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (!TransactionIdIsCurrentTransactionId(xvac)) - { - if (TransactionIdIsInProgress(xvac)) - return false; - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - } - } 
+ if (!HeapTupleCleanMoved(tuple, buffer)) + return false; /* * An invalid Xmin can be left behind by a speculative insertion that @@ -468,45 +524,8 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, if (HeapTupleHeaderXminInvalid(tuple)) return TM_Invisible; - /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) - return TM_Invisible; - if (!TransactionIdIsInProgress(xvac)) - { - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return TM_Invisible; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } - } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (!TransactionIdIsCurrentTransactionId(xvac)) - { - if (TransactionIdIsInProgress(xvac)) - return TM_Invisible; - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return TM_Invisible; - } - } - } + else if (!HeapTupleCleanMoved(tuple, buffer)) + return TM_Invisible; else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) { if (HeapTupleHeaderGetCmin(tuple) >= curcid) @@ -756,45 +775,8 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, if (HeapTupleHeaderXminInvalid(tuple)) return false; - /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) - return false; - if (!TransactionIdIsInProgress(xvac)) - { - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } - } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (!TransactionIdIsCurrentTransactionId(xvac)) - { - if (TransactionIdIsInProgress(xvac)) - return false; - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - } - } + if (!HeapTupleCleanMoved(tuple, buffer)) + return false; else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) { if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ @@ -956,9 +938,9 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, * inserting/deleting transaction was still running --- which was more cycles * and more contention on ProcArrayLock.
*/ -static bool +static inline bool HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, - Buffer buffer) + Buffer buffer, SetHintBitsState *state) { HeapTupleHeader tuple = htup->t_data; @@ -979,45 +961,8 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, if (HeapTupleHeaderXminInvalid(tuple)) return false; - /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) - return false; - if (!XidInMVCCSnapshot(xvac, snapshot)) - { - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } - } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (!TransactionIdIsCurrentTransactionId(xvac)) - { - if (XidInMVCCSnapshot(xvac, snapshot)) - return false; - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - } - } + if (!HeapTupleCleanMoved(tuple, buffer)) + return false; else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) { if (HeapTupleHeaderGetCmin(tuple) >= snapshot->curcid) @@ -1050,8 +995,8 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) { /* deleting subtransaction must have aborted */ - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); + SetHintBitsExt(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId, state); return true; } @@ -1063,13 +1008,13 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, else if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmin(tuple), snapshot)) return false; else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - HeapTupleHeaderGetRawXmin(tuple)); + SetHintBitsExt(tuple, buffer, HEAP_XMIN_COMMITTED, + HeapTupleHeaderGetRawXmin(tuple), state); else { /* it must have aborted or crashed */ - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); + SetHintBitsExt(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId, state); return false; } } @@ -1132,14 +1077,14 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) { /* it must have aborted or crashed */ - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); + SetHintBitsExt(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId, state); return true; } /* xmax transaction committed */ - SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, - HeapTupleHeaderGetRawXmax(tuple)); + SetHintBitsExt(tuple, buffer, HEAP_XMAX_COMMITTED, + HeapTupleHeaderGetRawXmax(tuple), state); } else { @@ -1222,57 +1167,8 @@ HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer, TransactionId *de { if (HeapTupleHeaderXminInvalid(tuple)) return HEAPTUPLE_DEAD; - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) - return HEAPTUPLE_DELETE_IN_PROGRESS; - if (TransactionIdIsInProgress(xvac)) - return HEAPTUPLE_DELETE_IN_PROGRESS; - if 
(TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return HEAPTUPLE_DEAD; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) - return HEAPTUPLE_INSERT_IN_PROGRESS; - if (TransactionIdIsInProgress(xvac)) - return HEAPTUPLE_INSERT_IN_PROGRESS; - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return HEAPTUPLE_DEAD; - } - } - else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) - { - if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ - return HEAPTUPLE_INSERT_IN_PROGRESS; - /* only locked? run infomask-only check first, for performance */ - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) || - HeapTupleHeaderIsOnlyLocked(tuple)) - return HEAPTUPLE_INSERT_IN_PROGRESS; - /* inserted and then deleted by same xact */ - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple))) - return HEAPTUPLE_DELETE_IN_PROGRESS; - /* deleting subtransaction must have aborted */ - return HEAPTUPLE_INSERT_IN_PROGRESS; - } + else if (!HeapTupleCleanMoved(tuple, buffer)) + return HEAPTUPLE_DEAD; else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple))) { /* @@ -1762,6 +1658,60 @@ HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot, return true; } +/* + * Perform HeapTupleSatisfiesMVCC() on each passed-in tuple. This is more + * efficient than doing HeapTupleSatisfiesMVCC() one-by-one. + * + * The tuples to be checked are passed via BatchMVCCState->tuples. Each tuple's + * visibility is set in batchmvcc->visible[]. In addition, ->vistuples_dense + * is set to contain the offsets of visible tuples. + * + * Returns the number of visible tuples. + */ +int +HeapTupleSatisfiesMVCCBatch(Snapshot snapshot, Buffer buffer, + int ntups, +#ifdef BATCHMVCC_FEWER_ARGS + BatchMVCCState *batchmvcc, +#else + HeapTupleData *tuples, + bool *visible, +#endif + OffsetNumber *vistuples_dense) +{ + int nvis = 0; + SetHintBitsState state = SHB_INITIAL; + + Assert(IsMVCCSnapshot(snapshot)); + + for (int i = 0; i < ntups; i++) + { + bool valid; +#ifdef BATCHMVCC_FEWER_ARGS + HeapTuple tup = &batchmvcc->tuples[i]; +#else + HeapTuple tup = &tuples[i]; +#endif + + valid = HeapTupleSatisfiesMVCC(tup, snapshot, buffer, &state); +#ifdef BATCHMVCC_FEWER_ARGS + batchmvcc->visible[i] = valid; +#else + visible[i] = valid; +#endif + if (likely(valid)) + { + vistuples_dense[nvis] = tup->t_self.ip_posid; + nvis++; + } + } + + if (state == SHB_ENABLED) + BufferFinishSetHintBits(buffer, true, true); + + return nvis; +} + /* * HeapTupleSatisfiesVisibility * True iff heap tuple satisfies a time qual.
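/*
 * Illustrative sketch, not part of this patch: the calling convention that
 * the index AM changes above (gist, hash, nbtree) follow when marking index
 * entries dead under a share lock. Everything except
 * BufferPrepareToSetHintBits(), BufferFinishSetHintBits(), PageGetItemId()
 * and ItemIdMarkDead() is hypothetical and exists only for the example.
 */
static void
example_kill_items(Buffer buf, Page page, OffsetNumber *killed, int nkilled)
{
	bool		marked = false;

	for (int i = 0; i < nkilled; i++)
	{
		/* acquire the right to set hint bits once, before the first change */
		if (!marked && !BufferPrepareToSetHintBits(buf))
			return;				/* IO in progress; skipping is fine, it's only a hint */

		ItemIdMarkDead(PageGetItemId(page, killed[i]));
		marked = true;
	}

	/* release the right again, marking the standard-format buffer dirty */
	if (marked)
		BufferFinishSetHintBits(buf, true, true);
}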
@@ -1778,7 +1728,7 @@ HeapTupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, Buffer buffer) switch (snapshot->snapshot_type) { case SNAPSHOT_MVCC: - return HeapTupleSatisfiesMVCC(htup, snapshot, buffer); + return HeapTupleSatisfiesMVCC(htup, snapshot, buffer, NULL); case SNAPSHOT_SELF: return HeapTupleSatisfiesSelf(htup, snapshot, buffer); case SNAPSHOT_ANY: diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index aa82cede30aa..30c8b64564a6 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -679,20 +679,29 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, { /* * The conflicting tuple (or all HOT chains pointed to by - * all posting list TIDs) is dead to everyone, so mark the - * index entry killed. + * all posting list TIDs) is dead to everyone, so try to + * mark the index entry killed. It's ok if we're not + * allowed to, this isn't required for correctness. */ - ItemIdMarkDead(curitemid); - opaque->btpo_flags |= BTP_HAS_GARBAGE; + Buffer buf; - /* - * Mark buffer with a dirty hint, since state is not - * crucial. Be sure to mark the proper buffer dirty. - */ + /* Be sure to operate on the proper buffer */ if (nbuf != InvalidBuffer) - MarkBufferDirtyHint(nbuf, true); + buf = nbuf; else - MarkBufferDirtyHint(insertstate->buf, true); + buf = insertstate->buf; + + if (BufferPrepareToSetHintBits(buf)) + { + ItemIdMarkDead(curitemid); + opaque->btpo_flags |= BTP_HAS_GARBAGE; + + /* + * Mark buffer with a dirty hint, since state is not + * crucial. + */ + BufferFinishSetHintBits(buf, true, true); + } } /* diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 2aee9bbf67d2..53ad7b5017df 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -2492,6 +2492,10 @@ _bt_killitems(IndexScanDesc scan) */ if (killtuple && !ItemIdIsDead(iid)) { + /* if we're not able to set hint bits, there's no point continuing */ + if (!killedsomething && !BufferPrepareToSetHintBits(so->currPos.buf)) + goto unlock_page; + /* found the item/all posting list items */ ItemIdMarkDead(iid); killedsomething = true; @@ -2511,9 +2515,10 @@ _bt_killitems(IndexScanDesc scan) if (killedsomething) { opaque->btpo_flags |= BTP_HAS_GARBAGE; - MarkBufferDirtyHint(so->currPos.buf, true); + BufferFinishSetHintBits(so->currPos.buf, true, true); } +unlock_page: _bt_unlockbuf(scan->indexRelation, so->currPos.buf); } diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 5ee9d0b028ea..db5e0ec213b7 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -1052,6 +1052,8 @@ XLogCheckBufferNeedsBackup(Buffer buffer) * failures. So instead we copy the page and insert the copied data as normal * record data. * + * FIXME: outdated + * * We only need to do something if page has not yet been full page written in * this checkpoint round. The LSN of the inserted wal record is returned if we * had to write, InvalidXLogRecPtr otherwise. @@ -1089,37 +1091,23 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std) if (lsn <= RedoRecPtr) { int flags = 0; - PGAlignedBlock copied_buffer; char *origdata = (char *) BufferGetBlock(buffer); RelFileLocator rlocator; ForkNumber forkno; BlockNumber blkno; - /* - * Copy buffer so we don't have to worry about concurrent hint bit or - * lsn updates. 
We assume pd_lower/upper cannot be changed without an - * exclusive lock, so the contents bkp are not racy. - */ - if (buffer_std) - { - /* Assume we can omit data between pd_lower and pd_upper */ - Page page = BufferGetPage(buffer); - uint16 lower = ((PageHeader) page)->pd_lower; - uint16 upper = ((PageHeader) page)->pd_upper; - - memcpy(copied_buffer.data, origdata, lower); - memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper); - } - else - memcpy(copied_buffer.data, origdata, BLCKSZ); - XLogBeginInsert(); if (buffer_std) flags |= REGBUF_STANDARD; BufferGetTag(buffer, &rlocator, &forkno, &blkno); - XLogRegisterBlock(0, &rlocator, forkno, blkno, copied_buffer.data, flags); + + /* + * XXX: Should probably be XLogRegisterBuffer(), but assertions make + * that non-viable right now. + */ + XLogRegisterBlock(0, &rlocator, forkno, blkno, origdata, flags); recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI_FOR_HINT); } diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c index 4a7853d13fac..7442ade26088 100644 --- a/src/backend/storage/aio/method_worker.c +++ b/src/backend/storage/aio/method_worker.c @@ -388,6 +388,17 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) sprintf(cmd, "%d", MyIoWorkerId); set_ps_display(cmd); + /* + * Allow access to all buffers. We assume that backends will not request + * I/Os on shared buffers without holding proper locks. + */ +#if defined(ENFORCE_BUFFER_PROT) && defined(ENFORCE_BUFFER_PROT_READ) + for (int i = 0; i < NBuffers; i++) + { + SetBufferProtection(i + 1, true, true); + } +#endif + /* see PostgresMain() */ if (sigsetjmp(local_sigjmp_buf, 1) != 0) { diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README index a182fcd660cc..c3249907e6b8 100644 --- a/src/backend/storage/buffer/README +++ b/src/backend/storage/buffer/README @@ -58,7 +58,8 @@ tuple while they are doing visibility checks. 4. It is considered OK to update tuple commit status bits (ie, OR the values HEAP_XMIN_COMMITTED, HEAP_XMIN_INVALID, HEAP_XMAX_COMMITTED, or HEAP_XMAX_INVALID into t_infomask) while holding only a shared lock and -pin on a buffer. This is OK because another backend looking at the tuple +pin on a buffer after calling BufferPrepareToSetHintBits(). +This is OK because another backend looking at the tuple at about the same time would OR the same bits into the field, so there is little or no risk of conflicting update; what's more, if there did manage to be a conflict it would merely mean that one bit-update would @@ -68,6 +69,9 @@ great harm is done if they get reset to zero by conflicting updates. Note, however, that a tuple is frozen by setting both HEAP_XMIN_INVALID and HEAP_XMIN_COMMITTED; this is a critical update and accordingly requires an exclusive buffer lock (and it must also be WAL-logged). +Calling BufferPrepareToSetHintBits() ensures that write IO cannot happen at +the same time as modifying hint bits, which can lead the checksum computed at +the start of the write to not match anymore. 5. 
To physically remove a tuple or compact free space on a page, one must hold a pin and an exclusive lock, *and* observe while holding the diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index ed1dc488a42b..e5e22cfc59dd 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -138,6 +138,16 @@ BufferManagerShmemInit(void) LWTRANCHE_BUFFER_CONTENT); ConditionVariableInit(BufferDescriptorGetIOCV(buf)); + + /* + * Unused buffers are inaccessible. But if we're not enforcing + * making buffers inaccessible without a pin, we won't mark + * buffers as accessible during pinning, therefore we better don't + * make them initially inaccessible. + */ +#if defined(ENFORCE_BUFFER_PROT) && defined(ENFORCE_BUFFER_PROT_READ) + SetBufferProtection(i + 1, false, false); +#endif } /* Correct last entry of linked list */ diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index f9681d09e1e7..d680b57bcb70 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -37,6 +37,10 @@ #include #include +#ifdef ENFORCE_BUFFER_PROT +#include /* for mprotect() */ +#endif /* ENFORCE_BUFFER_PROT */ + #include "access/tableam.h" #include "access/xloginsert.h" #include "access/xlogutils.h" @@ -54,6 +58,7 @@ #include "storage/fd.h" #include "storage/ipc.h" #include "storage/lmgr.h" +#include "storage/pg_shmem.h" #include "storage/proc.h" #include "storage/read_stream.h" #include "storage/smgr.h" @@ -228,6 +233,8 @@ static void ResOwnerReleaseBufferIO(Datum res); static char *ResOwnerPrintBufferIO(Datum res); static void ResOwnerReleaseBufferPin(Datum res); static char *ResOwnerPrintBufferPin(Datum res); +static void ResOwnerReleaseBufferSettingHints(Datum res); +static char *ResOwnerPrintBufferSettingHints(Datum res); const ResourceOwnerDesc buffer_io_resowner_desc = { @@ -247,6 +254,15 @@ const ResourceOwnerDesc buffer_pin_resowner_desc = .DebugPrint = ResOwnerPrintBufferPin }; +const ResourceOwnerDesc buffer_setting_hints_resowner_desc = +{ + .name = "buffer setting hints", + .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS, + .release_priority = RELEASE_PRIO_BUFFER_IOS, + .ReleaseResource = ResOwnerReleaseBufferSettingHints, + .DebugPrint = ResOwnerPrintBufferSettingHints +}; + /* * Ensure that the PrivateRefCountArray has sufficient space to store one more * entry. This has to be called before using NewPrivateRefCountEntry() to fill @@ -1063,8 +1079,6 @@ ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid) if (need_to_zero) { - memset(BufferGetPage(buffer), 0, BLCKSZ); - /* * Grab the buffer content lock before marking the page as valid, to * make sure that no other backend sees the zeroed page before the @@ -1077,7 +1091,12 @@ ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid) * already valid.) */ if (!isLocalBuf) + { LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE); + SetBufferProtection(buffer, true, true); + } + + memset(BufferGetPage(buffer), 0, BLCKSZ); /* Set BM_VALID, terminate IO, and wake up any waiters */ if (isLocalBuf) @@ -1911,6 +1930,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) io_pages[0] = BufferGetBlock(buffers[nblocks_done]); io_buffers_len = 1; + if (persistence != RELPERSISTENCE_TEMP) + SetBufferProtection(io_buffers[0], true, true); + /* * How many neighboring-on-disk blocks can we scatter-read into other * buffers at the same time? 
In this case we don't wait if we see an @@ -1926,7 +1948,16 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) BufferGetBlockNumber(buffers[i]) - 1); Assert(io_buffers[io_buffers_len] == buffers[i]); - io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]); + io_pages[io_buffers_len] = BufferGetBlock(buffers[i]); + + /* + * If the I/O is performed synchronously, allow the synchronous + * read to modify the buffer. + */ + if (persistence != RELPERSISTENCE_TEMP) + SetBufferProtection(io_buffers[io_buffers_len], true, true); + + io_buffers_len++; } /* get a reference to wait for in WaitReadBuffers() */ @@ -1964,6 +1995,16 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) else pgBufferUsage.shared_blks_read += io_buffers_len; + /* + * Disallow writes for this proces again. If the I/O is performed + * asynchronously, the I/O worker will allow the write for itself. + */ + if (persistence != RELPERSISTENCE_TEMP) + { + for (int i = 0; i < io_buffers_len; i++) + SetBufferProtection(io_buffers[i], true, false); + } + /* * Track vacuum cost when issuing IO, not after waiting for it. * Otherwise we could end up issuing a lot of IO in a short timespan, @@ -2133,7 +2174,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, /* some sanity checks while we hold the buffer header lock */ Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1); - Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS))); + Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY + | BM_IO_IN_PROGRESS | BM_SETTING_HINTS))); victim_buf_hdr->tag = newTag; @@ -2381,9 +2423,8 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context) /* * If the buffer was dirty, try to write it out. There is a race * condition here, in that someone might dirty it after we released the - * buffer header lock above, or even while we are writing it out (since - * our share-lock won't prevent hint-bit updates). We will recheck the - * dirty bit after re-locking the buffer header. + * buffer header lock above. We will recheck the dirty bit after + * re-locking the buffer header. */ if (buf_state & BM_DIRTY) { @@ -2489,7 +2530,8 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context) buf_state = pg_atomic_read_u32(&buf_hdr->state); Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1); - Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY))); + Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY + | BM_IO_IN_PROGRESS | BM_SETTING_HINTS))); CheckBufferIsPinnedOnce(buf); #endif @@ -2637,7 +2679,9 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1)); /* new buffers are zero-filled */ + SetBufferProtection(buffers[i], true, true); MemSet(buf_block, 0, BLCKSZ); + SetBufferProtection(buffers[i], true, false); } /* @@ -2865,7 +2909,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, } if (lock) - LWLockAcquire(BufferDescriptorGetContentLock(buf_hdr), LW_EXCLUSIVE); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); TerminateBufferIO(buf_hdr, false, BM_VALID, true, false); } @@ -3124,6 +3168,10 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy) * non-accessible in any case. 
*/ VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ); + +#if defined(ENFORCE_BUFFER_PROT) && defined(ENFORCE_BUFFER_PROT_READ) + SetBufferProtection(BufferDescriptorGetBuffer(buf), true, false); +#endif break; } } @@ -3196,6 +3244,10 @@ PinBuffer_Locked(BufferDesc *buf) */ VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ); +#if defined(ENFORCE_BUFFER_PROT) && defined(ENFORCE_BUFFER_PROT_READ) + SetBufferProtection(BufferDescriptorGetBuffer(buf), true, false); +#endif + /* * Since we hold the buffer spinlock, we can update the buffer state and * release the lock in one operation. @@ -3293,6 +3345,10 @@ UnpinBufferNoOwner(BufferDesc *buf) */ VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ); +#if defined(ENFORCE_BUFFER_PROT) && defined(ENFORCE_BUFFER_PROT_READ) + SetBufferProtection(BufferDescriptorGetBuffer(buf), false, false); +#endif + /* I'd better not still hold the buffer content lock */ Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf))); @@ -4101,6 +4157,69 @@ CheckForBufferLeaks(void) #endif } +/* + * To verify that we are following buffer locking rules, we can make pages + * inaccessible or read-only when we don't have sufficient locks etc. + * + * XXX: It might be possible to fold the VALGRIND_MAKE_MEM_NOACCESS() / + * VALGRIND_MAKE_MEM_DEFINED() calls into this. + */ +#ifdef ENFORCE_BUFFER_PROT +void +SetBufferProtection(Buffer buf, bool allow_reads, bool allow_writes) +{ + static long pagesz = 0; + int prot = PROT_NONE; + int rc; + + Assert(huge_pages_status != HUGE_PAGES_UNKNOWN && + huge_pages_status != HUGE_PAGES_TRY); + StaticAssertStmt(PROT_NONE == 0, "that can't be right"); + + if (unlikely(pagesz == 0)) + { + pagesz = sysconf(_SC_PAGESIZE); + + elog(DEBUG1, "sysconf(_SC_PAGESIZE) = %ld", pagesz); + if (pagesz == -1) + { + elog(ERROR, "sysconf(_SC_PAGESIZE) failed: %m"); + } + else if (pagesz > BLCKSZ) + { + elog(DEBUG1, "pagesz > BLCKSZ, disabling buffer protection mode"); + pagesz = -1; + } + else if(BLCKSZ % pagesz != 0) + { + elog(DEBUG1, "BLCKSZ %% pagesz != 0, disabling buffer protection mode"); + pagesz = -1; + } + else if (huge_pages_status == HUGE_PAGES_ON) + { + /* can't set status in a granular enough way */ + elog(DEBUG1, "huge pages enabled, disabling buffer protection mode"); + pagesz = -1; + } + } + + /* disabled */ + if (pagesz == -1) + return; + + if (allow_reads) + prot |= PROT_READ; + + if (allow_writes) + prot |= PROT_WRITE; + + rc = mprotect(BufferGetBlock(buf), BLCKSZ, prot); + + if (rc != 0) + elog(ERROR, "mprotect(%d, %d) failed: %m", buf, prot); +} +#endif /* ENFORCE_BUFFER_PROT */ + /* * Helper routine to issue warnings when a buffer is unexpectedly pinned */ @@ -4214,9 +4333,8 @@ BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, * * The caller must hold a pin on the buffer and have share-locked the * buffer contents. (Note: a share-lock does not prevent updates of - * hint bits in the buffer, so the page could change while the write - * is in progress, but we assume that that will not invalidate the data - * written.) + * hint bits in the buffer, but hint bits are not allowed to be set after + * StartBufferIO() succeeded.) * * If the caller has an smgr reference for the buffer's relation, pass it * as the second parameter. If not, pass NULL. 
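/*
 * Illustrative, standalone sketch (not part of this patch): the plain POSIX
 * mprotect() mechanism that SetBufferProtection() above builds on. A
 * page-aligned region is mapped read-write, downgraded to read-only (so a
 * stray write would fault), and upgraded again before writing - analogous to
 * what happens around BUFFER_LOCK_EXCLUSIVE when ENFORCE_BUFFER_PROT is
 * defined. All identifiers below are local to the example.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
	long		pagesz = sysconf(_SC_PAGESIZE);
	char	   *region;

	/* anonymous, page-aligned mapping standing in for a shared buffer */
	region = mmap(NULL, pagesz, PROT_READ | PROT_WRITE,
				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (region == MAP_FAILED)
	{
		perror("mmap");
		return 1;
	}
	memset(region, 0, pagesz);

	/* revoke write access; an unguarded write here would now SIGSEGV */
	if (mprotect(region, pagesz, PROT_READ) != 0)
	{
		perror("mprotect");
		return 1;
	}

	/* re-allow writes before modifying the page again */
	if (mprotect(region, pagesz, PROT_READ | PROT_WRITE) != 0)
	{
		perror("mprotect");
		return 1;
	}
	region[0] = 1;

	munmap(region, pagesz);
	return 0;
}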
@@ -4229,7 +4347,6 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, ErrorContextCallback errcallback; instr_time io_start; Block bufBlock; - char *bufToWrite; uint32 buf_state; /* @@ -4296,12 +4413,11 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, */ bufBlock = BufHdrGetBlock(buf); - /* - * Update page checksum if desired. Since we have only shared lock on the - * buffer, other processes might be updating hint bits in it, so we must - * copy the page to private storage if we do checksumming. - */ - bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum); + /* Update page checksum if desired. */ + SetBufferProtection(BufferDescriptorGetBuffer(buf), true, true); + PageSetChecksum((Page) bufBlock, buf->tag.blockNum); + /* FIXME: could theoretically be exclusively locked */ + SetBufferProtection(BufferDescriptorGetBuffer(buf), true, false); io_start = pgstat_prepare_io_time(track_io_timing); @@ -4311,7 +4427,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, smgrwrite(reln, BufTagGetForkNum(&buf->tag), buf->tag.blockNum, - bufToWrite, + bufBlock, false); /* @@ -5341,40 +5457,40 @@ IncrBufferRefCount(Buffer buffer) } /* - * MarkBufferDirtyHint - * - * Mark a buffer dirty for non-critical changes. - * - * This is essentially the same as MarkBufferDirty, except: - * - * 1. The caller does not write WAL; so if checksums are enabled, we may need - * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages. - * 2. The caller might have only share-lock instead of exclusive-lock on the - * buffer's content lock. - * 3. This function does not guarantee that the buffer is always marked dirty - * (due to a race condition), so it cannot be used for important changes. + * Implementation of MarkBufferDirtyHint(), separate because + * BufferSetHintBits16() needs to be able to set hint bits without setting + * BM_SETTING_HINTS. */ -void -MarkBufferDirtyHint(Buffer buffer, bool buffer_std) +static inline void +MarkBufferDirtyHintImpl(Buffer buffer, bool buffer_std, bool single_hint_write) { BufferDesc *bufHdr; Page page = BufferGetPage(buffer); - if (!BufferIsValid(buffer)) - elog(ERROR, "bad buffer ID: %d", buffer); - - if (BufferIsLocal(buffer)) - { - MarkLocalBufferDirty(buffer); - return; - } - bufHdr = GetBufferDescriptor(buffer - 1); Assert(GetPrivateRefCount(buffer) > 0); /* here, either share or exclusive lock is OK */ Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr))); + /* + * A share-locked buffer may only be marked dirty for hints after having + * gotten the permission to do so with BufferPrepareToSetHintBits(). It'd + * perhaps be cheap enough to test this even outside of assert enabled + * builds, but LWLockHeldByMeInMode() says "debug support only". + */ +#ifdef USE_ASSERT_CHECKING + if (!single_hint_write) + { + uint32 buf_state = pg_atomic_read_u32(&bufHdr->state); + + if (!LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE)) + { + Assert(buf_state & BM_SETTING_HINTS); + } + } +#endif + /* * This routine might get called many times on the same page, if we are * making the first scan after commit of an xact that added/deleted many @@ -5487,6 +5603,35 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) } } +/* + * MarkBufferDirtyHint + * + * Mark a buffer dirty for non-critical changes. + * + * This is essentially the same as MarkBufferDirty, except: + * + * 1. 
The caller does not write WAL; so if checksums are enabled, we may need + * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages. + * 2. The caller might have only share-lock instead of exclusive-lock on the + * buffer's content lock. + * 3. This function does not guarantee that the buffer is always marked dirty + * (due to a race condition), so it cannot be used for important changes. + */ +void +MarkBufferDirtyHint(Buffer buffer, bool buffer_std) +{ + if (!BufferIsValid(buffer)) + elog(ERROR, "bad buffer ID: %d", buffer); + + if (BufferIsLocal(buffer)) + { + MarkLocalBufferDirty(buffer); + return; + } + + MarkBufferDirtyHintImpl(buffer, buffer_std, false); +} + /* * Release buffer content locks for shared buffers. * @@ -5528,19 +5673,38 @@ void LockBuffer(Buffer buffer, int mode) { BufferDesc *buf; + LWLock *content_lock; Assert(BufferIsPinned(buffer)); if (BufferIsLocal(buffer)) return; /* local buffers need no lock */ buf = GetBufferDescriptor(buffer - 1); + content_lock = BufferDescriptorGetContentLock(buf); if (mode == BUFFER_LOCK_UNLOCK) - LWLockRelease(BufferDescriptorGetContentLock(buf)); + { +#ifdef ENFORCE_BUFFER_PROT + bool was_exclusive; + + was_exclusive = LWLockHeldByMeInMode(content_lock, LW_EXCLUSIVE); +#endif /* ENFORCE_BUFFER_PROT */ + + LWLockRelease(content_lock); + +#ifdef ENFORCE_BUFFER_PROT + if (was_exclusive) + SetBufferProtection(buffer, true, false); +#endif /* ENFORCE_BUFFER_PROT */ + } else if (mode == BUFFER_LOCK_SHARE) - LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED); + LWLockAcquire(content_lock, LW_SHARED); else if (mode == BUFFER_LOCK_EXCLUSIVE) - LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE); + { + LWLockAcquire(content_lock, LW_EXCLUSIVE); + + SetBufferProtection(buffer, true, true); + } else elog(ERROR, "unrecognized buffer lock mode: %d", mode); } @@ -5554,6 +5718,7 @@ bool ConditionalLockBuffer(Buffer buffer) { BufferDesc *buf; + bool ret; Assert(BufferIsPinned(buffer)); if (BufferIsLocal(buffer)) @@ -5561,8 +5726,13 @@ ConditionalLockBuffer(Buffer buffer) buf = GetBufferDescriptor(buffer - 1); - return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), - LW_EXCLUSIVE); + ret = LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), + LW_EXCLUSIVE); + + if (ret) + SetBufferProtection(buffer, true, true); + + return ret; } /* @@ -5873,6 +6043,47 @@ IsBufferCleanupOK(Buffer buffer) return false; } +/* + * BufferLockHeldByMe - does the backend have the buffer locked? + * + * This likely should only be used for asserts etc. + * + * Note that this can only be called for non-temp buffers - there is no + * correct value to return for temporary buffers. One might think that just + * returning true for temp buffers would work, but the caller might assert + * that a lock is *not* held. + */ +bool +BufferLockHeldByMe(Buffer buffer, int mode) +{ + BufferDesc *buf; + LWLockMode lwmode; + + /* + * Can't hold a lock without a pin, there never should be uncertainty + * about having a pin. 
+ */ + Assert(BufferIsPinned(buffer)); + + /* there'd be no correct value to return */ + Assert(!BufferIsLocal(buffer)); + + buf = GetBufferDescriptor(buffer - 1); + + if (mode == BUFFER_LOCK_EXCLUSIVE) + lwmode = LW_EXCLUSIVE; + else if (mode == BUFFER_LOCK_SHARE) + lwmode = LW_SHARED; + else + { + Assert(false); + pg_unreachable(); + lwmode = LW_EXCLUSIVE; /* assuage compiler */ + } + + return LWLockHeldByMeInMode(BufferDescriptorGetContentLock(buf), lwmode); +} + /* * Functions for buffer I/O handling @@ -5881,7 +6092,8 @@ IsBufferCleanupOK(Buffer buffer) */ /* - * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared. + * WaitIO -- Block until the IO_IN_PROGRESS and SETTING_HINTS flags on 'buf' + * are cleared. */ static void WaitIO(BufferDesc *buf) @@ -5909,32 +6121,44 @@ WaitIO(BufferDesc *buf) iow = buf->io_wref; UnlockBufHdr(buf, buf_state); - /* no IO in progress, we don't need to wait */ - if (!(buf_state & BM_IO_IN_PROGRESS)) - break; + if (likely((buf_state & (BM_IO_IN_PROGRESS | BM_SETTING_HINTS)) == 0)) + break; /* no need to wait */ /* * The buffer has asynchronous IO in progress, wait for it to * complete. */ - if (pgaio_wref_valid(&iow)) + if (buf_state & BM_IO_IN_PROGRESS) { - pgaio_wref_wait(&iow); + Assert(!(buf_state & BM_SETTING_HINTS)); - /* - * The AIO subsystem internally uses condition variables and thus - * might remove this backend from the BufferDesc's CV. While that - * wouldn't cause a correctness issue (the first CV sleep just - * immediately returns if not already registered), it seems worth - * avoiding unnecessary loop iterations, given that we take care - * to do so at the start of the function. - */ - ConditionVariablePrepareToSleep(cv); + if (pgaio_wref_valid(&iow)) + { + pgaio_wref_wait(&iow); + + /* + * The AIO subsystem internally uses condition variables and thus + * might remove this backend from the BufferDesc's CV. While that + * wouldn't cause a correctness issue (the first CV sleep just + * immediately returns if not already registered), it seems worth + * avoiding unnecessary loop iterations, given that we take care + * to do so at the start of the function. + */ + ConditionVariablePrepareToSleep(cv); + continue; + } + + /* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */ + ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO); + } + else + { + /* Wait for hint-bit setting to complete */ + Assert(buf_state & BM_SETTING_HINTS); + Assert(!(buf_state & BM_IO_IN_PROGRESS)); + ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_SETTING_HINTS); continue; } - - /* wait on BufferDesc->cv, e.g. 
for concurrent synchronous IO */ - ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO); } ConditionVariableCancelSleep(); } @@ -5973,7 +6197,7 @@ StartBufferIO(BufferDesc *buf, bool forInput, bool nowait) { buf_state = LockBufHdr(buf); - if (!(buf_state & BM_IO_IN_PROGRESS)) + if (!(buf_state & (BM_IO_IN_PROGRESS | BM_SETTING_HINTS))) break; UnlockBufHdr(buf, buf_state); if (nowait) @@ -5990,6 +6214,8 @@ StartBufferIO(BufferDesc *buf, bool forInput, bool nowait) return false; } + Assert(!(buf_state & BM_SETTING_HINTS)); + buf_state |= BM_IO_IN_PROGRESS; UnlockBufHdr(buf, buf_state); @@ -6028,6 +6254,7 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, buf_state = LockBufHdr(buf); Assert(buf_state & BM_IO_IN_PROGRESS); + Assert(!(buf_state & BM_SETTING_HINTS)); buf_state &= ~BM_IO_IN_PROGRESS; /* Clear earlier errors, if this IO failed, it'll be marked again */ @@ -6086,6 +6313,7 @@ AbortBufferIO(Buffer buffer) buf_state = LockBufHdr(buf_hdr); Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID)); + Assert(!(buf_state & BM_SETTING_HINTS)); if (!(buf_state & BM_VALID)) { @@ -6174,10 +6402,12 @@ rlocator_comparator(const void *p1, const void *p2) } /* - * Lock buffer header - set BM_LOCKED in buffer state. + * Slow path helper for LockBufHdr(). + * + * See comment in LockBufHdr() for the reason this is moved out of line. */ -uint32 -LockBufHdr(BufferDesc *desc) +pg_noinline uint32 +LockBufHdrSlow(BufferDesc *desc) { SpinDelayStatus delayStatus; uint32 old_buf_state; @@ -6563,6 +6793,246 @@ EvictUnpinnedBuffer(Buffer buf) return result; } +/* + * Try to acquire the right to set hint bits for buf. + * + * It's only permitted to set hint bits in a buffer if the buffer is not + * undergoing IO. Otherwise the page-level checksum could be corrupted. This + * could happen both for PG's checksum and on the OS/filesystem + * level. E.g. btrfs with direct-io relies on the page to not change while IO + * is going on. + * + * We can't just check if IO is going on at the time + * BufferPrepareToSetHintBits() is called; we also need to block IO from + * starting before we're done setting hints. This is achieved by setting + * BM_SETTING_HINTS for the buffer and having StartBufferIO()/WaitIO() wait + * for that. We could combine BM_SETTING_HINTS and BM_IO_IN_PROGRESS into one, + * as they can never be set at the same time, but that seems unnecessarily + * confusing. + * + * Because we use only a single bit (BM_SETTING_HINTS) to track whether hint + * bits are currently being set, we cannot allow multiple backends to set + * hints at the same time - it'd be unknown whether BM_SETTING_HINTS would + * need to remain set when a backend finishes setting hint bits. In almost + * all situations the two backends would just set the same hint bits anyway, + * so this is unlikely to be a problem. + * + * If allowed to set hint bits, the caller needs to call + * BufferFinishSetHintBits() once done setting hint bits. In case of an error + * occurring before BufferFinishSetHintBits() is reached, the cleanup will be + * done via resowner.c. + * + * It can make sense to amortize the cost of BufferPrepareToSetHintBits() + + * BufferFinishSetHintBits() over multiple hint bit sets on a page. + * + * Returns whether the caller is allowed to set hint bits.
+ */ +bool +BufferPrepareToSetHintBits(Buffer buffer) +{ + BufferDesc *desc; + uint32 old_buf_state; + + Assert(BufferIsPinned(buffer)); + + if (BufferIsLocal(buffer)) + return true; + + ResourceOwnerEnlarge(CurrentResourceOwner); + + desc = GetBufferDescriptor(buffer - 1); + + /* + * We could use LockBufHdr() instead, but a CAS loop turns out to be + * slightly faster and has better concurrency behaviour due to not + * blocking other backends. + */ + old_buf_state = pg_atomic_read_u32(&desc->state); + + for (;;) + { + uint32 new_buf_state; + + if (unlikely(old_buf_state & BM_LOCKED)) + old_buf_state = WaitBufHdrUnlocked(desc); + + if (unlikely((old_buf_state & (BM_SETTING_HINTS | BM_IO_IN_PROGRESS)) != 0)) + return false; + + new_buf_state = old_buf_state; + + new_buf_state |= BM_SETTING_HINTS; + + if (likely(pg_atomic_compare_exchange_u32(&desc->state, &old_buf_state, + new_buf_state))) + { + ResourceOwnerRememberBufferSettingHints(CurrentResourceOwner, buffer); + +#ifdef ENFORCE_BUFFER_PROT + if (!LWLockHeldByMeInMode(BufferDescriptorGetContentLock(desc), LW_EXCLUSIVE)) + SetBufferProtection(buffer, true, true); +#endif /* ENFORCE_BUFFER_PROT */ + + return true; + } + } +} + +/* + * Release the permission to set hint bits on the current page. + */ +void +BufferFinishSetHintBits(Buffer buffer, bool mark_dirty, bool buffer_std) +{ + BufferDesc *desc; + uint32 new_buf_state; + uint32 old_buf_state; + bool is_dirty = false; + + Assert(BufferIsPinned(buffer)); + + if (BufferIsLocal(buffer)) + { + MarkLocalBufferDirty(buffer); + return; + } + + desc = GetBufferDescriptor(buffer - 1); + + /* + * We could use LockBufHdr() instead, but a CAS loop turns out to be + * slightly faster and has better concurrency behaviour due to not + * blocking other backends. + */ + old_buf_state = pg_atomic_read_u32(&desc->state); + + Assert(old_buf_state & (BM_TAG_VALID)); + Assert(old_buf_state & (BM_VALID)); + Assert(old_buf_state & (BM_SETTING_HINTS)); + Assert(!(old_buf_state & (BM_IO_IN_PROGRESS))); + + for (;;) + { + if (unlikely(old_buf_state & BM_LOCKED)) + old_buf_state = WaitBufHdrUnlocked(desc); + + is_dirty = (old_buf_state & (BM_DIRTY | BM_JUST_DIRTIED)) == + (BM_DIRTY | BM_JUST_DIRTIED); + + new_buf_state = old_buf_state; + new_buf_state &= ~BM_SETTING_HINTS; + + if (likely(pg_atomic_compare_exchange_u32(&desc->state, &old_buf_state, + new_buf_state))) + { + break; + } + } + + /* + * It might be worth combining some of the atomic operations happening + * inside MarkBufferDirtyHintImpl() with the above. + */ + if (mark_dirty && !is_dirty) + MarkBufferDirtyHintImpl(buffer, buffer_std, true); + + ResourceOwnerForgetBufferSettingHints(CurrentResourceOwner, buffer); + + /* + * Wake everyone that might be waiting for a chance to perform IO (i.e. + * WaitIO()). + * + * XXX: Right now this is somewhat expensive, due to + * ConditionVariableBroadcast() acquiring its spinlock unconditionally. I + * don't see a good way to avoid that from the bufmgr.c side, we don't + * know if there is somebody waiting. + */ + ConditionVariableBroadcast(BufferDescriptorGetIOCV(desc)); + +#ifdef ENFORCE_BUFFER_PROT + if (!LWLockHeldByMeInMode(BufferDescriptorGetContentLock(desc), LW_EXCLUSIVE)) + SetBufferProtection(buffer, true, false); +#endif /* ENFORCE_BUFFER_PROT */ +} + +/* + * Try to set 16 bit hint bit value. + * + * This is cheaper than BufferPrepareToSetHintBits() + + * BufferFinishSetHintBits() when doing a single hint bit write within a + * buffer, but more expensive when setting multiple hint bits. 
+ * + * Assumes that the buffer has the standard layout. The buffer will be marked + * as dirty if possible. + */ +void +BufferSetHintBits16(Buffer buffer, uint16 *ptr, uint16 val) +{ + BufferDesc *desc; + uint32 buf_state; + bool did_set = false; + bool is_dirty = false; + + Assert(BufferIsPinned(buffer)); + + /* + * TODO: Add assert ensuring ptr inside the buffer. + */ + + if (unlikely(BufferIsLocal(buffer))) + { + desc = GetLocalBufferDescriptor(-buffer - 1); + buf_state = pg_atomic_read_u32(&desc->state); + + if (likely((buf_state & BM_IO_IN_PROGRESS) == 0)) + { + *ptr = val; + + is_dirty = (buf_state & (BM_DIRTY | BM_JUST_DIRTIED)) == + (BM_DIRTY | BM_JUST_DIRTIED); + did_set = true; + } + + if (did_set && !is_dirty) + MarkLocalBufferDirty(buffer); + return; + } + else + { +#ifdef ENFORCE_BUFFER_PROT + bool exclusive; +#endif + + desc = GetBufferDescriptor(buffer - 1); + +#ifdef ENFORCE_BUFFER_PROT + exclusive = LWLockHeldByMeInMode(BufferDescriptorGetContentLock(desc), + LW_EXCLUSIVE); + + if (!exclusive) + SetBufferProtection(buffer, true, true); +#endif + + buf_state = LockBufHdr(desc); + + if (likely((buf_state & BM_IO_IN_PROGRESS) == 0)) + { + *ptr = val; + did_set = true; + } + + UnlockBufHdr(desc, buf_state); + + is_dirty = (buf_state & (BM_DIRTY | BM_JUST_DIRTIED)) == (BM_DIRTY | BM_JUST_DIRTIED); + if (did_set && !is_dirty) + MarkBufferDirtyHintImpl(buffer, true, true); + +#ifdef ENFORCE_BUFFER_PROT + if (!exclusive) + SetBufferProtection(buffer, true, false); +#endif + } +} + /* * Generic implementation of the AIO handle staging callback for readv/writev * on local/shared buffers. @@ -7157,6 +7627,28 @@ shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, return buffer_readv_complete(ioh, prior_result, cb_data, false); } +static void +ResOwnerReleaseBufferSettingHints(Datum res) +{ + Buffer buffer = DatumGetInt32(res); + + /* + * At this point we don't know if the buffer was a standard buffer or + * whether the caller wanted to dirty the buffer. But we don't need to + * dirty the buffer, which also means we don't need to know if the buffer + * was a standard buffer or not. + */ + BufferFinishSetHintBits(buffer, false, false); +} + +static char * +ResOwnerPrintBufferSettingHints(Datum res) +{ + Buffer buffer = DatumGetInt32(res); + + return psprintf("lost track of setting hint bits on buffer %d", buffer); +} + /* * We need a backend-local completion callback for shared buffers, to be able * to report checksum errors correctly. Unfortunately that can only safely diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index bf89076bb109..5d53bf12b030 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -196,7 +196,7 @@ FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln) reln = smgropen(BufTagGetRelFileLocator(&bufHdr->tag), MyProcNumber); - PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); + PageSetChecksum(localpage, bufHdr->tag.blockNum); io_start = pgstat_prepare_io_time(track_io_timing); diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index 4773a9cc65e6..f83c13653f17 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -904,12 +904,18 @@ fsm_vacuum_page(Relation rel, FSMAddress addr, max_avail = fsm_get_max_avail(page); /* - * Reset the next slot pointer. 
This encourages the use of low-numbered - * pages, increasing the chances that a later vacuum can truncate the - * relation. We don't bother with a lock here, nor with marking the page - * dirty if it wasn't already, since this is just a hint. + * Try to reset the next slot pointer. This encourages the use of + * low-numbered pages, increasing the chances that a later vacuum can + * truncate the relation. We don't bother with a lock here, nor with + * marking the page dirty if it wasn't already, since this is just a hint. + * + * FIXME: Should probably use BufferSetHintBits16() */ - ((FSMPage) PageGetContents(page))->fp_next_slot = 0; + if (BufferPrepareToSetHintBits(buf)) + { + ((FSMPage) PageGetContents(page))->fp_next_slot = 0; + BufferFinishSetHintBits(buf, false, false); + } ReleaseBuffer(buf); diff --git a/src/backend/storage/freespace/fsmpage.c b/src/backend/storage/freespace/fsmpage.c index 66a5c80b5a6c..169705979985 100644 --- a/src/backend/storage/freespace/fsmpage.c +++ b/src/backend/storage/freespace/fsmpage.c @@ -300,7 +300,13 @@ fsm_search_avail(Buffer buf, uint8 minvalue, bool advancenext, * * Wrap-around is handled at the beginning of this function. */ - fsmpage->fp_next_slot = slot + (advancenext ? 1 : 0); + if (exclusive_lock_held || BufferPrepareToSetHintBits(buf)) + { + fsmpage->fp_next_slot = slot + (advancenext ? 1 : 0); + + if (!exclusive_lock_held) + BufferFinishSetHintBits(buf, false, true); + } return slot; } diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 82457bacc62f..b7ebec8a69d5 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -1494,51 +1494,15 @@ PageIndexTupleOverwrite(Page page, OffsetNumber offnum, /* * Set checksum for a page in shared buffers. * - * If checksums are disabled, or if the page is not initialized, just return - * the input. Otherwise, we must make a copy of the page before calculating - * the checksum, to prevent concurrent modifications (e.g. setting hint bits) - * from making the final checksum invalid. It doesn't matter if we include or - * exclude hints during the copy, as long as we write a valid page and - * associated checksum. + * If checksums are disabled, or if the page is not initialized, just + * return. Otherwise compute and set the checksum. * - * Returns a pointer to the block-sized data that needs to be written. Uses - * statically-allocated memory, so the caller must immediately write the - * returned page and not refer to it again. - */ -char * -PageSetChecksumCopy(Page page, BlockNumber blkno) -{ - static char *pageCopy = NULL; - - /* If we don't need a checksum, just return the passed-in data */ - if (PageIsNew(page) || !DataChecksumsEnabled()) - return page; - - /* - * We allocate the copy space once and use it over on each subsequent - * call. The point of palloc'ing here, rather than having a static char - * array, is first to ensure adequate alignment for the checksumming code - * and second to avoid wasting space in processes that never call this. - */ - if (pageCopy == NULL) - pageCopy = MemoryContextAllocAligned(TopMemoryContext, - BLCKSZ, - PG_IO_ALIGN_SIZE, - 0); - - memcpy(pageCopy, page, BLCKSZ); - ((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno); - return pageCopy; -} - -/* - * Set checksum for a page in private memory. - * - * This must only be used when we know that no other process can be modifying - * the page buffer. 
+ * In the past this needed to be done on a copy of the page, due to the + * possibility of e.g. hint bits being set concurrently. However, this is not + * necessary anymore as hint bits won't be set while IO is going on. */ void -PageSetChecksumInplace(Page page, BlockNumber blkno) +PageSetChecksum(Page page, BlockNumber blkno) { /* If we don't need a checksum, just return */ if (PageIsNew(page) || !DataChecksumsEnabled()) diff --git a/src/backend/storage/smgr/bulk_write.c b/src/backend/storage/smgr/bulk_write.c index b958be157164..f4d075433650 100644 --- a/src/backend/storage/smgr/bulk_write.c +++ b/src/backend/storage/smgr/bulk_write.c @@ -279,7 +279,7 @@ smgr_bulk_flush(BulkWriteState *bulkstate) BlockNumber blkno = pending_writes[i].blkno; Page page = pending_writes[i].buf->data; - PageSetChecksumInplace(page, blkno); + PageSetChecksum(page, blkno); if (blkno >= bulkstate->relsize) { diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 4f44648aca81..455c55428ee2 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -112,6 +112,7 @@ BGWORKER_SHUTDOWN "Waiting for background worker to shut down." BGWORKER_STARTUP "Waiting for background worker to start up." BTREE_PAGE "Waiting for the page number needed to continue a parallel B-tree scan to become available." BUFFER_IO "Waiting for buffer I/O to complete." +BUFFER_SETTING_HINTS "Waiting for hint bit setting to complete." CHECKPOINT_DELAY_COMPLETE "Waiting for a backend that blocks a checkpoint from completing." CHECKPOINT_DELAY_START "Waiting for a backend that blocks a checkpoint from starting." CHECKPOINT_DONE "Waiting for a checkpoint to complete." diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 4eaeca89f2c7..ea8d796e7c45 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -566,7 +566,7 @@ static int ssl_renegotiation_limit; */ int huge_pages = HUGE_PAGES_TRY; int huge_page_size; -static int huge_pages_status = HUGE_PAGES_UNKNOWN; +int huge_pages_status = HUGE_PAGES_UNKNOWN; /* * These variables are all dummies that don't do anything, except in some diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 1640d9c32f7c..a5e741dd1c08 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -425,6 +425,34 @@ extern bool HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple); extern bool HeapTupleIsSurelyDead(HeapTuple htup, struct GlobalVisState *vistest); +/* + * FIXME: define to be removed + * + * Without this I see worse performance. But it's a bit ugly, so I thought + * it'd be useful to leave a way in for others to experiment with this. 
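+ *
+ * With the define, HeapTupleSatisfiesMVCCBatch() takes a single
+ * BatchMVCCState * that bundles the tuple and visibility arrays; without it,
+ * the two arrays are passed as separate pointer arguments (see the prototype
+ * below).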
+ */ +#define BATCHMVCC_FEWER_ARGS + +#ifdef BATCHMVCC_FEWER_ARGS +typedef struct BatchMVCCState +{ + HeapTupleData tuples[MaxHeapTuplesPerPage]; + bool visible[MaxHeapTuplesPerPage]; +} BatchMVCCState; +#endif + +extern int HeapTupleSatisfiesMVCCBatch(Snapshot snapshot, Buffer buffer, + int ntups, +#ifdef BATCHMVCC_FEWER_ARGS + BatchMVCCState *batchmvcc, +#else + HeapTupleData *tuples, + bool *visible, +#endif + OffsetNumber *vistuples_dense); + + + /* * To avoid leaking too much knowledge about reorderbuffer implementation * details this is implemented in reorderbuffer.c not heapam_visibility.c diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index 125d3eb5fff5..04f1e49b0e74 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -361,3 +361,27 @@ * Enable tracing of syncscan operations (see also the trace_syncscan GUC var). */ /* #define TRACE_SYNCSCAN */ + + +/* + * On some operating systems we know how to change whether pages are + * readable/writeable. We can use that to verify that we are following buffer + * locking rules, we can make pages inaccessible or read-only when we don't + * have sufficient locks etc. This obviously is fairly expensive, so by + * default we only so in assert enabled builds. + */ +#if defined(USE_ASSERT_CHECKING) && !defined(WIN32) +#define ENFORCE_BUFFER_PROT +#endif + +/* + * Protecting pages against being modified without an exclusive lock / + * BufferPrepareToSetHintBits() is reasonably cheap, neither happens *that* + * often. Pinning/unpinning buffers is a lot more common, making it more + * expensive to call mprotect() that often. + * + * Therefore disable this by default, even in assert enabled builds. + */ +#ifdef ENFORCE_BUFFER_PROT +/* #define ENFORCE_BUFFER_PROT_READ */ +#endif diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 72b36a4af26f..50f7c7c29a02 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -33,8 +33,8 @@ * Buffer state is a single 32-bit variable where following data is combined. * * - 18 bits refcount - * - 4 bits usage count - * - 10 bits of flags + * - 3 bits usage count + * - 11 bits of flags * * Combining these values allows to perform some operations without locking * the buffer header, by modifying them together with a CAS loop. @@ -42,8 +42,8 @@ * The definition of buffer state components is below. */ #define BUF_REFCOUNT_BITS 18 -#define BUF_USAGECOUNT_BITS 4 -#define BUF_FLAG_BITS 10 +#define BUF_USAGECOUNT_BITS 3 +#define BUF_FLAG_BITS 11 StaticAssertDecl(BUF_REFCOUNT_BITS + BUF_USAGECOUNT_BITS + BUF_FLAG_BITS == 32, "parts of buffer state space need to equal 32"); @@ -65,6 +65,7 @@ StaticAssertDecl(BUF_REFCOUNT_BITS + BUF_USAGECOUNT_BITS + BUF_FLAG_BITS == 32, * Note: BM_TAG_VALID essentially means that there is a buffer hashtable * entry associated with the buffer's tag. */ +#define BM_SETTING_HINTS (1U << 21) /* hint bits are being set */ #define BM_LOCKED (1U << 22) /* buffer header is locked */ #define BM_DIRTY (1U << 23) /* data needs writing */ #define BM_VALID (1U << 24) /* data is valid */ @@ -371,7 +372,38 @@ BufferDescriptorGetContentLock(const BufferDesc *bdesc) * Functions for acquiring/releasing a shared buffer header's spinlock. Do * not apply these to local buffers! */ -extern uint32 LockBufHdr(BufferDesc *desc); +extern uint32 LockBufHdrSlow(BufferDesc *desc); + +/* + * Lock buffer header - set BM_LOCKED in buffer state. 
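+ * Returns the buffer state at the time the lock was acquired, with BM_LOCKED
+ * included in the returned value.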
+ * + * Implemented as a static inline in the header due to freelist.c using it in + * a reasonably common path. + */ +static inline uint32 +LockBufHdr(BufferDesc *desc) +{ + uint32 old_buf_state; + + Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc))); + + /* fast path, intended to be inlined into calling functions*/ + + /* set BM_LOCKED flag */ + old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED); + if (likely(!(old_buf_state & BM_LOCKED))) + return old_buf_state | BM_LOCKED; + + /* + * The overhead of the spin delay code and the retry loop itself would be + * noticeable due to the buffer header lock being taken very + * frequently. Therefore it is moved into a separate function marked + * pg_noinline. + */ + + /* slow path with loop etc, not intended to be inlined */ + return LockBufHdrSlow(desc); +} static inline void UnlockBufHdr(BufferDesc *desc, uint32 buf_state) @@ -402,6 +434,7 @@ extern PGDLLIMPORT CkptSortItem *CkptBufferIds; /* ResourceOwner callbacks to hold buffer I/Os and pins */ extern PGDLLIMPORT const ResourceOwnerDesc buffer_io_resowner_desc; extern PGDLLIMPORT const ResourceOwnerDesc buffer_pin_resowner_desc; +extern PGDLLIMPORT const ResourceOwnerDesc buffer_setting_hints_resowner_desc; /* Convenience wrappers over ResourceOwnerRemember/Forget */ static inline void @@ -424,6 +457,18 @@ ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer) { ResourceOwnerForget(owner, Int32GetDatum(buffer), &buffer_io_resowner_desc); } +static inline void +ResourceOwnerRememberBufferSettingHints(ResourceOwner owner, Buffer buffer) +{ + ResourceOwnerRemember(owner, Int32GetDatum(buffer), + &buffer_setting_hints_resowner_desc); +} +static inline void +ResourceOwnerForgetBufferSettingHints(ResourceOwner owner, Buffer buffer) +{ + ResourceOwnerForget(owner, Int32GetDatum(buffer), + &buffer_setting_hints_resowner_desc); +} /* * Internal buffer management routines @@ -449,6 +494,15 @@ extern Size StrategyShmemSize(void); extern void StrategyInitialize(bool init); extern bool have_free_buffer(void); +#ifdef ENFORCE_BUFFER_PROT +extern void SetBufferProtection(Buffer buf, bool allow_reads, bool allow_writes); +#else +static inline void +SetBufferProtection(Buffer buf, bool allow_reads, bool allow_writes) +{ +} +#endif + /* buf_table.c */ extern Size BufTableShmemSize(int size); extern void InitBufTable(int size); diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index f2192ceb2719..1a5d9544f0d5 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -294,6 +294,7 @@ extern void LockBufferForCleanup(Buffer buffer); extern bool ConditionalLockBufferForCleanup(Buffer buffer); extern bool IsBufferCleanupOK(Buffer buffer); extern bool HoldingBufferPinThatDelaysRecovery(void); +extern bool BufferLockHeldByMe(Buffer buffer, int mode); extern bool BgBufferSync(struct WritebackContext *wb_context); @@ -306,6 +307,12 @@ extern void LimitAdditionalLocalPins(uint32 *additional_pins); extern bool EvictUnpinnedBuffer(Buffer buf); +extern bool BufferPrepareToSetHintBits(Buffer buffer); +extern void BufferFinishSetHintBits(Buffer buffer, bool mark_dirty, bool buffer_std); +extern void BufferSetHintBits16(Buffer buffer, + uint16 *ptr, uint16 val); + + /* in buf_init.c */ extern void BufferManagerShmemInit(void); extern Size BufferManagerShmemSize(void); diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index aeb67c498c59..c5126dda7e15 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h 
@@ -505,7 +505,6 @@ extern void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems); extern void PageIndexTupleDeleteNoCompact(Page page, OffsetNumber offnum); extern bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, Item newtup, Size newsize); -extern char *PageSetChecksumCopy(Page page, BlockNumber blkno); -extern void PageSetChecksumInplace(Page page, BlockNumber blkno); +extern void PageSetChecksum(Page page, BlockNumber blkno); #endif /* BUFPAGE_H */ diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h index b99ebc9e86f5..5d0c56841d4a 100644 --- a/src/include/storage/pg_shmem.h +++ b/src/include/storage/pg_shmem.h @@ -63,6 +63,8 @@ typedef enum SHMEM_TYPE_MMAP, } PGShmemType; +extern PGDLLIMPORT int huge_pages_status; + #ifndef WIN32 extern PGDLLIMPORT unsigned long UsedShmemSegID; #else diff --git a/src/test/isolation/expected/index-killtuples.out b/src/test/isolation/expected/index-killtuples.out new file mode 100644 index 000000000000..af46a3c8ead2 --- /dev/null +++ b/src/test/isolation/expected/index-killtuples.out @@ -0,0 +1,337 @@ +Parsed test spec with 1 sessions + +starting permutation: create_table fill_500 create_btree flush disable_seq disable_bitmap measure access flush result measure access flush result delete flush measure access flush result measure access flush result drop_table +step create_table: CREATE TEMPORARY TABLE kill_prior_tuple(key int not null, cat text not null); +step fill_500: INSERT INTO kill_prior_tuple(key, cat) SELECT g.i, 'a' FROM generate_series(1, 500) g(i); +step create_btree: CREATE INDEX kill_prior_tuple_btree ON kill_prior_tuple USING btree (key); +step flush: SELECT FROM pg_stat_force_next_flush(); +step disable_seq: SET enable_seqscan = false; +step disable_bitmap: SET enable_bitmapscan = false; +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); +step access: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM kill_prior_tuple WHERE key = 1; +QUERY PLAN +----------------------------------------------------------------------------------- +Index Scan using kill_prior_tuple_btree on kill_prior_tuple (actual rows=1 loops=1) + Index Cond: (key = 1) +(2 rows) + +step flush: SELECT FROM pg_stat_force_next_flush(); +step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +new_heap_accesses +----------------- + 1 +(1 row) + +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); +step access: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM kill_prior_tuple WHERE key = 1; +QUERY PLAN +----------------------------------------------------------------------------------- +Index Scan using kill_prior_tuple_btree on kill_prior_tuple (actual rows=1 loops=1) + Index Cond: (key = 1) +(2 rows) + +step flush: SELECT FROM pg_stat_force_next_flush(); +step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +new_heap_accesses +----------------- + 1 +(1 row) + +step delete: DELETE FROM kill_prior_tuple; +step flush: SELECT FROM pg_stat_force_next_flush(); +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables 
WHERE relname = 'kill_prior_tuple'); +step access: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM kill_prior_tuple WHERE key = 1; +QUERY PLAN +----------------------------------------------------------------------------------- +Index Scan using kill_prior_tuple_btree on kill_prior_tuple (actual rows=0 loops=1) + Index Cond: (key = 1) +(2 rows) + +step flush: SELECT FROM pg_stat_force_next_flush(); +step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +new_heap_accesses +----------------- + 1 +(1 row) + +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); +step access: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM kill_prior_tuple WHERE key = 1; +QUERY PLAN +----------------------------------------------------------------------------------- +Index Scan using kill_prior_tuple_btree on kill_prior_tuple (actual rows=0 loops=1) + Index Cond: (key = 1) +(2 rows) + +step flush: SELECT FROM pg_stat_force_next_flush(); +step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +new_heap_accesses +----------------- + 0 +(1 row) + +step drop_table: DROP TABLE IF EXISTS kill_prior_tuple; + +starting permutation: create_table fill_500 create_ext_btree_gist create_gist flush disable_seq disable_bitmap measure access flush result measure access flush result delete flush measure access flush result measure access flush result drop_table drop_ext_btree_gist +step create_table: CREATE TEMPORARY TABLE kill_prior_tuple(key int not null, cat text not null); +step fill_500: INSERT INTO kill_prior_tuple(key, cat) SELECT g.i, 'a' FROM generate_series(1, 500) g(i); +step create_ext_btree_gist: CREATE EXTENSION btree_gist; +step create_gist: CREATE INDEX kill_prior_tuple_gist ON kill_prior_tuple USING gist (key); +step flush: SELECT FROM pg_stat_force_next_flush(); +step disable_seq: SET enable_seqscan = false; +step disable_bitmap: SET enable_bitmapscan = false; +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); +step access: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM kill_prior_tuple WHERE key = 1; +QUERY PLAN +---------------------------------------------------------------------------------- +Index Scan using kill_prior_tuple_gist on kill_prior_tuple (actual rows=1 loops=1) + Index Cond: (key = 1) +(2 rows) + +step flush: SELECT FROM pg_stat_force_next_flush(); +step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +new_heap_accesses +----------------- + 1 +(1 row) + +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); +step access: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM kill_prior_tuple WHERE key = 1; +QUERY PLAN +---------------------------------------------------------------------------------- +Index Scan using kill_prior_tuple_gist on kill_prior_tuple (actual rows=1 loops=1) + Index Cond: (key = 1) +(2 rows) + +step flush: SELECT FROM pg_stat_force_next_flush(); +step result: SELECT 
heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +new_heap_accesses +----------------- + 1 +(1 row) + +step delete: DELETE FROM kill_prior_tuple; +step flush: SELECT FROM pg_stat_force_next_flush(); +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); +step access: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM kill_prior_tuple WHERE key = 1; +QUERY PLAN +---------------------------------------------------------------------------------- +Index Scan using kill_prior_tuple_gist on kill_prior_tuple (actual rows=0 loops=1) + Index Cond: (key = 1) +(2 rows) + +step flush: SELECT FROM pg_stat_force_next_flush(); +step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +new_heap_accesses +----------------- + 1 +(1 row) + +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); +step access: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM kill_prior_tuple WHERE key = 1; +QUERY PLAN +---------------------------------------------------------------------------------- +Index Scan using kill_prior_tuple_gist on kill_prior_tuple (actual rows=0 loops=1) + Index Cond: (key = 1) +(2 rows) + +step flush: SELECT FROM pg_stat_force_next_flush(); +step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +new_heap_accesses +----------------- + 0 +(1 row) + +step drop_table: DROP TABLE IF EXISTS kill_prior_tuple; +step drop_ext_btree_gist: DROP EXTENSION btree_gist; + +starting permutation: create_table fill_10 create_ext_btree_gist create_gist flush disable_seq disable_bitmap measure access flush result measure access flush result delete flush measure access flush result measure access flush result drop_table drop_ext_btree_gist +step create_table: CREATE TEMPORARY TABLE kill_prior_tuple(key int not null, cat text not null); +step fill_10: INSERT INTO kill_prior_tuple(key, cat) SELECT g.i, 'a' FROM generate_series(1, 10) g(i); +step create_ext_btree_gist: CREATE EXTENSION btree_gist; +step create_gist: CREATE INDEX kill_prior_tuple_gist ON kill_prior_tuple USING gist (key); +step flush: SELECT FROM pg_stat_force_next_flush(); +step disable_seq: SET enable_seqscan = false; +step disable_bitmap: SET enable_bitmapscan = false; +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); +step access: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM kill_prior_tuple WHERE key = 1; +QUERY PLAN +---------------------------------------------------------------------------------- +Index Scan using kill_prior_tuple_gist on kill_prior_tuple (actual rows=1 loops=1) + Index Cond: (key = 1) +(2 rows) + +step flush: SELECT FROM pg_stat_force_next_flush(); +step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +new_heap_accesses +----------------- + 1 +(1 row) + +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM 
pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); +step access: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM kill_prior_tuple WHERE key = 1; +QUERY PLAN +---------------------------------------------------------------------------------- +Index Scan using kill_prior_tuple_gist on kill_prior_tuple (actual rows=1 loops=1) + Index Cond: (key = 1) +(2 rows) + +step flush: SELECT FROM pg_stat_force_next_flush(); +step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +new_heap_accesses +----------------- + 1 +(1 row) + +step delete: DELETE FROM kill_prior_tuple; +step flush: SELECT FROM pg_stat_force_next_flush(); +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); +step access: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM kill_prior_tuple WHERE key = 1; +QUERY PLAN +---------------------------------------------------------------------------------- +Index Scan using kill_prior_tuple_gist on kill_prior_tuple (actual rows=0 loops=1) + Index Cond: (key = 1) +(2 rows) + +step flush: SELECT FROM pg_stat_force_next_flush(); +step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +new_heap_accesses +----------------- + 1 +(1 row) + +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); +step access: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM kill_prior_tuple WHERE key = 1; +QUERY PLAN +---------------------------------------------------------------------------------- +Index Scan using kill_prior_tuple_gist on kill_prior_tuple (actual rows=0 loops=1) + Index Cond: (key = 1) +(2 rows) + +step flush: SELECT FROM pg_stat_force_next_flush(); +step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +new_heap_accesses +----------------- + 1 +(1 row) + +step drop_table: DROP TABLE IF EXISTS kill_prior_tuple; +step drop_ext_btree_gist: DROP EXTENSION btree_gist; + +starting permutation: create_table fill_500 create_hash flush disable_seq disable_bitmap measure access flush result measure access flush result delete flush measure access flush result measure access flush result drop_table +step create_table: CREATE TEMPORARY TABLE kill_prior_tuple(key int not null, cat text not null); +step fill_500: INSERT INTO kill_prior_tuple(key, cat) SELECT g.i, 'a' FROM generate_series(1, 500) g(i); +step create_hash: CREATE INDEX kill_prior_tuple_hash ON kill_prior_tuple USING hash (key); +step flush: SELECT FROM pg_stat_force_next_flush(); +step disable_seq: SET enable_seqscan = false; +step disable_bitmap: SET enable_bitmapscan = false; +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); +step access: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM kill_prior_tuple WHERE key = 1; +QUERY PLAN +---------------------------------------------------------------------------------- +Index Scan using kill_prior_tuple_hash on kill_prior_tuple (actual rows=1 loops=1) + Index Cond: (key = 1) +(2 rows) + +step flush: 
SELECT FROM pg_stat_force_next_flush(); +step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +new_heap_accesses +----------------- + 1 +(1 row) + +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); +step access: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM kill_prior_tuple WHERE key = 1; +QUERY PLAN +---------------------------------------------------------------------------------- +Index Scan using kill_prior_tuple_hash on kill_prior_tuple (actual rows=1 loops=1) + Index Cond: (key = 1) +(2 rows) + +step flush: SELECT FROM pg_stat_force_next_flush(); +step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +new_heap_accesses +----------------- + 1 +(1 row) + +step delete: DELETE FROM kill_prior_tuple; +step flush: SELECT FROM pg_stat_force_next_flush(); +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); +step access: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM kill_prior_tuple WHERE key = 1; +QUERY PLAN +---------------------------------------------------------------------------------- +Index Scan using kill_prior_tuple_hash on kill_prior_tuple (actual rows=0 loops=1) + Index Cond: (key = 1) +(2 rows) + +step flush: SELECT FROM pg_stat_force_next_flush(); +step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +new_heap_accesses +----------------- + 1 +(1 row) + +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); +step access: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM kill_prior_tuple WHERE key = 1; +QUERY PLAN +---------------------------------------------------------------------------------- +Index Scan using kill_prior_tuple_hash on kill_prior_tuple (actual rows=0 loops=1) + Index Cond: (key = 1) +(2 rows) + +step flush: SELECT FROM pg_stat_force_next_flush(); +step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +new_heap_accesses +----------------- + 0 +(1 row) + +step drop_table: DROP TABLE IF EXISTS kill_prior_tuple; + +starting permutation: create_table fill_500 create_ext_btree_gin create_gin flush disable_seq delete flush measure access flush result measure access flush result drop_table drop_ext_btree_gin +step create_table: CREATE TEMPORARY TABLE kill_prior_tuple(key int not null, cat text not null); +step fill_500: INSERT INTO kill_prior_tuple(key, cat) SELECT g.i, 'a' FROM generate_series(1, 500) g(i); +step create_ext_btree_gin: CREATE EXTENSION btree_gin; +step create_gin: CREATE INDEX kill_prior_tuple_gin ON kill_prior_tuple USING gin (key); +step flush: SELECT FROM pg_stat_force_next_flush(); +step disable_seq: SET enable_seqscan = false; +step delete: DELETE FROM kill_prior_tuple; +step flush: SELECT FROM pg_stat_force_next_flush(); +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname 
= 'kill_prior_tuple'); +step access: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM kill_prior_tuple WHERE key = 1; +QUERY PLAN +----------------------------------------------------------------------- +Bitmap Heap Scan on kill_prior_tuple (actual rows=0 loops=1) + Recheck Cond: (key = 1) + Heap Blocks: exact=1 + -> Bitmap Index Scan on kill_prior_tuple_gin (actual rows=1 loops=1) + Index Cond: (key = 1) +(5 rows) + +step flush: SELECT FROM pg_stat_force_next_flush(); +step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +new_heap_accesses +----------------- + 1 +(1 row) + +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); +step access: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM kill_prior_tuple WHERE key = 1; +QUERY PLAN +----------------------------------------------------------------------- +Bitmap Heap Scan on kill_prior_tuple (actual rows=0 loops=1) + Recheck Cond: (key = 1) + Heap Blocks: exact=1 + -> Bitmap Index Scan on kill_prior_tuple_gin (actual rows=1 loops=1) + Index Cond: (key = 1) +(5 rows) + +step flush: SELECT FROM pg_stat_force_next_flush(); +step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +new_heap_accesses +----------------- + 1 +(1 row) + +step drop_table: DROP TABLE IF EXISTS kill_prior_tuple; +step drop_ext_btree_gin: DROP EXTENSION btree_gin; diff --git a/src/test/isolation/specs/index-killtuples.spec b/src/test/isolation/specs/index-killtuples.spec new file mode 100644 index 000000000000..3896c935290d --- /dev/null +++ b/src/test/isolation/specs/index-killtuples.spec @@ -0,0 +1,127 @@ +# Basic testing of killtuples / kill_prior_tuples / all_dead handling +# for various index AMs +# +# This tests just enough to ensure that the kill* routines are actually +# executed and do something approximately reasonable. It's *not* sufficient +# testing for adding killitems support to a new AM! +# +# This doesn't really need to be an isolation test, it could be written as a +# regular regression test. However, writing it as an isolation test ends up a +# *lot* less verbose.
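+#
+# The counter table records heap_blks_read + heap_blks_hit for the test
+# table; each "measure ... access ... result" sequence therefore reports how
+# many new heap block accesses a single index lookup caused. Once killed
+# index entries take effect, that number should drop to 0.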
+ +setup +{ + CREATE TABLE counter(heap_accesses int); + INSERT INTO counter(heap_accesses) VALUES (0); +} + +teardown +{ + DROP TABLE counter; +} + +session s1 +# to ensure GUCs are reset +setup { RESET ALL; } + +step disable_seq { SET enable_seqscan = false; } + +step disable_bitmap { SET enable_bitmapscan = false; } + +# use a temporary table to make sure no other session can interfere with +# visibility determinations +step create_table { CREATE TEMPORARY TABLE kill_prior_tuple(key int not null, cat text not null); } + +step fill_10 { INSERT INTO kill_prior_tuple(key, cat) SELECT g.i, 'a' FROM generate_series(1, 10) g(i); } + +step fill_500 { INSERT INTO kill_prior_tuple(key, cat) SELECT g.i, 'a' FROM generate_series(1, 500) g(i); } + +# column-less select to make output easier to read +step flush { SELECT FROM pg_stat_force_next_flush(); } + +step measure { UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); } + +step result { SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; } + +step access { EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM kill_prior_tuple WHERE key = 1; } + +step delete { DELETE FROM kill_prior_tuple; } + +step drop_table { DROP TABLE IF EXISTS kill_prior_tuple; } + +### steps for testing btree indexes ### +step create_btree { CREATE INDEX kill_prior_tuple_btree ON kill_prior_tuple USING btree (key); } + +### steps for testing gist indexes ### +# Creating the extensions takes time, so we don't want to do so when testing +# other AMs +step create_ext_btree_gist { CREATE EXTENSION btree_gist; } +step drop_ext_btree_gist { DROP EXTENSION btree_gist; } +step create_gist { CREATE INDEX kill_prior_tuple_gist ON kill_prior_tuple USING gist (key); } + +### steps for testing gin indexes ### +# See create_ext_btree_gist +step create_ext_btree_gin { CREATE EXTENSION btree_gin; } +step drop_ext_btree_gin { DROP EXTENSION btree_gin; } +step create_gin { CREATE INDEX kill_prior_tuple_gin ON kill_prior_tuple USING gin (key); } + +### steps for testing hash indexes ### +step create_hash { CREATE INDEX kill_prior_tuple_hash ON kill_prior_tuple USING hash (key); } + + +# test killtuples with btree index +permutation + create_table fill_500 create_btree flush + disable_seq disable_bitmap + # show that each access to a non-deleted tuple increments heap_blks_* + measure access flush result + measure access flush result + delete flush + # the first access after the tuples were deleted still needs to access the heap + measure access flush result + # but after kill_prior_tuple did its thing, we shouldn't access the heap anymore + measure access flush result + drop_table + +# Same as first permutation, except testing gist +permutation + create_table fill_500 create_ext_btree_gist create_gist flush + disable_seq disable_bitmap + measure access flush result + measure access flush result + delete flush + measure access flush result + measure access flush result + drop_table drop_ext_btree_gist + +# Test gist, but with fewer rows - shows that killitems doesn't work anymore!
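+# (in the expected output the final access still incurs a heap block access,
+# unlike in the 500 row case above)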
+permutation + create_table fill_10 create_ext_btree_gist create_gist flush + disable_seq disable_bitmap + measure access flush result + measure access flush result + delete flush + measure access flush result + measure access flush result + drop_table drop_ext_btree_gist + +# Same as first permutation, except testing hash +permutation + create_table fill_500 create_hash flush + disable_seq disable_bitmap + measure access flush result + measure access flush result + delete flush + measure access flush result + measure access flush result + drop_table + +# Similar to first permutation, except that gin does not have killtuples support +permutation + create_table fill_500 create_ext_btree_gin create_gin flush + disable_seq + delete flush + measure access flush result + # will still fetch from heap + measure access flush result + drop_table drop_ext_btree_gin diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index b66cecd87991..d4799e0baf47 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -250,6 +250,7 @@ Barrier BaseBackupCmd BaseBackupTargetHandle BaseBackupTargetType +BatchMVCCState BeginDirectModify_function BeginForeignInsert_function BeginForeignModify_function @@ -2678,6 +2679,7 @@ SetConstraintStateData SetConstraintTriggerData SetExprState SetFunctionReturnMode +SetHintBitsState SetOp SetOpCmd SetOpPath