diff --git a/contrib/amcheck/verify_heapam.c b/contrib/amcheck/verify_heapam.c index 8a8e36dde7e4..827312306f60 100644 --- a/contrib/amcheck/verify_heapam.c +++ b/contrib/amcheck/verify_heapam.c @@ -1877,7 +1877,9 @@ check_tuple(HeapCheckContext *ctx, bool *xmin_commit_status_ok, /* * Convert a TransactionId into a FullTransactionId using our cached values of * the valid transaction ID range. It is the caller's responsibility to have - * already updated the cached values, if necessary. + * already updated the cached values, if necessary. This is akin to + * FullTransactionIdFromAllowableAt(), but it tolerates corruption in the form + * of an xid before epoch 0. */ static FullTransactionId FullTransactionIdFromXidAndCtx(TransactionId xid, const HeapCheckContext *ctx) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index d0515a180d68..ea0a12b39af7 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -49,8 +49,10 @@ #include "storage/predicate.h" #include "storage/procarray.h" #include "utils/datum.h" +#include "utils/injection_point.h" #include "utils/inval.h" #include "utils/spccache.h" +#include "utils/syscache.h" static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, @@ -3254,6 +3256,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, interesting_attrs = bms_add_members(interesting_attrs, id_attrs); block = ItemPointerGetBlockNumber(otid); + INJECTION_POINT("heap_update-before-pin"); buffer = ReadBuffer(relation, block); page = BufferGetPage(buffer); @@ -3269,7 +3272,51 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid)); - Assert(ItemIdIsNormal(lp)); + + /* + * Usually, a buffer pin and/or snapshot blocks pruning of otid, ensuring + * we see LP_NORMAL here. When the otid origin is a syscache, we may have + * neither a pin nor a snapshot. Hence, we may see other LP_ states, each + * of which indicates concurrent pruning. + * + * Failing with TM_Updated would be most accurate. However, unlike other + * TM_Updated scenarios, we don't know the successor ctid in LP_UNUSED and + * LP_DEAD cases. While the distinction between TM_Updated and TM_Deleted + * does matter to SQL statements UPDATE and MERGE, those SQL statements + * hold a snapshot that ensures LP_NORMAL. Hence, the choice between + * TM_Updated and TM_Deleted affects only the wording of error messages. + * Settle on TM_Deleted, for two reasons. First, it avoids complicating + * the specification of when tmfd->ctid is valid. Second, it creates + * error log evidence that we took this branch. + * + * Since it's possible to see LP_UNUSED at otid, it's also possible to see + * LP_NORMAL for a tuple that replaced LP_UNUSED. If it's a tuple for an + * unrelated row, we'll fail with "duplicate key value violates unique". + * XXX if otid is the live, newer version of the newtup row, we'll discard + * changes originating in versions of this catalog row after the version + * the caller got from syscache. See syscache-update-pruned.spec. + */ + if (!ItemIdIsNormal(lp)) + { + Assert(RelationSupportsSysCache(RelationGetRelid(relation))); + + UnlockReleaseBuffer(buffer); + Assert(!have_tuple_lock); + if (vmbuffer != InvalidBuffer) + ReleaseBuffer(vmbuffer); + tmfd->ctid = *otid; + tmfd->xmax = InvalidTransactionId; + tmfd->cmax = InvalidCommandId; + *update_indexes = TU_None; + + bms_free(hot_attrs); + bms_free(sum_attrs); + bms_free(key_attrs); + bms_free(id_attrs); + /* modified_attrs not yet initialized */ + bms_free(interesting_attrs); + return TM_Deleted; + } /* * Fill in enough data in oldtup for HeapDetermineColumnsInfo to work diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index ab2f4a8a92fd..73a80559194e 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -929,32 +929,16 @@ TwoPhaseGetDummyProc(TransactionId xid, bool lock_held) /* * Compute the FullTransactionId for the given TransactionId. * - * The wrap logic is safe here because the span of active xids cannot exceed one - * epoch at any given time. + * This is safe if the xid has not yet reached COMMIT PREPARED or ROLLBACK + * PREPARED. After those commands, concurrent vac_truncate_clog() may make + * the xid cease to qualify as allowable. XXX Not all callers limit their + * calls accordingly. */ static inline FullTransactionId AdjustToFullTransactionId(TransactionId xid) { - FullTransactionId nextFullXid; - TransactionId nextXid; - uint32 epoch; - Assert(TransactionIdIsValid(xid)); - - LWLockAcquire(XidGenLock, LW_SHARED); - nextFullXid = TransamVariables->nextXid; - LWLockRelease(XidGenLock); - - nextXid = XidFromFullTransactionId(nextFullXid); - epoch = EpochFromFullTransactionId(nextFullXid); - if (unlikely(xid > nextXid)) - { - /* Wraparound occurred, must be from a prev epoch. */ - Assert(epoch > 0); - epoch--; - } - - return FullTransactionIdFromEpochAndXid(epoch, xid); + return FullTransactionIdFromAllowableAt(ReadNextFullTransactionId(), xid); } static inline int diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 3596af06172e..91b6a91767d6 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -2166,28 +2166,14 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) FullTransactionId XLogRecGetFullXid(XLogReaderState *record) { - TransactionId xid, - next_xid; - uint32 epoch; - /* * This function is only safe during replay, because it depends on the * replay state. See AdvanceNextFullTransactionIdPastXid() for more. */ Assert(AmStartupProcess() || !IsUnderPostmaster); - xid = XLogRecGetXid(record); - next_xid = XidFromFullTransactionId(TransamVariables->nextXid); - epoch = EpochFromFullTransactionId(TransamVariables->nextXid); - - /* - * If xid is numerically greater than next_xid, it has to be from the last - * epoch. - */ - if (unlikely(xid > next_xid)) - --epoch; - - return FullTransactionIdFromEpochAndXid(epoch, xid); + return FullTransactionIdFromAllowableAt(TransamVariables->nextXid, + XLogRecGetXid(record)); } #endif diff --git a/src/backend/utils/adt/xid8funcs.c b/src/backend/utils/adt/xid8funcs.c index 4736755b2985..20b28b2528c8 100644 --- a/src/backend/utils/adt/xid8funcs.c +++ b/src/backend/utils/adt/xid8funcs.c @@ -97,15 +97,11 @@ static bool TransactionIdInRecentPast(FullTransactionId fxid, TransactionId *extracted_xid) { TransactionId xid = XidFromFullTransactionId(fxid); - uint32 now_epoch; - TransactionId now_epoch_next_xid; FullTransactionId now_fullxid; - TransactionId oldest_xid; - FullTransactionId oldest_fxid; + TransactionId oldest_clog_xid; + FullTransactionId oldest_clog_fxid; now_fullxid = ReadNextFullTransactionId(); - now_epoch_next_xid = XidFromFullTransactionId(now_fullxid); - now_epoch = EpochFromFullTransactionId(now_fullxid); if (extracted_xid != NULL) *extracted_xid = xid; @@ -135,52 +131,19 @@ TransactionIdInRecentPast(FullTransactionId fxid, TransactionId *extracted_xid) /* * If fxid is not older than TransamVariables->oldestClogXid, the relevant - * CLOG entry is guaranteed to still exist. Convert - * TransamVariables->oldestClogXid into a FullTransactionId to compare it - * with fxid. Determine the right epoch knowing that oldest_fxid - * shouldn't be more than 2^31 older than now_fullxid. - */ - oldest_xid = TransamVariables->oldestClogXid; - Assert(TransactionIdPrecedesOrEquals(oldest_xid, now_epoch_next_xid)); - if (oldest_xid <= now_epoch_next_xid) - { - oldest_fxid = FullTransactionIdFromEpochAndXid(now_epoch, oldest_xid); - } - else - { - Assert(now_epoch > 0); - oldest_fxid = FullTransactionIdFromEpochAndXid(now_epoch - 1, oldest_xid); - } - return !FullTransactionIdPrecedes(fxid, oldest_fxid); -} - -/* - * Convert a TransactionId obtained from a snapshot held by the caller to a - * FullTransactionId. Use next_fxid as a reference FullTransactionId, so that - * we can compute the high order bits. It must have been obtained by the - * caller with ReadNextFullTransactionId() after the snapshot was created. - */ -static FullTransactionId -widen_snapshot_xid(TransactionId xid, FullTransactionId next_fxid) -{ - TransactionId next_xid = XidFromFullTransactionId(next_fxid); - uint32 epoch = EpochFromFullTransactionId(next_fxid); - - /* Special transaction ID. */ - if (!TransactionIdIsNormal(xid)) - return FullTransactionIdFromEpochAndXid(0, xid); - - /* - * The 64 bit result must be <= next_fxid, since next_fxid hadn't been - * issued yet when the snapshot was created. Every TransactionId in the - * snapshot must therefore be from the same epoch as next_fxid, or the - * epoch before. We know this because next_fxid is never allow to get - * more than one epoch ahead of the TransactionIds in any snapshot. + * CLOG entry is guaranteed to still exist. + * + * TransamVariables->oldestXid governs allowable XIDs. Usually, + * oldestClogXid==oldestXid. It's also possible for oldestClogXid to + * follow oldestXid, in which case oldestXid might advance after our + * ReadNextFullTransactionId() call. If oldestXid has advanced, that + * advancement reinstated the usual oldestClogXid==oldestXid. Whether or + * not that happened, oldestClogXid is allowable relative to now_fullxid. */ - if (xid > next_xid) - epoch--; - - return FullTransactionIdFromEpochAndXid(epoch, xid); + oldest_clog_xid = TransamVariables->oldestClogXid; + oldest_clog_fxid = + FullTransactionIdFromAllowableAt(now_fullxid, oldest_clog_xid); + return !FullTransactionIdPrecedes(fxid, oldest_clog_fxid); } /* @@ -420,12 +383,18 @@ pg_current_snapshot(PG_FUNCTION_ARGS) nxip = cur->xcnt; snap = palloc(PG_SNAPSHOT_SIZE(nxip)); - /* fill */ - snap->xmin = widen_snapshot_xid(cur->xmin, next_fxid); - snap->xmax = widen_snapshot_xid(cur->xmax, next_fxid); + /* + * Fill. This is the current backend's active snapshot, so MyProc->xmin + * is <= all these XIDs. As long as that remains so, oldestXid can't + * advance past any of these XIDs. Hence, these XIDs remain allowable + * relative to next_fxid. + */ + snap->xmin = FullTransactionIdFromAllowableAt(next_fxid, cur->xmin); + snap->xmax = FullTransactionIdFromAllowableAt(next_fxid, cur->xmax); snap->nxip = nxip; for (i = 0; i < nxip; i++) - snap->xip[i] = widen_snapshot_xid(cur->xip[i], next_fxid); + snap->xip[i] = + FullTransactionIdFromAllowableAt(next_fxid, cur->xip[i]); /* * We want them guaranteed to be in ascending order. This also removes diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c index f41d314eae3f..32cf28bb8bc7 100644 --- a/src/backend/utils/cache/inval.c +++ b/src/backend/utils/cache/inval.c @@ -123,6 +123,7 @@ #include "storage/sinval.h" #include "storage/smgr.h" #include "utils/catcache.h" +#include "utils/injection_point.h" #include "utils/inval.h" #include "utils/memdebug.h" #include "utils/memutils.h" @@ -1134,6 +1135,8 @@ AtEOXact_Inval(bool isCommit) /* Must be at top of stack */ Assert(transInvalInfo->my_level == 1 && transInvalInfo->parent == NULL); + INJECTION_POINT("AtEOXact_Inval-with-transInvalInfo"); + if (isCommit) { /* diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 09b9b394e0e2..131c050c15f1 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -136,7 +136,8 @@ typedef enum TU_UpdateIndexes * * xmax is the outdating transaction's XID. If the caller wants to visit the * replacement tuple, it must check that this matches before believing the - * replacement is really a match. + * replacement is really a match. This is InvalidTransactionId if the target + * was !LP_NORMAL (expected only for a TID retrieved from syscache). * * cmax is the outdating command's CID, but only when the failure code is * TM_SelfModified (i.e., something in the current transaction outdated the diff --git a/src/include/access/transam.h b/src/include/access/transam.h index 0cab8653f1b6..7d82cd2eb562 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -370,6 +370,49 @@ FullTransactionIdNewer(FullTransactionId a, FullTransactionId b) return b; } +/* + * Compute FullTransactionId for the given TransactionId, assuming xid was + * between [oldestXid, nextXid] at the time when TransamVariables->nextXid was + * nextFullXid. When adding calls, evaluate what prevents xid from preceding + * oldestXid if SetTransactionIdLimit() runs between the collection of xid and + * the collection of nextFullXid. + */ +static inline FullTransactionId +FullTransactionIdFromAllowableAt(FullTransactionId nextFullXid, + TransactionId xid) +{ + uint32 epoch; + + /* Special transaction ID. */ + if (!TransactionIdIsNormal(xid)) + return FullTransactionIdFromEpochAndXid(0, xid); + + Assert(TransactionIdPrecedesOrEquals(xid, + XidFromFullTransactionId(nextFullXid))); + + /* + * The 64 bit result must be <= nextFullXid, since nextFullXid hadn't been + * issued yet when xid was in the past. The xid must therefore be from + * the epoch of nextFullXid or the epoch before. We know this because we + * must remove (by freezing) an XID before assigning the XID half an epoch + * ahead of it. + * + * The unlikely() branch hint is dubious. It's perfect for the first 2^32 + * XIDs of a cluster's life. Right at 2^32 XIDs, misprediction shoots to + * 100%, then improves until perfection returns 2^31 XIDs later. Since + * current callers pass relatively-recent XIDs, expect >90% prediction + * accuracy overall. This favors average latency over tail latency. + */ + epoch = EpochFromFullTransactionId(nextFullXid); + if (unlikely(xid > XidFromFullTransactionId(nextFullXid))) + { + Assert(epoch != 0); + epoch--; + } + + return FullTransactionIdFromEpochAndXid(epoch, xid); +} + #endif /* FRONTEND */ #endif /* TRANSAM_H */ diff --git a/src/test/modules/injection_points/Makefile b/src/test/modules/injection_points/Makefile index 0753a9df58c8..4f0161fd33a2 100644 --- a/src/test/modules/injection_points/Makefile +++ b/src/test/modules/injection_points/Makefile @@ -5,7 +5,8 @@ OBJS = \ $(WIN32RES) \ injection_points.o \ injection_stats.o \ - injection_stats_fixed.o + injection_stats_fixed.o \ + regress_injection.o EXTENSION = injection_points DATA = injection_points--1.0.sql PGFILEDESC = "injection_points - facility for injection points" @@ -13,7 +14,7 @@ PGFILEDESC = "injection_points - facility for injection points" REGRESS = injection_points reindex_conc REGRESS_OPTS = --dlpath=$(top_builddir)/src/test/regress -ISOLATION = basic inplace +ISOLATION = basic inplace syscache-update-pruned TAP_TESTS = 1 diff --git a/src/test/modules/injection_points/expected/syscache-update-pruned.out b/src/test/modules/injection_points/expected/syscache-update-pruned.out new file mode 100644 index 000000000000..5dc5a1ddc5e4 --- /dev/null +++ b/src/test/modules/injection_points/expected/syscache-update-pruned.out @@ -0,0 +1,87 @@ +Parsed test spec with 4 sessions + +starting permutation: cachefill1 at2 waitprunable4 vac4 grant1 wakeinval4 wakegrant4 +step cachefill1: SELECT FROM vactest.reloid_catcache_set('vactest.orig50'); +step at2: + CREATE TRIGGER to_set_relhastriggers BEFORE UPDATE ON vactest.orig50 + FOR EACH ROW EXECUTE PROCEDURE suppress_redundant_updates_trigger(); + +step waitprunable4: CALL vactest.wait_prunable(); +step vac4: VACUUM pg_class; +step grant1: GRANT SELECT ON vactest.orig50 TO PUBLIC; +step wakeinval4: + SELECT FROM injection_points_detach('AtEOXact_Inval-with-transInvalInfo'); + SELECT FROM injection_points_wakeup('AtEOXact_Inval-with-transInvalInfo'); + +step at2: <... completed> +step wakeinval4: <... completed> +step wakegrant4: + SELECT FROM injection_points_detach('heap_update-before-pin'); + SELECT FROM injection_points_wakeup('heap_update-before-pin'); + +step grant1: <... completed> +ERROR: tuple concurrently deleted +step wakegrant4: <... completed> + +starting permutation: cachefill1 at2 waitprunable4 vac4 grant1 wakeinval4 mkrels4 wakegrant4 +step cachefill1: SELECT FROM vactest.reloid_catcache_set('vactest.orig50'); +step at2: + CREATE TRIGGER to_set_relhastriggers BEFORE UPDATE ON vactest.orig50 + FOR EACH ROW EXECUTE PROCEDURE suppress_redundant_updates_trigger(); + +step waitprunable4: CALL vactest.wait_prunable(); +step vac4: VACUUM pg_class; +step grant1: GRANT SELECT ON vactest.orig50 TO PUBLIC; +step wakeinval4: + SELECT FROM injection_points_detach('AtEOXact_Inval-with-transInvalInfo'); + SELECT FROM injection_points_wakeup('AtEOXact_Inval-with-transInvalInfo'); + +step at2: <... completed> +step wakeinval4: <... completed> +step mkrels4: + SELECT FROM vactest.mkrels('intruder', 1, 100); -- repopulate LP_UNUSED + +step wakegrant4: + SELECT FROM injection_points_detach('heap_update-before-pin'); + SELECT FROM injection_points_wakeup('heap_update-before-pin'); + +step grant1: <... completed> +ERROR: duplicate key value violates unique constraint "pg_class_oid_index" +step wakegrant4: <... completed> + +starting permutation: snap3 cachefill1 at2 mkrels4 r3 waitprunable4 vac4 grant1 wakeinval4 at4 wakegrant4 inspect4 +step snap3: BEGIN ISOLATION LEVEL REPEATABLE READ; SELECT; +step cachefill1: SELECT FROM vactest.reloid_catcache_set('vactest.orig50'); +step at2: + CREATE TRIGGER to_set_relhastriggers BEFORE UPDATE ON vactest.orig50 + FOR EACH ROW EXECUTE PROCEDURE suppress_redundant_updates_trigger(); + +step mkrels4: + SELECT FROM vactest.mkrels('intruder', 1, 100); -- repopulate LP_UNUSED + +step r3: ROLLBACK; +step waitprunable4: CALL vactest.wait_prunable(); +step vac4: VACUUM pg_class; +step grant1: GRANT SELECT ON vactest.orig50 TO PUBLIC; +step wakeinval4: + SELECT FROM injection_points_detach('AtEOXact_Inval-with-transInvalInfo'); + SELECT FROM injection_points_wakeup('AtEOXact_Inval-with-transInvalInfo'); + +step at2: <... completed> +step wakeinval4: <... completed> +step at4: ALTER TABLE vactest.child50 INHERIT vactest.orig50; +step wakegrant4: + SELECT FROM injection_points_detach('heap_update-before-pin'); + SELECT FROM injection_points_wakeup('heap_update-before-pin'); + +step grant1: <... completed> +step wakegrant4: <... completed> +step inspect4: + SELECT relhastriggers, relhassubclass FROM pg_class + WHERE oid = 'vactest.orig50'::regclass; + +relhastriggers|relhassubclass +--------------+-------------- +f |f +(1 row) + diff --git a/src/test/modules/injection_points/expected/syscache-update-pruned_1.out b/src/test/modules/injection_points/expected/syscache-update-pruned_1.out new file mode 100644 index 000000000000..b18857c902eb --- /dev/null +++ b/src/test/modules/injection_points/expected/syscache-update-pruned_1.out @@ -0,0 +1,86 @@ +Parsed test spec with 4 sessions + +starting permutation: cachefill1 at2 waitprunable4 vac4 grant1 wakeinval4 wakegrant4 +step cachefill1: SELECT FROM vactest.reloid_catcache_set('vactest.orig50'); +step at2: + CREATE TRIGGER to_set_relhastriggers BEFORE UPDATE ON vactest.orig50 + FOR EACH ROW EXECUTE PROCEDURE suppress_redundant_updates_trigger(); + +step waitprunable4: CALL vactest.wait_prunable(); +step vac4: VACUUM pg_class; +step grant1: GRANT SELECT ON vactest.orig50 TO PUBLIC; +step wakeinval4: + SELECT FROM injection_points_detach('AtEOXact_Inval-with-transInvalInfo'); + SELECT FROM injection_points_wakeup('AtEOXact_Inval-with-transInvalInfo'); + +step at2: <... completed> +step wakeinval4: <... completed> +step wakegrant4: + SELECT FROM injection_points_detach('heap_update-before-pin'); + SELECT FROM injection_points_wakeup('heap_update-before-pin'); + +step grant1: <... completed> +step wakegrant4: <... completed> + +starting permutation: cachefill1 at2 waitprunable4 vac4 grant1 wakeinval4 mkrels4 wakegrant4 +step cachefill1: SELECT FROM vactest.reloid_catcache_set('vactest.orig50'); +step at2: + CREATE TRIGGER to_set_relhastriggers BEFORE UPDATE ON vactest.orig50 + FOR EACH ROW EXECUTE PROCEDURE suppress_redundant_updates_trigger(); + +step waitprunable4: CALL vactest.wait_prunable(); +step vac4: VACUUM pg_class; +step grant1: GRANT SELECT ON vactest.orig50 TO PUBLIC; +step wakeinval4: + SELECT FROM injection_points_detach('AtEOXact_Inval-with-transInvalInfo'); + SELECT FROM injection_points_wakeup('AtEOXact_Inval-with-transInvalInfo'); + +step at2: <... completed> +step wakeinval4: <... completed> +step mkrels4: + SELECT FROM vactest.mkrels('intruder', 1, 100); -- repopulate LP_UNUSED + +step wakegrant4: + SELECT FROM injection_points_detach('heap_update-before-pin'); + SELECT FROM injection_points_wakeup('heap_update-before-pin'); + +step grant1: <... completed> +step wakegrant4: <... completed> + +starting permutation: snap3 cachefill1 at2 mkrels4 r3 waitprunable4 vac4 grant1 wakeinval4 at4 wakegrant4 inspect4 +step snap3: BEGIN ISOLATION LEVEL REPEATABLE READ; SELECT; +step cachefill1: SELECT FROM vactest.reloid_catcache_set('vactest.orig50'); +step at2: + CREATE TRIGGER to_set_relhastriggers BEFORE UPDATE ON vactest.orig50 + FOR EACH ROW EXECUTE PROCEDURE suppress_redundant_updates_trigger(); + +step mkrels4: + SELECT FROM vactest.mkrels('intruder', 1, 100); -- repopulate LP_UNUSED + +step r3: ROLLBACK; +step waitprunable4: CALL vactest.wait_prunable(); +step vac4: VACUUM pg_class; +step grant1: GRANT SELECT ON vactest.orig50 TO PUBLIC; +step wakeinval4: + SELECT FROM injection_points_detach('AtEOXact_Inval-with-transInvalInfo'); + SELECT FROM injection_points_wakeup('AtEOXact_Inval-with-transInvalInfo'); + +step at2: <... completed> +step wakeinval4: <... completed> +step at4: ALTER TABLE vactest.child50 INHERIT vactest.orig50; +step wakegrant4: + SELECT FROM injection_points_detach('heap_update-before-pin'); + SELECT FROM injection_points_wakeup('heap_update-before-pin'); + +step grant1: <... completed> +ERROR: tuple concurrently updated +step wakegrant4: <... completed> +step inspect4: + SELECT relhastriggers, relhassubclass FROM pg_class + WHERE oid = 'vactest.orig50'::regclass; + +relhastriggers|relhassubclass +--------------+-------------- +t |t +(1 row) + diff --git a/src/test/modules/injection_points/injection_points--1.0.sql b/src/test/modules/injection_points/injection_points--1.0.sql index 6c81d55e0d36..c445bf64e623 100644 --- a/src/test/modules/injection_points/injection_points--1.0.sql +++ b/src/test/modules/injection_points/injection_points--1.0.sql @@ -97,3 +97,11 @@ CREATE FUNCTION injection_points_stats_fixed(OUT numattach int8, RETURNS record AS 'MODULE_PATHNAME', 'injection_points_stats_fixed' LANGUAGE C STRICT; + +-- +-- regress_injection.c functions +-- +CREATE FUNCTION removable_cutoff(rel regclass) +RETURNS xid8 +AS 'MODULE_PATHNAME' +LANGUAGE C CALLED ON NULL INPUT; diff --git a/src/test/modules/injection_points/meson.build b/src/test/modules/injection_points/meson.build index 989b4db226be..259045e5c2d4 100644 --- a/src/test/modules/injection_points/meson.build +++ b/src/test/modules/injection_points/meson.build @@ -8,6 +8,7 @@ injection_points_sources = files( 'injection_points.c', 'injection_stats.c', 'injection_stats_fixed.c', + 'regress_injection.c', ) if host_system == 'windows' @@ -44,7 +45,9 @@ tests += { 'specs': [ 'basic', 'inplace', + 'syscache-update-pruned', ], + 'runningcheck': false, # see syscache-update-pruned }, 'tap': { 'env': { diff --git a/src/test/modules/injection_points/regress_injection.c b/src/test/modules/injection_points/regress_injection.c new file mode 100644 index 000000000000..422f4168935f --- /dev/null +++ b/src/test/modules/injection_points/regress_injection.c @@ -0,0 +1,71 @@ +/*-------------------------------------------------------------------------- + * + * regress_injection.c + * Functions supporting test-specific subject matter. + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/test/modules/injection_points/regress_injection.c + * + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/table.h" +#include "fmgr.h" +#include "miscadmin.h" +#include "storage/procarray.h" +#include "utils/xid8.h" + +/* + * removable_cutoff - for syscache-update-pruned.spec + * + * Wrapper around GetOldestNonRemovableTransactionId(). In general, this can + * move backward. runningcheck=false isolation tests can reasonably prevent + * that. For the causes of backward movement, see + * postgr.es/m/CAEze2Wj%2BV0kTx86xB_YbyaqTr5hnE_igdWAwuhSyjXBYscf5-Q%40mail.gmail.com + * and the header comment for ComputeXidHorizons(). One can assume this + * doesn't move backward if one arranges for concurrent activity not to reach + * AbortTransaction() and not to allocate an XID while connected to another + * database. Non-runningcheck tests can control most concurrent activity, + * except autovacuum and the isolationtester control connection. Neither + * allocates XIDs, and AbortTransaction() in those would justify test failure. + */ +PG_FUNCTION_INFO_V1(removable_cutoff); +Datum +removable_cutoff(PG_FUNCTION_ARGS) +{ + Relation rel = NULL; + TransactionId xid; + FullTransactionId next_fxid_before, + next_fxid; + + /* could take other relkinds callee takes, but we've not yet needed it */ + if (!PG_ARGISNULL(0)) + rel = table_open(PG_GETARG_OID(0), AccessShareLock); + + /* + * No lock or snapshot necessarily prevents oldestXid from advancing past + * "xid" while this function runs. That concerns us only in that we must + * not ascribe "xid" to the wrong epoch. (That may never arise in + * isolation testing, but let's set a good example.) As a crude solution, + * retry until nextXid doesn't change. + */ + next_fxid = ReadNextFullTransactionId(); + do + { + CHECK_FOR_INTERRUPTS(); + next_fxid_before = next_fxid; + xid = GetOldestNonRemovableTransactionId(rel); + next_fxid = ReadNextFullTransactionId(); + } while (!FullTransactionIdEquals(next_fxid, next_fxid_before)); + + if (rel) + table_close(rel, AccessShareLock); + + PG_RETURN_FULLTRANSACTIONID(FullTransactionIdFromAllowableAt(next_fxid, + xid)); +} diff --git a/src/test/modules/injection_points/specs/syscache-update-pruned.spec b/src/test/modules/injection_points/specs/syscache-update-pruned.spec new file mode 100644 index 000000000000..b48e897431e3 --- /dev/null +++ b/src/test/modules/injection_points/specs/syscache-update-pruned.spec @@ -0,0 +1,179 @@ +# Test race conditions involving: +# - s1: heap_update($FROM_SYSCACHE), without a snapshot or pin +# - s2: ALTER TABLE making $FROM_SYSCACHE a dead tuple +# - s3: "VACUUM pg_class" making $FROM_SYSCACHE become LP_UNUSED + +# This is a derivative work of inplace.spec, which exercises the corresponding +# race condition for inplace updates. + +# Despite local injection points, this is incompatible with runningcheck. +# First, removable_cutoff() could move backward, per its header comment. +# Second, other activity could trigger sinval queue overflow, negating our +# efforts to delay inval. Third, this deadlock emerges: +# +# - step at2 waits at an injection point, with interrupts held +# - an unrelated backend waits for at2 to do PROCSIGNAL_BARRIER_SMGRRELEASE +# - step waitprunable4 waits for the unrelated backend to release its xmin + +# The alternative expected output is for -DCATCACHE_FORCE_RELEASE, a setting +# that thwarts testing the race conditions this spec seeks. + + +# Need s2 to make a non-HOT update. Otherwise, "VACUUM pg_class" would leave +# an LP_REDIRECT that persists. To get non-HOT, make rels so the pg_class row +# for vactest.orig50 is on a filled page (assuming BLCKSZ=8192). Just to save +# on filesystem syscalls, use relkind=c for every other rel. +setup +{ + CREATE EXTENSION injection_points; + CREATE SCHEMA vactest; + -- Ensure a leader RELOID catcache entry. PARALLEL RESTRICTED since a + -- parallel worker running pg_relation_filenode() would lack that effect. + CREATE FUNCTION vactest.reloid_catcache_set(regclass) RETURNS int + LANGUAGE sql PARALLEL RESTRICTED + AS 'SELECT 0 FROM pg_relation_filenode($1)'; + CREATE FUNCTION vactest.mkrels(text, int, int) RETURNS void + LANGUAGE plpgsql SET search_path = vactest AS $$ + DECLARE + tname text; + BEGIN + FOR i in $2 .. $3 LOOP + tname := $1 || i; + EXECUTE FORMAT('CREATE TYPE ' || tname || ' AS ()'); + RAISE DEBUG '% at %', tname, ctid + FROM pg_class WHERE oid = tname::regclass; + END LOOP; + END + $$; + CREATE PROCEDURE vactest.wait_prunable() LANGUAGE plpgsql AS $$ + DECLARE + barrier xid8; + cutoff xid8; + BEGIN + barrier := pg_current_xact_id(); + -- autovacuum worker RelationCacheInitializePhase3() or the + -- isolationtester control connection might hold a snapshot that + -- limits pruning. Sleep until that clears. + LOOP + ROLLBACK; -- release MyProc->xmin, which could be the oldest + cutoff := removable_cutoff('pg_class'); + EXIT WHEN cutoff >= barrier; + RAISE LOG 'removable cutoff %; waiting for %', cutoff, barrier; + PERFORM pg_sleep(.1); + END LOOP; + END + $$; +} +setup { CALL vactest.wait_prunable(); -- maximize next two VACUUMs } +setup { VACUUM FULL pg_class; -- reduce free space } +setup { VACUUM FREEZE pg_class; -- populate fsm etc. } +setup +{ + SELECT FROM vactest.mkrels('orig', 1, 49); + CREATE TABLE vactest.orig50 (c int) WITH (autovacuum_enabled = off); + CREATE TABLE vactest.child50 (c int) WITH (autovacuum_enabled = off); + SELECT FROM vactest.mkrels('orig', 51, 100); +} +teardown +{ + DROP SCHEMA vactest CASCADE; + DROP EXTENSION injection_points; +} + +# Wait during GRANT. Disable debug_discard_caches, since we're here to +# exercise an outcome that happens under permissible cache staleness. +session s1 +setup { + SET debug_discard_caches = 0; + SELECT FROM injection_points_set_local(); + SELECT FROM injection_points_attach('heap_update-before-pin', 'wait'); +} +step cachefill1 { SELECT FROM vactest.reloid_catcache_set('vactest.orig50'); } +step grant1 { GRANT SELECT ON vactest.orig50 TO PUBLIC; } + +# Update of the tuple that grant1 will update. Wait before sending invals, so +# s1 will not get a cache miss. Choose the commands for making such updates +# from among those whose heavyweight locking does not conflict with GRANT's +# heavyweight locking. (GRANT will see our XID as committed, so observing +# that XID in the tuple xmax also won't block GRANT.) +session s2 +setup { + SELECT FROM injection_points_set_local(); + SELECT FROM + injection_points_attach('AtEOXact_Inval-with-transInvalInfo', 'wait'); +} +step at2 { + CREATE TRIGGER to_set_relhastriggers BEFORE UPDATE ON vactest.orig50 + FOR EACH ROW EXECUTE PROCEDURE suppress_redundant_updates_trigger(); +} + +# Hold snapshot to block pruning. +session s3 +step snap3 { BEGIN ISOLATION LEVEL REPEATABLE READ; SELECT; } +step r3 { ROLLBACK; } + +# Non-blocking actions. +session s4 +step waitprunable4 { CALL vactest.wait_prunable(); } +step vac4 { VACUUM pg_class; } +# Reuse the lp that s1 is waiting to change. I've observed reuse at the 1st +# or 18th CREATE, so create excess. +step mkrels4 { + SELECT FROM vactest.mkrels('intruder', 1, 100); -- repopulate LP_UNUSED +} +step wakegrant4 { + SELECT FROM injection_points_detach('heap_update-before-pin'); + SELECT FROM injection_points_wakeup('heap_update-before-pin'); +} +step at4 { ALTER TABLE vactest.child50 INHERIT vactest.orig50; } +step wakeinval4 { + SELECT FROM injection_points_detach('AtEOXact_Inval-with-transInvalInfo'); + SELECT FROM injection_points_wakeup('AtEOXact_Inval-with-transInvalInfo'); +} +# Witness effects of steps at2 and/or at4. +step inspect4 { + SELECT relhastriggers, relhassubclass FROM pg_class + WHERE oid = 'vactest.orig50'::regclass; +} + +# TID from syscache becomes LP_UNUSED. Before the bug fix, this permutation +# made s1 fail with "attempted to update invisible tuple" or an assert. +# However, suppose a pd_lsn value such that (pd_lsn.xlogid, pd_lsn.xrecoff) +# passed for (xmin, xmax) with xmin known-committed and xmax known-aborted. +# Persistent page header corruption ensued. For example, s1 overwrote +# pd_lower, pd_upper, and pd_special as though they were t_ctid. +permutation + cachefill1 # reads pg_class tuple T0, xmax invalid + at2 # T0 dead, T1 live + waitprunable4 # T0 prunable + vac4 # T0 becomes LP_UNUSED + grant1 # pauses at heap_update(T0) + wakeinval4(at2) # at2 sends inval message + wakegrant4(grant1) # s1 wakes: "tuple concurrently deleted" + +# add mkrels4: LP_UNUSED becomes a different rel's row +permutation + cachefill1 # reads pg_class tuple T0, xmax invalid + at2 # T0 dead, T1 live + waitprunable4 # T0 prunable + vac4 # T0 becomes LP_UNUSED + grant1 # pauses at heap_update(T0) + wakeinval4(at2) # at2 sends inval message + mkrels4 # T0 becomes a new rel + wakegrant4(grant1) # s1 wakes: "duplicate key value violates unique" + +# TID from syscache becomes LP_UNUSED, then becomes a newer version of the +# original rel's row. +permutation + snap3 # sets MyProc->xmin + cachefill1 # reads pg_class tuple T0, xmax invalid + at2 # T0 dead, T1 live + mkrels4 # T1's page becomes full + r3 # clears MyProc->xmin + waitprunable4 # T0 prunable + vac4 # T0 becomes LP_UNUSED + grant1 # pauses at heap_update(T0) + wakeinval4(at2) # at2 sends inval message + at4 # T1 dead, T0 live + wakegrant4(grant1) # s1 wakes: T0 dead, T2 live + inspect4 # observe loss of at2+at4 changes XXX is an extant bug