diff options
author | Noah Misch | 2020-08-15 17:15:53 +0000 |
---|---|---|
committer | Noah Misch | 2020-08-15 17:15:57 +0000 |
commit | 5ae01df6f6ef590fd73299050c7112f69c03f3de (patch) | |
tree | aa59b23c76d961b8d6b8b79c935eb92fe10f86d1 | |
parent | 7241edc4fa8cda23d04bfe3cd139ed27904eac04 (diff) |
Prevent concurrent SimpleLruTruncate() for any given SLRU.
The SimpleLruTruncate() header comment states the new coding rule. To
achieve this, add locktype "frozenid" and two LWLocks. This closes a
rare opportunity for data loss, which manifested as "apparent
wraparound" or "could not access status of transaction" errors. Data
loss is more likely in pg_multixact, due to released branches' thin
margin between multiStopLimit and multiWrapLimit. If a user's physical
replication primary logged ": apparent wraparound" messages, the user
should rebuild standbys of that primary regardless of symptoms. At less
risk is a cluster having emitted "not accepting commands" errors or
"must be vacuumed" warnings at some point. One can test a cluster for
this data loss by running VACUUM FREEZE in every database. Back-patch
to 9.5 (all supported versions).
Discussion: https://postgr.es/m/[email protected]
-rw-r--r-- | doc/src/sgml/catalogs.sgml | 4 | ||||
-rw-r--r-- | doc/src/sgml/monitoring.sgml | 20 | ||||
-rw-r--r-- | src/backend/access/transam/slru.c | 8 | ||||
-rw-r--r-- | src/backend/access/transam/subtrans.c | 4 | ||||
-rw-r--r-- | src/backend/commands/async.c | 37 | ||||
-rw-r--r-- | src/backend/commands/vacuum.c | 13 | ||||
-rw-r--r-- | src/backend/storage/lmgr/lmgr.c | 20 | ||||
-rw-r--r-- | src/backend/storage/lmgr/lwlocknames.txt | 2 | ||||
-rw-r--r-- | src/backend/utils/adt/lockfuncs.c | 12 | ||||
-rw-r--r-- | src/include/storage/lmgr.h | 3 | ||||
-rw-r--r-- | src/include/storage/lock.h | 10 |
11 files changed, 118 insertions, 15 deletions
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index b0376d4b54..e611f0d086 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -8006,7 +8006,8 @@ and general database objects (identified by class OID and object OID, in the same way as in <structname>pg_description</structname> or <structname>pg_depend</structname>). Also, the right to extend a - relation is represented as a separate lockable object. + relation is represented as a separate lockable object, as is the right to + update <structname>pg_database</structname>.<structfield>datfrozenxid</structfield>. Also, <quote>advisory</> locks can be taken on numbers that have user-defined meanings. </para> @@ -8032,6 +8033,7 @@ Type of the lockable object: <literal>relation</>, <literal>extend</>, + <literal>frozenid</literal>, <literal>page</>, <literal>tuple</>, <literal>transactionid</>, diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 31bad1590e..879c7d8c16 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -789,7 +789,7 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser <tbody> <row> - <entry morerows="41"><literal>LWLockNamed</></entry> + <entry morerows="43"><literal>LWLockNamed</></entry> <entry><literal>ShmemIndexLock</></entry> <entry>Waiting to find or allocate space in shared memory.</entry> </row> @@ -975,6 +975,16 @@ postgres 27093 0.0 0.0 30096 2752 ? 
Ss 11:34 0:00 postgres: ser <entry>Waiting to read or update old snapshot control information.</entry> </row> <row> + <entry><literal>WrapLimitsVacuumLock</literal></entry> + <entry>Waiting to update limits on transaction id and multixact + consumption.</entry> + </row> + <row> + <entry><literal>NotifyQueueTailLock</literal></entry> + <entry>Waiting to update limit on notification message + storage.</entry> + </row> + <row> <entry morerows="15"><literal>LWLockTranche</></entry> <entry><literal>clog</></entry> <entry>Waiting for I/O on a clog (transaction status) buffer.</entry> @@ -1042,7 +1052,7 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser <entry>Waiting to add or examine predicate lock information.</entry> </row> <row> - <entry morerows="9"><literal>Lock</></entry> + <entry morerows="10"><literal>Lock</></entry> <entry><literal>relation</></entry> <entry>Waiting to acquire a lock on a relation.</entry> </row> @@ -1051,6 +1061,12 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser <entry>Waiting to extend a relation.</entry> </row> <row> + <entry><literal>frozenid</literal></entry> + <entry>Waiting to + update <structname>pg_database</structname>.<structfield>datfrozenxid</structfield> + and <structname>pg_database</structname>.<structfield>datminmxid</structfield>.</entry> + </row> + <row> <entry><literal>page</></entry> <entry>Waiting to acquire a lock on page of a relation.</entry> </row> diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index e708967e2f..1f65684491 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -1154,6 +1154,14 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied) /* * Remove all segments before the one holding the passed page number + * + * All SLRUs prevent concurrent calls to this function, either with an LWLock + * or by calling it only as part of a checkpoint. Mutual exclusion must begin + * before computing cutoffPage. 
Mutual exclusion must end after any limit + * update that would permit other backends to write fresh data into the + * segment immediately preceding the one containing cutoffPage. Otherwise, + * when the SLRU is quite full, SimpleLruTruncate() might delete that segment + * after it has accrued freshly-written data. */ void SimpleLruTruncate(SlruCtl ctl, int cutoffPage) diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index 908fe2d533..f040915804 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -335,8 +335,8 @@ ExtendSUBTRANS(TransactionId newestXact) /* * Remove all SUBTRANS segments before the one holding the passed transaction ID * - * This is normally called during checkpoint, with oldestXact being the - * oldest TransactionXmin of any running transaction. + * oldestXact is the oldest TransactionXmin of any running transaction. This + * is called only during checkpoint. */ void TruncateSUBTRANS(TransactionId oldestXact) diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c index 3c78eb86f5..125f3b451d 100644 --- a/src/backend/commands/async.c +++ b/src/backend/commands/async.c @@ -224,19 +224,22 @@ typedef struct QueueBackendStatus /* * Shared memory state for LISTEN/NOTIFY (excluding its SLRU stuff) * - * The AsyncQueueControl structure is protected by the AsyncQueueLock. + * The AsyncQueueControl structure is protected by the AsyncQueueLock and + * NotifyQueueTailLock. * - * When holding the lock in SHARED mode, backends may only inspect their own - * entries as well as the head and tail pointers. Consequently we can allow a - * backend to update its own record while holding only SHARED lock (since no - * other backend will inspect it). + * When holding AsyncQueueLock in SHARED mode, backends may only inspect their + * own entries as well as the head and tail pointers. 
Consequently we can + * allow a backend to update its own record while holding only SHARED lock + * (since no other backend will inspect it). * - * When holding the lock in EXCLUSIVE mode, backends can inspect the entries - * of other backends and also change the head and tail pointers. + * When holding AsyncQueueLock in EXCLUSIVE mode, backends can inspect the + * entries of other backends and also change the head pointer. When holding + * both AsyncQueueLock and NotifyQueueTailLock in EXCLUSIVE mode, backends can + * change the tail pointer. * * AsyncCtlLock is used as the control lock for the pg_notify SLRU buffers. - * In order to avoid deadlocks, whenever we need both locks, we always first - * get AsyncQueueLock and then AsyncCtlLock. + * In order to avoid deadlocks, whenever we need multiple locks, we first get + * NotifyQueueTailLock, then AsyncQueueLock, and lastly AsyncCtlLock. * * Each backend uses the backend[] array entry with index equal to its * BackendId (which can range from 1 to MaxBackends). We rely on this to make @@ -2013,6 +2016,10 @@ asyncQueueAdvanceTail(void) int newtailpage; int boundary; + /* Restrict task to one backend per cluster; see SimpleLruTruncate(). */ + LWLockAcquire(NotifyQueueTailLock, LW_EXCLUSIVE); + + /* Compute the new tail. */ LWLockAcquire(AsyncQueueLock, LW_EXCLUSIVE); min = QUEUE_HEAD; for (i = 1; i <= MaxBackends; i++) @@ -2021,7 +2028,6 @@ asyncQueueAdvanceTail(void) min = QUEUE_POS_MIN(min, QUEUE_BACKEND_POS(i)); } oldtailpage = QUEUE_POS_PAGE(QUEUE_TAIL); - QUEUE_TAIL = min; LWLockRelease(AsyncQueueLock); /* @@ -2041,6 +2047,17 @@ asyncQueueAdvanceTail(void) */ SimpleLruTruncate(AsyncCtl, newtailpage); } + + /* + * Advertise the new tail. This changes asyncQueueIsFull()'s verdict for + * the segment immediately prior to the new tail, allowing fresh data into + * that segment. 
+ */ + LWLockAcquire(AsyncQueueLock, LW_EXCLUSIVE); + QUEUE_TAIL = min; + LWLockRelease(AsyncQueueLock); + + LWLockRelease(NotifyQueueTailLock); } /* diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 52c4c65406..edc7642cc4 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -892,6 +892,14 @@ vac_update_datfrozenxid(void) bool dirty = false; /* + * Restrict this task to one backend per database. This avoids race + * conditions that would move datfrozenxid or datminmxid backward. It + * avoids calling vac_truncate_clog() with a datfrozenxid preceding a + * datfrozenxid passed to an earlier vac_truncate_clog() call. + */ + LockDatabaseFrozenIds(ExclusiveLock); + + /* * Initialize the "min" calculation with GetOldestXmin, which is a * reasonable approximation to the minimum relfrozenxid for not-yet- * committed pg_class entries for new tables; see AddNewRelationTuple(). @@ -1055,6 +1063,9 @@ vac_truncate_clog(TransactionId frozenXID, bool bogus = false; bool frozenAlreadyWrapped = false; + /* Restrict task to one backend per cluster; see SimpleLruTruncate(). */ + LWLockAcquire(WrapLimitsVacuumLock, LW_EXCLUSIVE); + /* init oldest datoids to sync with my frozenXID/minMulti values */ oldestxid_datoid = MyDatabaseId; minmulti_datoid = MyDatabaseId; @@ -1156,6 +1167,8 @@ vac_truncate_clog(TransactionId frozenXID, SetTransactionIdLimit(frozenXID, oldestxid_datoid); SetMultiXactIdLimit(minMulti, minmulti_datoid); AdvanceOldestCommitTsXid(frozenXID); + + LWLockRelease(WrapLimitsVacuumLock); } diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c index 0b9f105b53..7b87002d7c 100644 --- a/src/backend/storage/lmgr/lmgr.c +++ b/src/backend/storage/lmgr/lmgr.c @@ -413,6 +413,21 @@ UnlockRelationForExtension(Relation relation, LOCKMODE lockmode) } /* + * LockDatabaseFrozenIds + * + * This allows one backend per database to execute vac_update_datfrozenxid(). 
+ */ +void +LockDatabaseFrozenIds(LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_DATABASE_FROZEN_IDS(tag, MyDatabaseId); + + (void) LockAcquire(&tag, lockmode, false, false); +} + +/* * LockPage * * Obtain a page-level lock. This is currently used by some index access @@ -1015,6 +1030,11 @@ DescribeLockTag(StringInfo buf, const LOCKTAG *tag) tag->locktag_field2, tag->locktag_field1); break; + case LOCKTAG_DATABASE_FROZEN_IDS: + appendStringInfo(buf, + _("pg_database.datfrozenxid of database %u"), + tag->locktag_field1); + break; case LOCKTAG_PAGE: appendStringInfo(buf, _("page %u of relation %u of database %u"), diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index f8996cd21a..d9afe60e29 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -47,3 +47,5 @@ CommitTsLock 39 ReplicationOriginLock 40 MultiXactTruncationLock 41 OldSnapshotTimeMapLock 42 +WrapLimitsVacuumLock 46 +NotifyQueueTailLock 47 diff --git a/src/backend/utils/adt/lockfuncs.c b/src/backend/utils/adt/lockfuncs.c index 2e55368061..3b5627d887 100644 --- a/src/backend/utils/adt/lockfuncs.c +++ b/src/backend/utils/adt/lockfuncs.c @@ -26,6 +26,7 @@ const char *const LockTagTypeNames[] = { "relation", "extend", + "frozenid", "page", "tuple", "transactionid", @@ -245,6 +246,17 @@ pg_lock_status(PG_FUNCTION_ARGS) nulls[8] = true; nulls[9] = true; break; + case LOCKTAG_DATABASE_FROZEN_IDS: + values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1); + nulls[2] = true; + nulls[3] = true; + nulls[4] = true; + nulls[5] = true; + nulls[6] = true; + nulls[7] = true; + nulls[8] = true; + nulls[9] = true; + break; case LOCKTAG_PAGE: values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1); values[2] = ObjectIdGetDatum(instance->locktag.locktag_field2); diff --git a/src/include/storage/lmgr.h b/src/include/storage/lmgr.h index 8288e7d505..0a889e718f 100644 --- a/src/include/storage/lmgr.h +++ 
b/src/include/storage/lmgr.h @@ -57,6 +57,9 @@ extern bool ConditionalLockRelationForExtension(Relation relation, LOCKMODE lockmode); extern int RelationExtensionLockWaiterCount(Relation relation); +/* Lock to recompute pg_database.datfrozenxid in the current database */ +extern void LockDatabaseFrozenIds(LOCKMODE lockmode); + /* Lock a page (currently only used within indexes) */ extern void LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode); extern bool ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode); diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h index cc9d8f7136..f425c3b56e 100644 --- a/src/include/storage/lock.h +++ b/src/include/storage/lock.h @@ -142,6 +142,8 @@ typedef enum LockTagType /* ID info for a relation is DB OID + REL OID; DB OID = 0 if shared */ LOCKTAG_RELATION_EXTEND, /* the right to extend a relation */ /* same ID info as RELATION */ + LOCKTAG_DATABASE_FROZEN_IDS, /* pg_database.datfrozenxid */ + /* ID info for frozen IDs is DB OID */ LOCKTAG_PAGE, /* one page of a relation */ /* ID info for a page is RELATION info + BlockNumber */ LOCKTAG_TUPLE, /* one physical tuple */ @@ -207,6 +209,14 @@ typedef struct LOCKTAG (locktag).locktag_type = LOCKTAG_RELATION_EXTEND, \ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) +#define SET_LOCKTAG_DATABASE_FROZEN_IDS(locktag,dboid) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = 0, \ + (locktag).locktag_field3 = 0, \ + (locktag).locktag_field4 = 0, \ + (locktag).locktag_type = LOCKTAG_DATABASE_FROZEN_IDS, \ + (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) + #define SET_LOCKTAG_PAGE(locktag,dboid,reloid,blocknum) \ ((locktag).locktag_field1 = (dboid), \ (locktag).locktag_field2 = (reloid), \ |