summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNoah Misch2020-08-15 17:15:53 +0000
committerNoah Misch2020-08-15 17:15:57 +0000
commite525770dd504e5f182229b9bee64430ebe71a2a8 (patch)
treee51aa6374e2eefc8b1fbab6e95ed5559b5f1c661
parentdea07098af20f3dbaac873263020a5d95ea24ed8 (diff)
Prevent concurrent SimpleLruTruncate() for any given SLRU.
The SimpleLruTruncate() header comment states the new coding rule. To achieve this, add locktype "frozenid" and two LWLocks. This closes a rare opportunity for data loss, which manifested as "apparent wraparound" or "could not access status of transaction" errors. Data loss is more likely in pg_multixact, due to released branches' thin margin between multiStopLimit and multiWrapLimit. If a user's physical replication primary logged ": apparent wraparound" messages, the user should rebuild standbys of that primary regardless of symptoms. At less risk is a cluster having emitted "not accepting commands" errors or "must be vacuumed" warnings at some point. One can test a cluster for this data loss by running VACUUM FREEZE in every database. Back-patch to 9.5 (all supported versions). Discussion: https://postgr.es/m/[email protected]
-rw-r--r--doc/src/sgml/catalogs.sgml4
-rw-r--r--doc/src/sgml/monitoring.sgml20
-rw-r--r--src/backend/access/transam/slru.c8
-rw-r--r--src/backend/access/transam/subtrans.c4
-rw-r--r--src/backend/commands/async.c37
-rw-r--r--src/backend/commands/vacuum.c13
-rw-r--r--src/backend/storage/lmgr/lmgr.c20
-rw-r--r--src/backend/storage/lmgr/lwlock.c2
-rw-r--r--src/backend/storage/lmgr/lwlocknames.txt2
-rw-r--r--src/backend/utils/adt/lockfuncs.c12
-rw-r--r--src/include/storage/lmgr.h3
-rw-r--r--src/include/storage/lock.h10
12 files changed, 119 insertions, 16 deletions
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index f314f93051..96e55193f1 100644
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -8886,7 +8886,8 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</>:<replaceable>&lt;salt&gt;<
and general database objects (identified by class OID and object OID,
in the same way as in <structname>pg_description</structname> or
<structname>pg_depend</structname>). Also, the right to extend a
- relation is represented as a separate lockable object.
+ relation is represented as a separate lockable object, as is the right to
+ update <structname>pg_database</structname>.<structfield>datfrozenxid</structfield>.
Also, <quote>advisory</> locks can be taken on numbers that have
user-defined meanings.
</para>
@@ -8912,6 +8913,7 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</>:<replaceable>&lt;salt&gt;<
Type of the lockable object:
<literal>relation</>,
<literal>extend</>,
+ <literal>frozenid</literal>,
<literal>page</>,
<literal>tuple</>,
<literal>transactionid</>,
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index c492114f33..601616ec2c 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -845,7 +845,7 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
<tbody>
<row>
- <entry morerows="62"><literal>LWLock</></entry>
+ <entry morerows="64"><literal>LWLock</></entry>
<entry><literal>ShmemIndexLock</></entry>
<entry>Waiting to find or allocate space in shared memory.</entry>
</row>
@@ -1044,6 +1044,16 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
the oldest transaction id available to it.</entry>
</row>
<row>
+ <entry><literal>WrapLimitsVacuumLock</literal></entry>
+ <entry>Waiting to update limits on transaction id and multixact
+ consumption.</entry>
+ </row>
+ <row>
+ <entry><literal>NotifyQueueTailLock</literal></entry>
+ <entry>Waiting to update limit on notification message
+ storage.</entry>
+ </row>
+ <row>
<entry><literal>clog</></entry>
<entry>Waiting for I/O on a clog (transaction status) buffer.</entry>
</row>
@@ -1118,7 +1128,7 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
<entry>Waiting for TBM shared iterator lock.</entry>
</row>
<row>
- <entry morerows="9"><literal>Lock</></entry>
+ <entry morerows="10"><literal>Lock</></entry>
<entry><literal>relation</></entry>
<entry>Waiting to acquire a lock on a relation.</entry>
</row>
@@ -1127,6 +1137,12 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
<entry>Waiting to extend a relation.</entry>
</row>
<row>
+ <entry><literal>frozenid</literal></entry>
+ <entry>Waiting to
+ update <structname>pg_database</structname>.<structfield>datfrozenxid</structfield>
+ and <structname>pg_database</structname>.<structfield>datminmxid</structfield>.</entry>
+ </row>
+ <row>
<entry><literal>page</></entry>
<entry>Waiting to acquire a lock on page of a relation.</entry>
</row>
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 56ca77c407..c7e3a76b35 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -1164,6 +1164,14 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied)
/*
* Remove all segments before the one holding the passed page number
+ *
+ * All SLRUs prevent concurrent calls to this function, either with an LWLock
+ * or by calling it only as part of a checkpoint. Mutual exclusion must begin
+ * before computing cutoffPage. Mutual exclusion must end after any limit
+ * update that would permit other backends to write fresh data into the
+ * segment immediately preceding the one containing cutoffPage. Otherwise,
+ * when the SLRU is quite full, SimpleLruTruncate() might delete that segment
+ * after it has accrued freshly-written data.
*/
void
SimpleLruTruncate(SlruCtl ctl, int cutoffPage)
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index f640661130..cfcab811a8 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -347,8 +347,8 @@ ExtendSUBTRANS(TransactionId newestXact)
/*
* Remove all SUBTRANS segments before the one holding the passed transaction ID
*
- * This is normally called during checkpoint, with oldestXact being the
- * oldest TransactionXmin of any running transaction.
+ * oldestXact is the oldest TransactionXmin of any running transaction. This
+ * is called only during checkpoint.
*/
void
TruncateSUBTRANS(TransactionId oldestXact)
diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c
index 9b5273a6de..3d1f60c632 100644
--- a/src/backend/commands/async.c
+++ b/src/backend/commands/async.c
@@ -224,19 +224,22 @@ typedef struct QueueBackendStatus
/*
* Shared memory state for LISTEN/NOTIFY (excluding its SLRU stuff)
*
- * The AsyncQueueControl structure is protected by the AsyncQueueLock.
+ * The AsyncQueueControl structure is protected by the AsyncQueueLock and
+ * NotifyQueueTailLock.
*
- * When holding the lock in SHARED mode, backends may only inspect their own
- * entries as well as the head and tail pointers. Consequently we can allow a
- * backend to update its own record while holding only SHARED lock (since no
- * other backend will inspect it).
+ * When holding AsyncQueueLock in SHARED mode, backends may only inspect their
+ * own entries as well as the head and tail pointers. Consequently we can
+ * allow a backend to update its own record while holding only SHARED lock
+ * (since no other backend will inspect it).
*
- * When holding the lock in EXCLUSIVE mode, backends can inspect the entries
- * of other backends and also change the head and tail pointers.
+ * When holding AsyncQueueLock in EXCLUSIVE mode, backends can inspect the
+ * entries of other backends and also change the head pointer. When holding
+ * both AsyncQueueLock and NotifyQueueTailLock in EXCLUSIVE mode, backends can
+ * change the tail pointer.
*
* AsyncCtlLock is used as the control lock for the pg_notify SLRU buffers.
- * In order to avoid deadlocks, whenever we need both locks, we always first
- * get AsyncQueueLock and then AsyncCtlLock.
+ * In order to avoid deadlocks, whenever we need multiple locks, we first get
+ * NotifyQueueTailLock, then AsyncQueueLock, and lastly AsyncCtlLock.
*
* Each backend uses the backend[] array entry with index equal to its
* BackendId (which can range from 1 to MaxBackends). We rely on this to make
@@ -2013,6 +2016,10 @@ asyncQueueAdvanceTail(void)
int newtailpage;
int boundary;
+ /* Restrict task to one backend per cluster; see SimpleLruTruncate(). */
+ LWLockAcquire(NotifyQueueTailLock, LW_EXCLUSIVE);
+
+ /* Compute the new tail. */
LWLockAcquire(AsyncQueueLock, LW_EXCLUSIVE);
min = QUEUE_HEAD;
for (i = 1; i <= MaxBackends; i++)
@@ -2021,7 +2028,6 @@ asyncQueueAdvanceTail(void)
min = QUEUE_POS_MIN(min, QUEUE_BACKEND_POS(i));
}
oldtailpage = QUEUE_POS_PAGE(QUEUE_TAIL);
- QUEUE_TAIL = min;
LWLockRelease(AsyncQueueLock);
/*
@@ -2041,6 +2047,17 @@ asyncQueueAdvanceTail(void)
*/
SimpleLruTruncate(AsyncCtl, newtailpage);
}
+
+ /*
+ * Advertise the new tail. This changes asyncQueueIsFull()'s verdict for
+ * the segment immediately prior to the new tail, allowing fresh data into
+ * that segment.
+ */
+ LWLockAcquire(AsyncQueueLock, LW_EXCLUSIVE);
+ QUEUE_TAIL = min;
+ LWLockRelease(AsyncQueueLock);
+
+ LWLockRelease(NotifyQueueTailLock);
}
/*
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 83e807e32a..fa5a3a34bf 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -934,6 +934,14 @@ vac_update_datfrozenxid(void)
bool dirty = false;
/*
+ * Restrict this task to one backend per database. This avoids race
+ * conditions that would move datfrozenxid or datminmxid backward. It
+ * avoids calling vac_truncate_clog() with a datfrozenxid preceding a
+ * datfrozenxid passed to an earlier vac_truncate_clog() call.
+ */
+ LockDatabaseFrozenIds(ExclusiveLock);
+
+ /*
* Initialize the "min" calculation with GetOldestXmin, which is a
* reasonable approximation to the minimum relfrozenxid for not-yet-
* committed pg_class entries for new tables; see AddNewRelationTuple().
@@ -1097,6 +1105,9 @@ vac_truncate_clog(TransactionId frozenXID,
bool bogus = false;
bool frozenAlreadyWrapped = false;
+ /* Restrict task to one backend per cluster; see SimpleLruTruncate(). */
+ LWLockAcquire(WrapLimitsVacuumLock, LW_EXCLUSIVE);
+
/* init oldest datoids to sync with my frozenXID/minMulti values */
oldestxid_datoid = MyDatabaseId;
minmulti_datoid = MyDatabaseId;
@@ -1206,6 +1217,8 @@ vac_truncate_clog(TransactionId frozenXID,
*/
SetTransactionIdLimit(frozenXID, oldestxid_datoid);
SetMultiXactIdLimit(minMulti, minmulti_datoid, false);
+
+ LWLockRelease(WrapLimitsVacuumLock);
}
diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c
index 10e2d028fa..9510442628 100644
--- a/src/backend/storage/lmgr/lmgr.c
+++ b/src/backend/storage/lmgr/lmgr.c
@@ -413,6 +413,21 @@ UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
}
/*
+ * LockDatabaseFrozenIds
+ *
+ * This allows one backend per database to execute vac_update_datfrozenxid().
+ */
+void
+LockDatabaseFrozenIds(LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_DATABASE_FROZEN_IDS(tag, MyDatabaseId);
+
+ (void) LockAcquire(&tag, lockmode, false, false);
+}
+
+/*
* LockPage
*
* Obtain a page-level lock. This is currently used by some index access
@@ -1015,6 +1030,11 @@ DescribeLockTag(StringInfo buf, const LOCKTAG *tag)
tag->locktag_field2,
tag->locktag_field1);
break;
+ case LOCKTAG_DATABASE_FROZEN_IDS:
+ appendStringInfo(buf,
+ _("pg_database.datfrozenxid of database %u"),
+ tag->locktag_field1);
+ break;
case LOCKTAG_PAGE:
appendStringInfo(buf,
_("page %u of relation %u of database %u"),
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index f40af6a1d4..1965e093b0 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -495,7 +495,7 @@ RegisterLWLockTranches(void)
if (LWLockTrancheArray == NULL)
{
- LWLockTranchesAllocated = 64;
+ LWLockTranchesAllocated = 128;
LWLockTrancheArray = (char **)
MemoryContextAllocZero(TopMemoryContext,
LWLockTranchesAllocated * sizeof(char *));
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index e6025ecedb..04a1786d37 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -50,3 +50,5 @@ OldSnapshotTimeMapLock 42
BackendRandomLock 43
LogicalRepWorkerLock 44
CLogTruncationLock 45
+WrapLimitsVacuumLock 46
+NotifyQueueTailLock 47
diff --git a/src/backend/utils/adt/lockfuncs.c b/src/backend/utils/adt/lockfuncs.c
index 9e0a8ab79d..4ed5bf15f9 100644
--- a/src/backend/utils/adt/lockfuncs.c
+++ b/src/backend/utils/adt/lockfuncs.c
@@ -26,6 +26,7 @@
const char *const LockTagTypeNames[] = {
"relation",
"extend",
+ "frozenid",
"page",
"tuple",
"transactionid",
@@ -245,6 +246,17 @@ pg_lock_status(PG_FUNCTION_ARGS)
nulls[8] = true;
nulls[9] = true;
break;
+ case LOCKTAG_DATABASE_FROZEN_IDS:
+ values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1);
+ nulls[2] = true;
+ nulls[3] = true;
+ nulls[4] = true;
+ nulls[5] = true;
+ nulls[6] = true;
+ nulls[7] = true;
+ nulls[8] = true;
+ nulls[9] = true;
+ break;
case LOCKTAG_PAGE:
values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1);
values[2] = ObjectIdGetDatum(instance->locktag.locktag_field2);
diff --git a/src/include/storage/lmgr.h b/src/include/storage/lmgr.h
index 0b923227a2..a37107903a 100644
--- a/src/include/storage/lmgr.h
+++ b/src/include/storage/lmgr.h
@@ -57,6 +57,9 @@ extern bool ConditionalLockRelationForExtension(Relation relation,
LOCKMODE lockmode);
extern int RelationExtensionLockWaiterCount(Relation relation);
+/* Lock to recompute pg_database.datfrozenxid in the current database */
+extern void LockDatabaseFrozenIds(LOCKMODE lockmode);
+
/* Lock a page (currently only used within indexes) */
extern void LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);
extern bool ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);
diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h
index 1cd3685fae..cf1bed6ea4 100644
--- a/src/include/storage/lock.h
+++ b/src/include/storage/lock.h
@@ -141,6 +141,8 @@ typedef enum LockTagType
/* ID info for a relation is DB OID + REL OID; DB OID = 0 if shared */
LOCKTAG_RELATION_EXTEND, /* the right to extend a relation */
/* same ID info as RELATION */
+ LOCKTAG_DATABASE_FROZEN_IDS, /* pg_database.datfrozenxid */
+ /* ID info for frozen IDs is DB OID */
LOCKTAG_PAGE, /* one page of a relation */
/* ID info for a page is RELATION info + BlockNumber */
LOCKTAG_TUPLE, /* one physical tuple */
@@ -206,6 +208,14 @@ typedef struct LOCKTAG
(locktag).locktag_type = LOCKTAG_RELATION_EXTEND, \
(locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
+#define SET_LOCKTAG_DATABASE_FROZEN_IDS(locktag,dboid) \
+ ((locktag).locktag_field1 = (dboid), \
+ (locktag).locktag_field2 = 0, \
+ (locktag).locktag_field3 = 0, \
+ (locktag).locktag_field4 = 0, \
+ (locktag).locktag_type = LOCKTAG_DATABASE_FROZEN_IDS, \
+ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
+
#define SET_LOCKTAG_PAGE(locktag,dboid,reloid,blocknum) \
((locktag).locktag_field1 = (dboid), \
(locktag).locktag_field2 = (reloid), \