diff options
author | Pavan Deolasee | 2015-12-09 07:23:40 +0000 |
---|---|---|
committer | Pavan Deolasee | 2015-12-09 07:23:40 +0000 |
commit | 1db1ebd18c586bc9967b45ab7d70005876fb807a (patch) | |
tree | cee10b7681aa3837d36283c3e0d2c840edee5936 | |
parent | 1e0c4f448331ef92460d42ea71c3a28c42a18ed3 (diff) |
Introduce a cluster monitor lock to avoid a race condition between snapshot
fetch and xmin reporting
-rw-r--r-- | src/backend/postmaster/clustermon.c | 32 | ||||
-rw-r--r-- | src/backend/storage/ipc/procarray.c | 72 | ||||
-rw-r--r-- | src/include/postmaster/clustermon.h | 2 | ||||
-rw-r--r-- | src/include/storage/lwlock.h | 13 |
4 files changed, 88 insertions, 31 deletions
diff --git a/src/backend/postmaster/clustermon.c b/src/backend/postmaster/clustermon.c index 3cb7947354..f2148667e2 100644 --- a/src/backend/postmaster/clustermon.c +++ b/src/backend/postmaster/clustermon.c @@ -60,6 +60,7 @@ static void cm_sighup_handler(SIGNAL_ARGS); static void cm_sigterm_handler(SIGNAL_ARGS); static void ClusterMonitorSetReportedGlobalXmin(GlobalTransactionId xmin); static GlobalTransactionId ClusterMonitorGetReportedGlobalXmin(void); +static void ClusterMonitorSetReportingGlobalXmin(GlobalTransactionId xmin); /* PID of clustser monitoring process */ int ClusterMonitorPid = 0; @@ -203,7 +204,10 @@ ClusterMonitorInit(void) * interval. Keep doing this forever */ lastGlobalXmin = ClusterMonitorGetGlobalXmin(); + LWLockAcquire(ClusterMonitorLock, LW_EXCLUSIVE); oldestXmin = GetOldestXminInternal(NULL, false, true, lastGlobalXmin); + ClusterMonitorSetReportingGlobalXmin(oldestXmin); + LWLockRelease(ClusterMonitorLock); if ((status = ReportGlobalXmin(oldestXmin, &newOldestXmin, &latestCompletedXid))) @@ -246,6 +250,8 @@ ClusterMonitorInit(void) ClusterMonitorSetGlobalXmin(newOldestXmin); } + ClusterMonitorSetReportingGlobalXmin(InvalidGlobalTransactionId); + /* * Repeat at every 30 seconds */ @@ -373,6 +379,9 @@ ClusterMonitorSetGlobalXmin(GlobalTransactionId xmin) ClusterMonitorCtl->gtm_recent_global_xmin = xmin; SpinLockRelease(&ClusterMonitorCtl->mutex); + if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, xmin)) + ShmemVariableCache->latestCompletedXid = xmin; + LWLockRelease(ProcArrayLock); } @@ -398,3 +407,26 @@ ClusterMonitorGetReportedGlobalXmin(void) return reported_xmin; } + +static void +ClusterMonitorSetReportingGlobalXmin(GlobalTransactionId xmin) +{ + elog(DEBUG2, "ClusterMonitorSetReportingGlobalXmin - old %d, new %d", + ClusterMonitorCtl->reporting_recent_global_xmin, + xmin); + SpinLockAcquire(&ClusterMonitorCtl->mutex); + ClusterMonitorCtl->reporting_recent_global_xmin = xmin; + 
SpinLockRelease(&ClusterMonitorCtl->mutex); +} + +GlobalTransactionId +ClusterMonitorGetReportingGlobalXmin(void) +{ + GlobalTransactionId reporting_xmin; + + SpinLockAcquire(&ClusterMonitorCtl->mutex); + reporting_xmin = ClusterMonitorCtl->reporting_recent_global_xmin; + SpinLockRelease(&ClusterMonitorCtl->mutex); + + return reporting_xmin; +} diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 3209679779..1407bfc07d 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -68,6 +68,7 @@ #include "access/xlog.h" #include "catalog/catalog.h" #include "miscadmin.h" +#include "postmaster/clustermon.h" #include "storage/proc.h" #include "storage/procarray.h" #include "storage/spin.h" @@ -1319,7 +1320,7 @@ GetOldestXminInternal(Relation rel, bool ignoreVacuum, bool computeLocal, /* Fetch xid just once - see GetNewTransactionId */ TransactionId xid = pgxact->xid; #ifdef XCP - TransactionId xmin; + TransactionId xmin = pgxact->xmin; /* Fetch just once */ #endif /* First consider the transaction's own Xid, if any */ @@ -1335,7 +1336,10 @@ GetOldestXminInternal(Relation rel, bool ignoreVacuum, bool computeLocal, * Xid, that could determine some not-yet-set Xmin. */ #ifdef XCP - xmin = pgxact->xmin; /* Fetch just once */ + + elog(DEBUG3, "proc: pid:%d, xmin: %d, xid: %d", proc->pid, + xmin, xid); + if (TransactionIdIsNormal(xmin) && TransactionIdPrecedes(xmin, result)) result = xmin; @@ -3174,44 +3178,57 @@ static void GetSnapshotDataFromGTM(Snapshot snapshot) { GTM_Snapshot gtm_snapshot; + GlobalTransactionId reporting_xmin; bool canbe_grouped = (!FirstSnapshotSet) || (!IsolationUsesXactSnapshot()); + bool xmin_changed = false; /* - * A transaction requesting a snapshot typically does not need an XID - * assigned to it and we recently made a provision for backends to obtain - * snapshots from the GTM without first obtaining an XID. But this lead to - * an interesting situation. 
- * - * If a backend can request a snapshot without associated XID, GTM has no - * information about such transactions or snapshots (we don't report BEGIN - * TRANSACTION * to GTM until an XID is assigned). So it may advance - * RecentGlobalXmin beyond the horizon mandated by a snapshot currently - * being used by a backend. That would most likely result in tuples being - * removed while they are still visible to an on-going scan. + * We never want to use a snapshot whose xmin is older than the + * RecentGlobalXmin computed by the GTM. While it does not look likely that + * that this will ever happen because both these computations happen on the + * GTM, we are still worried about a race condition where a backend sends a + * snapshot request, and before snapshot is received, the cluster monitor + * reports our Xmin (which obviously does not include this snapshot's + * xmin). Now if GTM processes the snapshot request first, computes + * snapshot's xmin and then receives our Xmin-report, it may actually moves + * RecentGlobalXmin beyond snapshot's xmin assuming some transactions + * finished in between. * - * Ideally we need a mechanism for GTM to track active snapshots and ensure - * that RecentGlobalXmin does not go past the lowest xmins of all such - * snapshots. But since we currently don't have a mechanism to do so, we - * force an XID assignment for this transaction which then acts as an - * anchor to track xmin at the GTM. + * We try to introduce some interlock between the Xmin reporting and + * snapshot request. Since we don't want to wait on a lock while Xmin is + * being reported by the cluster monitor process, we just make sure that + * the snapshot's xmin is not older than the Xmin we are currently + * reporting. Given that this is a very rare possibility, we just get a + * fresh snapshot from the GTM. * - * XXX We do it only when snapshots are obtained on a local coordinator. 
- * That means for cases where a datanode or a remote coordinator gets a - * direct snapshot from the GTM, it is still vulerable to tuples getting - * removed underneath a scan. But the requirements for a direct datanode - * snapshot are limited such as pgxc_node catalog scan at connection - * establishment or auto-analyze scans. Nevertheless, we MUST fix this - * before going to production release - */ - gtm_snapshot = GetSnapshotGTM(GetCurrentTransactionIdIfAny(), canbe_grouped); + */ + + LWLockAcquire(ClusterMonitorLock, LW_SHARED); +retry: + reporting_xmin = ClusterMonitorGetReportingGlobalXmin(); + xmin_changed = false; + if (TransactionIdIsValid(reporting_xmin) && + !TransactionIdIsValid(MyPgXact->xmin)) + { + MyPgXact->xmin = reporting_xmin; + xmin_changed = true; + } + + gtm_snapshot = GetSnapshotGTM(GetCurrentTransactionIdIfAny(), canbe_grouped); + if (!gtm_snapshot) ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), errmsg("GTM error, could not obtain snapshot. Current XID = %d, Autovac = %d", GetCurrentTransactionId(), IsAutoVacuumWorkerProcess()))); else { + if (xmin_changed) + MyPgXact->xmin = InvalidTransactionId; + if (TransactionIdPrecedes(gtm_snapshot->sn_xmin, reporting_xmin)) + goto retry; + /* * Set RecentGlobalXmin by copying from the shared memory state * maintained by the Clutser Monitor @@ -3227,6 +3244,7 @@ GetSnapshotDataFromGTM(Snapshot snapshot) gtm_snapshot->sn_xcnt, gtm_snapshot->sn_xip, SNAPSHOT_DIRECT); GetSnapshotFromGlobalSnapshot(snapshot); } + LWLockRelease(ClusterMonitorLock); } static void diff --git a/src/include/postmaster/clustermon.h b/src/include/postmaster/clustermon.h index 0c97fa9fdd..2cd0aefc01 100644 --- a/src/include/postmaster/clustermon.h +++ b/src/include/postmaster/clustermon.h @@ -20,6 +20,7 @@ typedef struct { slock_t mutex; GlobalTransactionId reported_recent_global_xmin; + GlobalTransactionId reporting_recent_global_xmin; GlobalTransactionId gtm_recent_global_xmin; } ClusterMonitorCtlData; @@ -32,6 +33,7 @@ 
extern bool IsClusterMonitorProcess(void); extern int StartClusterMonitor(void); GlobalTransactionId ClusterMonitorGetGlobalXmin(void); void ClusterMonitorSetGlobalXmin(GlobalTransactionId xmin); +GlobalTransactionId ClusterMonitorGetReportingXmin(void); #ifdef EXEC_BACKEND extern void ClusterMonitorIAm(void); diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index b705a593ee..68bd919d0e 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -141,13 +141,18 @@ extern PGDLLIMPORT LWLockPadded *MainLWLockArray; #define BarrierLock (&MainLWLockArray[38].lock) #define NodeTableLock (&MainLWLockArray[39].lock) #define SQueuesLock (&MainLWLockArray[40].lock) +#define ClusterMonitorLock (&MainLWLockArray[41].lock) +#define CommitTsControlLock (&MainLWLockArray[42].lock) +#define CommitTsLock (&MainLWLockArray[43].lock) +#define ReplicationOriginLock (&MainLWLockArray[44].lock) +#else +#define CommitTsControlLock (&MainLWLockArray[38].lock) +#define CommitTsLock (&MainLWLockArray[39].lock) +#define ReplicationOriginLock (&MainLWLockArray[40].lock) #endif -#define CommitTsControlLock (&MainLWLockArray[41].lock) -#define CommitTsLock (&MainLWLockArray[42].lock) -#define ReplicationOriginLock (&MainLWLockArray[43].lock) #ifdef PGXC -#define NUM_INDIVIDUAL_LWLOCKS 44 +#define NUM_INDIVIDUAL_LWLOCKS 45 #else #define NUM_INDIVIDUAL_LWLOCKS 41 #endif |