summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPavan Deolasee2015-12-09 07:23:40 +0000
committerPavan Deolasee2015-12-09 07:23:40 +0000
commit1db1ebd18c586bc9967b45ab7d70005876fb807a (patch)
treecee10b7681aa3837d36283c3e0d2c840edee5936
parent1e0c4f448331ef92460d42ea71c3a28c42a18ed3 (diff)
Introduce a cluster monitor lock to avoid a race condition between snapshot
fetch and xmin reporting
-rw-r--r--src/backend/postmaster/clustermon.c32
-rw-r--r--src/backend/storage/ipc/procarray.c72
-rw-r--r--src/include/postmaster/clustermon.h2
-rw-r--r--src/include/storage/lwlock.h13
4 files changed, 88 insertions, 31 deletions
diff --git a/src/backend/postmaster/clustermon.c b/src/backend/postmaster/clustermon.c
index 3cb7947354..f2148667e2 100644
--- a/src/backend/postmaster/clustermon.c
+++ b/src/backend/postmaster/clustermon.c
@@ -60,6 +60,7 @@ static void cm_sighup_handler(SIGNAL_ARGS);
static void cm_sigterm_handler(SIGNAL_ARGS);
static void ClusterMonitorSetReportedGlobalXmin(GlobalTransactionId xmin);
static GlobalTransactionId ClusterMonitorGetReportedGlobalXmin(void);
+static void ClusterMonitorSetReportingGlobalXmin(GlobalTransactionId xmin);
/* PID of clustser monitoring process */
int ClusterMonitorPid = 0;
@@ -203,7 +204,10 @@ ClusterMonitorInit(void)
* interval. Keep doing this forever
*/
lastGlobalXmin = ClusterMonitorGetGlobalXmin();
+ LWLockAcquire(ClusterMonitorLock, LW_EXCLUSIVE);
oldestXmin = GetOldestXminInternal(NULL, false, true, lastGlobalXmin);
+ ClusterMonitorSetReportingGlobalXmin(oldestXmin);
+ LWLockRelease(ClusterMonitorLock);
if ((status = ReportGlobalXmin(oldestXmin, &newOldestXmin,
&latestCompletedXid)))
@@ -246,6 +250,8 @@ ClusterMonitorInit(void)
ClusterMonitorSetGlobalXmin(newOldestXmin);
}
+ ClusterMonitorSetReportingGlobalXmin(InvalidGlobalTransactionId);
+
/*
* Repeat at every 30 seconds
*/
@@ -373,6 +379,9 @@ ClusterMonitorSetGlobalXmin(GlobalTransactionId xmin)
ClusterMonitorCtl->gtm_recent_global_xmin = xmin;
SpinLockRelease(&ClusterMonitorCtl->mutex);
+ if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, xmin))
+ ShmemVariableCache->latestCompletedXid = xmin;
+
LWLockRelease(ProcArrayLock);
}
@@ -398,3 +407,26 @@ ClusterMonitorGetReportedGlobalXmin(void)
return reported_xmin;
}
+
+/*
+ * ClusterMonitorSetReportingGlobalXmin
+ *
+ * Record in ClusterMonitorCtl the xmin that the cluster monitor is about to
+ * report to the GTM. The caller passes InvalidGlobalTransactionId once the
+ * report has been processed to clear the value.
+ */
+static void
+ClusterMonitorSetReportingGlobalXmin(GlobalTransactionId xmin)
+{
+	/* XIDs are unsigned values, so log them with %u rather than %d */
+	elog(DEBUG2, "ClusterMonitorSetReportingGlobalXmin - old %u, new %u",
+		 ClusterMonitorCtl->reporting_recent_global_xmin,
+		 xmin);
+
+	/* Publish the new value under the control structure's spinlock */
+	SpinLockAcquire(&ClusterMonitorCtl->mutex);
+	ClusterMonitorCtl->reporting_recent_global_xmin = xmin;
+	SpinLockRelease(&ClusterMonitorCtl->mutex);
+}
+
+/*
+ * ClusterMonitorGetReportingGlobalXmin
+ *
+ * Return the reporting_recent_global_xmin value currently stored in
+ * ClusterMonitorCtl, read while holding the control structure's spinlock.
+ */
+GlobalTransactionId
+ClusterMonitorGetReportingGlobalXmin(void)
+{
+	GlobalTransactionId xmin;
+
+	SpinLockAcquire(&ClusterMonitorCtl->mutex);
+	xmin = ClusterMonitorCtl->reporting_recent_global_xmin;
+	SpinLockRelease(&ClusterMonitorCtl->mutex);
+
+	return xmin;
+}
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 3209679779..1407bfc07d 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -68,6 +68,7 @@
#include "access/xlog.h"
#include "catalog/catalog.h"
#include "miscadmin.h"
+#include "postmaster/clustermon.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "storage/spin.h"
@@ -1319,7 +1320,7 @@ GetOldestXminInternal(Relation rel, bool ignoreVacuum, bool computeLocal,
/* Fetch xid just once - see GetNewTransactionId */
TransactionId xid = pgxact->xid;
#ifdef XCP
- TransactionId xmin;
+ TransactionId xmin = pgxact->xmin; /* Fetch just once */
#endif
/* First consider the transaction's own Xid, if any */
@@ -1335,7 +1336,10 @@ GetOldestXminInternal(Relation rel, bool ignoreVacuum, bool computeLocal,
* Xid, that could determine some not-yet-set Xmin.
*/
#ifdef XCP
- xmin = pgxact->xmin; /* Fetch just once */
+
+		/* Trace this backend's xmin/xid; XIDs are unsigned, hence %u */
+		elog(DEBUG3, "proc: pid:%d, xmin: %u, xid: %u", proc->pid,
+			 xmin, xid);
+
if (TransactionIdIsNormal(xmin) &&
TransactionIdPrecedes(xmin, result))
result = xmin;
@@ -3174,44 +3178,57 @@ static void
GetSnapshotDataFromGTM(Snapshot snapshot)
{
GTM_Snapshot gtm_snapshot;
+ GlobalTransactionId reporting_xmin;
bool canbe_grouped = (!FirstSnapshotSet) || (!IsolationUsesXactSnapshot());
+ bool xmin_changed = false;
/*
- * A transaction requesting a snapshot typically does not need an XID
- * assigned to it and we recently made a provision for backends to obtain
- * snapshots from the GTM without first obtaining an XID. But this lead to
- * an interesting situation.
- *
- * If a backend can request a snapshot without associated XID, GTM has no
- * information about such transactions or snapshots (we don't report BEGIN
- * TRANSACTION * to GTM until an XID is assigned). So it may advance
- * RecentGlobalXmin beyond the horizon mandated by a snapshot currently
- * being used by a backend. That would most likely result in tuples being
- * removed while they are still visible to an on-going scan.
+ * We never want to use a snapshot whose xmin is older than the
+ * RecentGlobalXmin computed by the GTM. While it does not look likely that
+ * this will ever happen because both these computations happen on the
+ * GTM, we are still worried about a race condition where a backend sends a
+ * snapshot request, and before snapshot is received, the cluster monitor
+ * reports our Xmin (which obviously does not include this snapshot's
+ * xmin). Now if GTM processes the snapshot request first, computes
+ * snapshot's xmin and then receives our Xmin-report, it may actually move
+ * RecentGlobalXmin beyond snapshot's xmin assuming some transactions
+ * finished in between.
*
- * Ideally we need a mechanism for GTM to track active snapshots and ensure
- * that RecentGlobalXmin does not go past the lowest xmins of all such
- * snapshots. But since we currently don't have a mechanism to do so, we
- * force an XID assignment for this transaction which then acts as an
- * anchor to track xmin at the GTM.
+ * We try to introduce some interlock between the Xmin reporting and
+ * snapshot request. Since we don't want to wait on a lock while Xmin is
+ * being reported by the cluster monitor process, we just make sure that
+ * the snapshot's xmin is not older than the Xmin we are currently
+ * reporting. Given that this is a very rare possibility, we just get a
+ * fresh snapshot from the GTM.
*
- * XXX We do it only when snapshots are obtained on a local coordinator.
- * That means for cases where a datanode or a remote coordinator gets a
- * direct snapshot from the GTM, it is still vulerable to tuples getting
- * removed underneath a scan. But the requirements for a direct datanode
- * snapshot are limited such as pgxc_node catalog scan at connection
- * establishment or auto-analyze scans. Nevertheless, we MUST fix this
- * before going to production release
- */
- gtm_snapshot = GetSnapshotGTM(GetCurrentTransactionIdIfAny(), canbe_grouped);
+ */
+
+ LWLockAcquire(ClusterMonitorLock, LW_SHARED);
+retry:
+ reporting_xmin = ClusterMonitorGetReportingGlobalXmin();
+ xmin_changed = false;
+ if (TransactionIdIsValid(reporting_xmin) &&
+ !TransactionIdIsValid(MyPgXact->xmin))
+ {
+ MyPgXact->xmin = reporting_xmin;
+ xmin_changed = true;
+ }
+
+ gtm_snapshot = GetSnapshotGTM(GetCurrentTransactionIdIfAny(), canbe_grouped);
+
if (!gtm_snapshot)
ereport(ERROR,
(errcode(ERRCODE_CONNECTION_FAILURE),
errmsg("GTM error, could not obtain snapshot. Current XID = %d, Autovac = %d", GetCurrentTransactionId(), IsAutoVacuumWorkerProcess())));
else
{
+ if (xmin_changed)
+ MyPgXact->xmin = InvalidTransactionId;
+ if (TransactionIdPrecedes(gtm_snapshot->sn_xmin, reporting_xmin))
+ goto retry;
+
/*
* Set RecentGlobalXmin by copying from the shared memory state
* maintained by the Clutser Monitor
@@ -3227,6 +3244,7 @@ GetSnapshotDataFromGTM(Snapshot snapshot)
gtm_snapshot->sn_xcnt, gtm_snapshot->sn_xip, SNAPSHOT_DIRECT);
GetSnapshotFromGlobalSnapshot(snapshot);
}
+ LWLockRelease(ClusterMonitorLock);
}
static void
diff --git a/src/include/postmaster/clustermon.h b/src/include/postmaster/clustermon.h
index 0c97fa9fdd..2cd0aefc01 100644
--- a/src/include/postmaster/clustermon.h
+++ b/src/include/postmaster/clustermon.h
@@ -20,6 +20,7 @@ typedef struct
{
slock_t mutex;
GlobalTransactionId reported_recent_global_xmin;
+ GlobalTransactionId reporting_recent_global_xmin;
GlobalTransactionId gtm_recent_global_xmin;
} ClusterMonitorCtlData;
@@ -32,6 +33,7 @@ extern bool IsClusterMonitorProcess(void);
extern int StartClusterMonitor(void);
GlobalTransactionId ClusterMonitorGetGlobalXmin(void);
void ClusterMonitorSetGlobalXmin(GlobalTransactionId xmin);
+/* Xmin the cluster monitor is currently reporting; defined in clustermon.c */
+GlobalTransactionId ClusterMonitorGetReportingGlobalXmin(void);
#ifdef EXEC_BACKEND
extern void ClusterMonitorIAm(void);
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index b705a593ee..68bd919d0e 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -141,13 +141,18 @@ extern PGDLLIMPORT LWLockPadded *MainLWLockArray;
#define BarrierLock (&MainLWLockArray[38].lock)
#define NodeTableLock (&MainLWLockArray[39].lock)
#define SQueuesLock (&MainLWLockArray[40].lock)
+#define ClusterMonitorLock (&MainLWLockArray[41].lock)
+#define CommitTsControlLock (&MainLWLockArray[42].lock)
+#define CommitTsLock (&MainLWLockArray[43].lock)
+#define ReplicationOriginLock (&MainLWLockArray[44].lock)
+#else
+#define CommitTsControlLock (&MainLWLockArray[38].lock)
+#define CommitTsLock (&MainLWLockArray[39].lock)
+#define ReplicationOriginLock (&MainLWLockArray[40].lock)
#endif
-#define CommitTsControlLock (&MainLWLockArray[41].lock)
-#define CommitTsLock (&MainLWLockArray[42].lock)
-#define ReplicationOriginLock (&MainLWLockArray[43].lock)
#ifdef PGXC
-#define NUM_INDIVIDUAL_LWLOCKS 44
+#define NUM_INDIVIDUAL_LWLOCKS 45
#else
#define NUM_INDIVIDUAL_LWLOCKS 41
#endif