diff options
author | Pavan Deolasee | 2016-02-11 07:58:11 +0000 |
---|---|---|
committer | Pavan Deolasee | 2016-10-18 09:57:45 +0000 |
commit | 56c8520aee868cbb708d5869efab5eeae248d425 (patch) | |
tree | 14a4d54ed9f01fc44d220a368a050fd15cd024f6 | |
parent | 48a9e0ab34d94064f9dec11149811e7bd880945d (diff) |
Improve node exclusion and node rejoining logic for the calculation of the
global xmin.
When a node rejoins the cluster, after disconnection or restart, the logic now
accounts for the fact that the node might be running with an older xmin. GTM
sends back appropriate error codes and recent state information so that the
node can make a decision to join the cluster or fail.
Also increase the threshold for delay in reporting to 10 minutes to avoid false
positives.
-rw-r--r-- | src/backend/access/transam/gtm.c | 8 | ||||
-rw-r--r-- | src/backend/postmaster/clustermon.c | 25 | ||||
-rw-r--r-- | src/backend/storage/ipc/procarray.c | 5 | ||||
-rw-r--r-- | src/gtm/main/gtm_txn.c | 6 | ||||
-rw-r--r-- | src/gtm/recovery/register_common.c | 72 | ||||
-rw-r--r-- | src/include/gtm/register.h | 1 |
6 files changed, 82 insertions, 35 deletions
diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 26defd944e..64f39d19ad 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -704,10 +704,8 @@ ReportGlobalXmin(GlobalTransactionId gxid, GlobalTransactionId *global_xmin, if (!conn) return EOF; - if (report_global_xmin(conn, PGXCNodeName, + report_global_xmin(conn, PGXCNodeName, IS_PGXC_COORDINATOR ? GTM_NODE_COORDINATOR : GTM_NODE_DATANODE, - gxid, global_xmin, latest_completed_xid, &errcode)) - return errcode; - else - return 0; + gxid, global_xmin, latest_completed_xid, &errcode); + return errcode; } diff --git a/src/backend/postmaster/clustermon.c b/src/backend/postmaster/clustermon.c index f2148667e2..2ff09787ed 100644 --- a/src/backend/postmaster/clustermon.c +++ b/src/backend/postmaster/clustermon.c @@ -212,8 +212,11 @@ ClusterMonitorInit(void) if ((status = ReportGlobalXmin(oldestXmin, &newOldestXmin, &latestCompletedXid))) { - elog(DEBUG2, "Failed to report RecentGlobalXmin to GTM - %d:%d", - status, newOldestXmin); + elog(DEBUG1, "Failed (status %d) to report RecentGlobalXmin " + "- reported RecentGlobalXmin %d, received " + "RecentGlobalXmin %d, " "received latestCompletedXid", + status, oldestXmin, newOldestXmin, + latestCompletedXid); if (status == GTM_ERRCODE_TOO_OLD_XMIN || status == GTM_ERRCODE_NODE_EXCLUDED) { @@ -238,14 +241,17 @@ ClusterMonitorInit(void) SetLatestCompletedXid(latestCompletedXid); continue; } - elog(PANIC, "Global xmin computation mismatch"); } } else { + elog(DEBUG1, "Successfully reported xmin to GTM - reported_xmin %d," + "received RecentGlobalXmin %d, " + "received latestCompletedXid %d", oldestXmin, + newOldestXmin, latestCompletedXid); + SetLatestCompletedXid(latestCompletedXid); ClusterMonitorSetReportedGlobalXmin(oldestXmin); - elog(DEBUG2, "Updating global_xmin to %d", newOldestXmin); if (GlobalTransactionIdIsValid(newOldestXmin)) ClusterMonitorSetGlobalXmin(newOldestXmin); } @@ -373,15 +379,20 @@ void 
ClusterMonitorSetGlobalXmin(GlobalTransactionId xmin) { LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + /* + * Do a consistency check to ensure that we NEVER have running transactions + * with xmin less than what the GTM has already computed. While during + * normal execution, this should never happen, if we ever been excluded + * from the xmin calculation by the GTM while we are still running old + * transactions, PANIC is our best bet to avoid corruption + */ ProcArrayCheckXminConsistency(xmin); SpinLockAcquire(&ClusterMonitorCtl->mutex); ClusterMonitorCtl->gtm_recent_global_xmin = xmin; SpinLockRelease(&ClusterMonitorCtl->mutex); - if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, xmin)) - ShmemVariableCache->latestCompletedXid = xmin; - LWLockRelease(ProcArrayLock); } diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 55456cd3be..216891485b 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -1302,6 +1302,9 @@ GetOldestXminInternal(Relation rel, bool ignoreVacuum, bool computeLocal, TransactionIdAdvance(result); #endif + elog(DEBUG1, "GetOldestXminInternal - Starting computation with" + "latestCompletedXid %d + 1", result); + for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; @@ -1342,7 +1345,7 @@ GetOldestXminInternal(Relation rel, bool ignoreVacuum, bool computeLocal, */ #ifdef XCP - elog(DEBUG3, "proc: pid:%d, xmin: %d, xid: %d", proc->pid, + elog(DEBUG1, "proc: pid:%d, xmin: %d, xid: %d", proc->pid, xmin, xid); if (TransactionIdIsNormal(xmin) && diff --git a/src/gtm/main/gtm_txn.c b/src/gtm/main/gtm_txn.c index f6453532a6..bc678f9770 100644 --- a/src/gtm/main/gtm_txn.c +++ b/src/gtm/main/gtm_txn.c @@ -565,7 +565,7 @@ GTM_GetGlobalTransactionIdMulti(GTM_TransactionHandle handle[], int txn_count, if (GlobalTransactionIdIsValid(gtm_txninfo->gti_gxid)) { gxid[ii] = gtm_txninfo->gti_gxid; - elog(DEBUG2, 
"GTM_TransactionInfo has XID already assgined - %s:%d", + elog(DEBUG1, "GTM_TransactionInfo has XID already assgined - %s:%d", gtm_txninfo->gti_global_session_id, gxid[ii]); continue; } @@ -604,7 +604,7 @@ GTM_GetGlobalTransactionIdMulti(GTM_TransactionHandle handle[], int txn_count, GlobalTransactionIdAdvance(GTMTransactions.gt_nextXid); - elog(DEBUG2, "Assigning new transaction ID = %s:%d", + elog(DEBUG1, "Assigning new transaction ID = %s:%d", gtm_txninfo->gti_global_session_id, xid); gxid[ii] = gtm_txninfo->gti_gxid = xid; new_handle[*new_txn_count] = gtm_txninfo->gti_handle; @@ -725,7 +725,7 @@ GTM_BeginTransactionMulti(GTM_IsolationLevel isolevel[], if (txn != InvalidTransactionHandle) { gtm_txninfo[kk] = GTM_HandleToTransactionInfo(txn); - elog(DEBUG2, "Existing transaction found: %s:%d", + elog(DEBUG1, "Existing transaction found: %s:%d", gtm_txninfo[kk]->gti_global_session_id, gtm_txninfo[kk]->gti_gxid); txns[kk] = txn; diff --git a/src/gtm/recovery/register_common.c b/src/gtm/recovery/register_common.c index f8d2748a77..ebca82b828 100644 --- a/src/gtm/recovery/register_common.c +++ b/src/gtm/recovery/register_common.c @@ -941,9 +941,9 @@ GTM_InitNodeManager(void) } /* - * Set to 120 seconds, but should be a few multiple for cluster monitor naptime + * Set to 600 seconds, but should be a few multiple for cluster monitor naptime */ -#define GTM_REPORT_XMIN_DELAY_THRESHOLD (120 * 1000) +#define GTM_REPORT_XMIN_DELAY_THRESHOLD (600 * 1000) GlobalTransactionId GTM_HandleGlobalXmin(GTM_PGXCNodeType type, char *node_name, @@ -988,20 +988,31 @@ GTM_HandleGlobalXmin(GTM_PGXCNodeType type, char *node_name, * get opportunity to report xmin in a timely fashion, we shouldn't get * into this situation often. * - * The exception to this rule is that if the remote node is idle, then we - * actually ignore the xmin reported by it and instead calculate a new xmin - * for it and send it back in respone. 
The remote node will still done - * final sanity check and either accept that xmin or kill itself via PANIC - * mechanism. */ if ((mynodeinfo->excluded) && - GlobalTransactionIdPrecedes(mynodeinfo->reported_xmin, - GTM_GlobalXmin)) + GlobalTransactionIdPrecedes(reported_xmin, GTM_GlobalXmin)) { *errcode = GTM_ERRCODE_NODE_EXCLUDED; + + /* + * This node is joining back the cluster after being excluded from the + * GTM_GlobalXmin calculation because of timeout, disconnection or node + * failure. In such cases, we send appropriate error back to the node + * and let it handle the situation. To ensure that our GTM_GlobalXmin + * does not keep advancing while the node is trying to join back the + * cluster, we temporarily set reported_xmin to the current + * GTM_GlobalXmin and wait to see if the node finally catches up. + * + * Note: If the node had old transaction running while it was excluded + * by the GTM, it will fail the consistency checks and restart itself. + */ + mynodeinfo->joining = true; + mynodeinfo->reported_xmin_time = GTM_TimestampGetCurrent(); + mynodeinfo->reported_xmin = GTM_GlobalXmin; + GTM_RWLockRelease(&mynodeinfo->node_lock); elog(LOG, "GTM_ERRCODE_NODE_EXCLUDED - node_name %s, reported_xmin %d " - "previously reported_xmin, GTM_GlobalXmin %d", node_name, + "previously reported_xmin %d, GTM_GlobalXmin %d", node_name, reported_xmin, mynodeinfo->reported_xmin, GTM_GlobalXmin); @@ -1009,13 +1020,18 @@ GTM_HandleGlobalXmin(GTM_PGXCNodeType type, char *node_name, } /* - * The remote node must not report a xmin which precedes the xmin it had - * reported in the past. If it ever happens, send an error back and let the - * remote node restart itself + * The remote node must not report a xmin which precedes the GTM_GlobalXmin + * we have already computed. 
If it ever happens, send an error back and let + * the remote node handle it, possibly restarting itself */ - if (GlobalTransactionIdPrecedes(reported_xmin, mynodeinfo->reported_xmin)) + if (GlobalTransactionIdPrecedes(reported_xmin, GTM_GlobalXmin)) { *errcode = GTM_ERRCODE_TOO_OLD_XMIN; + + mynodeinfo->joining = true; + mynodeinfo->reported_xmin_time = GTM_TimestampGetCurrent(); + mynodeinfo->reported_xmin = GTM_GlobalXmin; + GTM_RWLockRelease(&mynodeinfo->node_lock); /* @@ -1028,8 +1044,8 @@ GTM_HandleGlobalXmin(GTM_PGXCNodeType type, char *node_name, */ if (mynodeinfo->reported_xmin_time) elog(LOG, "GTM_ERRCODE_TOO_OLD_XMIN - node_name %s, reported_xmin %d, " - "previously reported_xmin %d", node_name, - reported_xmin, mynodeinfo->reported_xmin); + "previously reported_xmin %d, GTM_GlobalXmin %d", node_name, + reported_xmin, mynodeinfo->reported_xmin, GTM_GlobalXmin); return InvalidGlobalTransactionId; } @@ -1038,7 +1054,15 @@ GTM_HandleGlobalXmin(GTM_PGXCNodeType type, char *node_name, mynodeinfo->reported_xmin); mynodeinfo->reported_xmin = reported_xmin; - mynodeinfo->excluded = false; + + /* + * Node joined back, set both excluded and joining to false + */ + if (mynodeinfo->excluded) + { + mynodeinfo->excluded = false; + mynodeinfo->joining = false; + } mynodeinfo->reported_xmin_time = current_time = GTM_TimestampGetCurrent(); GTM_RWLockRelease(&mynodeinfo->node_lock); @@ -1060,13 +1084,22 @@ GTM_HandleGlobalXmin(GTM_PGXCNodeType type, char *node_name, { GTM_PGXCNodeInfo *nodeinfo = all_nodes[ii]; - elog(DEBUG1, "nodeinfo %p, type: %d, exclude %c, xmin %d, time %lld", + elog(DEBUG1, "nodeinfo %p, type: %d, exclude %c, xmin %d, time %ld", nodeinfo, nodeinfo->type, nodeinfo->excluded ? 
'T' : 'F', nodeinfo->reported_xmin, nodeinfo->reported_xmin_time); - if (nodeinfo->excluded) + /* + * If a node has not reported its status for + * GTM_REPORT_XMIN_DELAY_THRESHOLD and neither in the process of + * rejoining the cluster, don't include it in the GTM_GlobalXmin + * calculation + */ + if (nodeinfo->excluded && !nodeinfo->joining) continue; + /* + * Care only for datanodes and coordinators + */ if (nodeinfo->type != GTM_NODE_COORDINATOR && nodeinfo->type != GTM_NODE_DATANODE) continue; @@ -1087,6 +1120,7 @@ GTM_HandleGlobalXmin(GTM_PGXCNodeType type, char *node_name, current_time, GTM_REPORT_XMIN_DELAY_THRESHOLD)) { nodeinfo->excluded = true; + nodeinfo->joining = false; GTM_RWLockRelease(&nodeinfo->node_lock); continue; } diff --git a/src/include/gtm/register.h b/src/include/gtm/register.h index 0212a9ecf8..a50ffe0c21 100644 --- a/src/include/gtm/register.h +++ b/src/include/gtm/register.h @@ -62,6 +62,7 @@ typedef struct GTM_PGXCNodeInfo * Has the node timed out and be * excluded from xmin computation? */ + bool joining; /* Is the node joining back */ bool idle; /* Has the node been idle since * last report */ |