summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPavan Deolasee2016-01-22 02:49:51 +0000
committerPavan Deolasee2016-10-18 09:45:32 +0000
commit6474f98a6a8e78c3d89e4db93668cce40910c7fc (patch)
treeeddc0ad2570b4860957dd4c9dd404292c10d6ee7
parent05800170af0e10e78712ff9dfc62940eb42baa97 (diff)
Recheck health of a node before changing its status.
send/recv() errors only give us a hint that something may be going wrong with a node; a mere send/recv failure does not mean that the node is down or unreachable. So before changing the health status, ping the node once to confirm whether it is actually healthy.
-rw-r--r--src/backend/optimizer/util/pathnode.c2
-rw-r--r--src/backend/optimizer/util/pgxcship.c2
-rw-r--r--src/backend/pgxc/pool/pgxcnode.c35
-rw-r--r--src/backend/pgxc/pool/poolmgr.c47
-rw-r--r--src/include/pgxc/poolmgr.h1
5 files changed, 64 insertions, 23 deletions
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 5f07ac0611..d42723455a 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -894,7 +894,7 @@ retry_pools:
int i;
bool healthmap[MaxDataNodes];
- PgxcNodeDnListHealth(rel_loc_info->nodeList, &healthmap);
+ PgxcNodeDnListHealth(rel_loc_info->nodeList, healthmap);
i = 0;
foreach(lc, rel_loc_info->nodeList)
diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c
index fc1d189c9b..2e4e940413 100644
--- a/src/backend/optimizer/util/pgxcship.c
+++ b/src/backend/optimizer/util/pgxcship.c
@@ -492,7 +492,7 @@ retry_pools:
ListCell *lc;
bool healthmap[MaxDataNodes];
- PgxcNodeDnListHealth(rel_loc_info->nodeList, &healthmap);
+ PgxcNodeDnListHealth(rel_loc_info->nodeList, healthmap);
i = 0;
foreach(lc, rel_loc_info->nodeList)
diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c
index 05efd4e675..6882d59780 100644
--- a/src/backend/pgxc/pool/pgxcnode.c
+++ b/src/backend/pgxc/pool/pgxcnode.c
@@ -548,17 +548,14 @@ retry:
/*
* before returning, also update the shared health
* status field to indicate that this node could be
- * possibly unavailable
+ * possibly unavailable.
+ *
+ * Note that this error could be due to a stale handle
+ * and it's possible that another backend might have
+ * already updated the health status OR the node
+ * might have already come back since the last disruption
*/
- if (!PgxcNodeUpdateHealth(conn->nodeoid, false))
- elog(WARNING, "Could not update health status of node %u",
- conn->nodeoid);
- else
- elog(WARNING, "Health map updated to reflect DOWN node (%u)",
- conn->nodeoid);
-
- /* But ping once to see if the node is still available */
- PoolPingNodes();
+ PoolPingNodeRecheck(conn->nodeoid);
/* Should we read from the other connections before returning? */
return ERROR_OCCURED;
@@ -1631,17 +1628,15 @@ pgxc_node_flush(PGXCNodeHandle *handle)
/*
* before returning, also update the shared health
- * status field to indicate that this node is down
+ * status field to indicate that this node could be
+ * possibly unavailable.
+ *
+ * Note that this error could be due to a stale handle
+ * and it's possible that another backend might have
+ * already updated the health status OR the node
+ * might have already come back since the last disruption
*/
- if (!PgxcNodeUpdateHealth(handle->nodeoid, false))
- elog(WARNING, "Could not update health status of node %u",
- handle->nodeoid);
- else
- elog(WARNING, "Health map updated to reflect DOWN node (%u)",
- handle->nodeoid);
-
- /* But ping once to see if the node is still available */
- PoolPingNodes();
+ PoolPingNodeRecheck(handle->nodeoid);
return EOF;
}
}
diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c
index fb07018500..3dac376f05 100644
--- a/src/backend/pgxc/pool/poolmgr.c
+++ b/src/backend/pgxc/pool/poolmgr.c
@@ -755,7 +755,7 @@ TryPingUnhealthyNode(Oid nodeoid)
{
int status;
NodeDefinition *nodeDef;
- char connstr[1024];
+ char connstr[MAXPGPATH * 2 + 256];
nodeDef = PgxcNodeGetDefinition(nodeoid);
if (nodeDef == NULL)
@@ -798,6 +798,51 @@ TryPingUnhealthyNode(Oid nodeoid)
}
/*
+ * Ping the given node once and, if the result disagrees with the health
+ * flag in its node definition, call PgxcNodeUpdateHealth() to record the
+ * new state (in either direction). Does nothing if no definition exists.
+ */
+void
+PoolPingNodeRecheck(Oid nodeoid)
+{
+	int status;
+	NodeDefinition *nodeDef;
+	char connstr[MAXPGPATH * 2 + 256];
+	bool healthy;
+
+	nodeDef = PgxcNodeGetDefinition(nodeoid);
+	if (nodeDef == NULL)
+	{
+		/* No such definition, node dropped? */
+		elog(DEBUG1, "Could not find node (%u) definition,"
+			 " skipping health check", nodeoid);
+		return;
+	}
+
+	/* Bounded formatting; a zero ping status is treated as healthy */
+	snprintf(connstr, sizeof(connstr),
+			 "host=%s port=%d", NameStr(nodeDef->nodehost),
+			 nodeDef->nodeport);
+	status = PGXCNodePing(connstr);
+	healthy = (status == 0);
+
+	/* if no change in health bit, return */
+	if (healthy == nodeDef->nodeishealthy)
+	{
+		pfree(nodeDef);
+		return;
+	}
+
+	if (!PgxcNodeUpdateHealth(nodeoid, healthy))
+		elog(WARNING, "Could not update health status of node (%s)",
+			 NameStr(nodeDef->nodename));
+	else
+		elog(LOG, "Health map updated to reflect (%s) node (%s)",
+			 healthy ? "HEALTHY" : "UNHEALTHY", NameStr(nodeDef->nodename));
+	pfree(nodeDef);
+}
+
+/*
* Ping UNHEALTHY nodes as part of the maintenance window
*/
void
diff --git a/src/include/pgxc/poolmgr.h b/src/include/pgxc/poolmgr.h
index 38d45cf255..e9f5bbd189 100644
--- a/src/include/pgxc/poolmgr.h
+++ b/src/include/pgxc/poolmgr.h
@@ -142,6 +142,7 @@ extern void PoolManagerLock(bool is_lock);
/* Do pool health check activity */
extern void PoolPingNodes(void);
+extern void PoolPingNodeRecheck(Oid nodeoid);
extern bool check_persistent_connections(bool *newval, void **extra,
GucSource source);