diff options
author | Pavan Deolasee | 2016-01-22 02:49:51 +0000 |
---|---|---|
committer | Pavan Deolasee | 2016-10-18 09:45:32 +0000 |
commit | 6474f98a6a8e78c3d89e4db93668cce40910c7fc (patch) | |
tree | eddc0ad2570b4860957dd4c9dd404292c10d6ee7 | |
parent | 05800170af0e10e78712ff9dfc62940eb42baa97 (diff) |
Recheck health of a node before changing its status.
A send()/recv() error only hints that something may be wrong with a node.
A mere send()/recv() failure does not mean that the node is down or
unreachable, so before changing the health status, ping the node once and
confirm whether it is actually healthy.
-rw-r--r-- | src/backend/optimizer/util/pathnode.c | 2 | ||||
-rw-r--r-- | src/backend/optimizer/util/pgxcship.c | 2 | ||||
-rw-r--r-- | src/backend/pgxc/pool/pgxcnode.c | 35 | ||||
-rw-r--r-- | src/backend/pgxc/pool/poolmgr.c | 47 | ||||
-rw-r--r-- | src/include/pgxc/poolmgr.h | 1 |
5 files changed, 64 insertions, 23 deletions
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 5f07ac0611..d42723455a 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -894,7 +894,7 @@ retry_pools: int i; bool healthmap[MaxDataNodes]; - PgxcNodeDnListHealth(rel_loc_info->nodeList, &healthmap); + PgxcNodeDnListHealth(rel_loc_info->nodeList, healthmap); i = 0; foreach(lc, rel_loc_info->nodeList) diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index fc1d189c9b..2e4e940413 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -492,7 +492,7 @@ retry_pools: ListCell *lc; bool healthmap[MaxDataNodes]; - PgxcNodeDnListHealth(rel_loc_info->nodeList, &healthmap); + PgxcNodeDnListHealth(rel_loc_info->nodeList, healthmap); i = 0; foreach(lc, rel_loc_info->nodeList) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 05efd4e675..6882d59780 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -548,17 +548,14 @@ retry: /* * before returning, also update the shared health * status field to indicate that this node could be - * possibly unavailable + * possibly unavailable. + * + * Note that this error could be due to a stale handle + * and it's possible that another backend might have + * already updated the health status OR the node + * might have already come back since the last disruption */ - if (!PgxcNodeUpdateHealth(conn->nodeoid, false)) - elog(WARNING, "Could not update health status of node %u", - conn->nodeoid); - else - elog(WARNING, "Health map updated to reflect DOWN node (%u)", - conn->nodeoid); - - /* But ping once to see if the node is still available */ - PoolPingNodes(); + PoolPingNodeRecheck(conn->nodeoid); /* Should we read from the other connections before returning? 
*/ return ERROR_OCCURED; @@ -1631,17 +1628,15 @@ pgxc_node_flush(PGXCNodeHandle *handle) /* * before returning, also update the shared health - * status field to indicate that this node is down + * status field to indicate that this node could be + * possibly unavailable. + * + * Note that this error could be due to a stale handle + * and it's possible that another backend might have + * already updated the health status OR the node + * might have already come back since the last disruption */ - if (!PgxcNodeUpdateHealth(handle->nodeoid, false)) - elog(WARNING, "Could not update health status of node %u", - handle->nodeoid); - else - elog(WARNING, "Health map updated to reflect DOWN node (%u)", - handle->nodeoid); - - /* But ping once to see if the node is still available */ - PoolPingNodes(); + PoolPingNodeRecheck(handle->nodeoid); return EOF; } } diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index fb07018500..3dac376f05 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -755,7 +755,7 @@ TryPingUnhealthyNode(Oid nodeoid) { int status; NodeDefinition *nodeDef; - char connstr[1024]; + char connstr[MAXPGPATH * 2 + 256]; nodeDef = PgxcNodeGetDefinition(nodeoid); if (nodeDef == NULL) @@ -798,6 +798,51 @@ TryPingUnhealthyNode(Oid nodeoid) } /* + * Check if a node is indeed down and if it is update its UNHEALTHY + * status + */ +void +PoolPingNodeRecheck(Oid nodeoid) +{ + int status; + NodeDefinition *nodeDef; + char connstr[MAXPGPATH * 2 + 256]; + bool healthy; + + nodeDef = PgxcNodeGetDefinition(nodeoid); + if (nodeDef == NULL) + { + /* No such definition, node dropped? 
*/ + elog(DEBUG1, "Could not find node (%u) definition," + " skipping health check", nodeoid); + return; + } + + sprintf(connstr, + "host=%s port=%d", NameStr(nodeDef->nodehost), + nodeDef->nodeport); + status = PGXCNodePing(connstr); + healthy = (status == 0); + + /* if no change in health bit, return */ + if (healthy == nodeDef->nodeishealthy) + { + pfree(nodeDef); + return; + } + + if (!PgxcNodeUpdateHealth(nodeoid, healthy)) + elog(WARNING, "Could not update health status of node (%s)", + NameStr(nodeDef->nodename)); + else + elog(LOG, "Health map updated to reflect (%s) node (%s)", + healthy ? "HEALTHY" : "UNHEALTHY", NameStr(nodeDef->nodename)); + pfree(nodeDef); + + return; +} + +/* * Ping UNHEALTHY nodes as part of the maintenance window */ void diff --git a/src/include/pgxc/poolmgr.h b/src/include/pgxc/poolmgr.h index 38d45cf255..e9f5bbd189 100644 --- a/src/include/pgxc/poolmgr.h +++ b/src/include/pgxc/poolmgr.h @@ -142,6 +142,7 @@ extern void PoolManagerLock(bool is_lock); /* Do pool health check activity */ extern void PoolPingNodes(void); +extern void PoolPingNodeRecheck(Oid nodeoid); extern bool check_persistent_connections(bool *newval, void **extra, GucSource source); |