author		Pavan Deolasee	2016-01-14 18:26:06 +0000
committer	Pavan Deolasee	2016-10-18 09:41:41 +0000
commit		396db0c198ac08894654f6575715a55c318a3aec (patch)
tree		d9005752b01d1cfad73f9e1ca2b30ec2d2783d07
parent		0786b8439e48734785f540f88f600ab09cdb805f (diff)
Introduce a healthmap for tracking health status of all other nodes in the
cluster.
Each node now maintains a healthmap in shared memory to keep track of the
availability of all other nodes in the cluster. Whenever a send() or a
receive() call on a socket fails, we try to ping the node once, and if that
fails, we mark the node as UNHEALTHY. Conversely, if a new connection to a
node is later established successfully, we mark the node as HEALTHY.
We also periodically ping UNHEALTHY nodes to see if they have come back
to life, and update the healthmap accordingly.
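
To make the mechanism concrete, here is a condensed sketch of the failure
path described above, built from the two functions this commit adds
(PgxcNodeUpdateHealth, PoolPingNodes). The wrapper name
handle_socket_failure is illustrative only; in the patch these calls are
inlined into the error paths of pgxc_node_receive() and pgxc_node_flush():

	/*
	 * Sketch: what the send()/receive() error paths now do.
	 * handle_socket_failure() is a hypothetical wrapper for illustration.
	 */
	static void
	handle_socket_failure(PGXCNodeHandle *handle)
	{
		/* Mark the peer as UNHEALTHY in the shared-memory healthmap */
		if (!PgxcNodeUpdateHealth(handle->nodeoid, false))
			elog(WARNING, "Could not update health status of node %u",
				 handle->nodeoid);

		/*
		 * Re-ping all UNHEALTHY nodes once right away, so a node that
		 * merely dropped one connection can be marked HEALTHY again
		 * without waiting for the next pooler maintenance cycle.
		 */
		PoolPingNodes();
	}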
The only consumer of the healthmap right now is SELECT queries on replicated
tables. When a table is replicated to more than one node, we now consult the
healthmap and discard nodes that are known to be UNHEALTHY. If all nodes are
found to be UNHEALTHY, one attempt is made to see if any of them have come back
online.
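
The consumer-side logic reduces to the following sketch, condensed from
set_scanpath_distribution() and pgxc_FQS_get_relation_nodes() in the diff
below; the helper name select_healthy_replicas is hypothetical:

	/*
	 * Sketch: keep only healthy datanodes from a replicated table's node
	 * list; if none survive, re-ping once and retry before giving up.
	 */
	static List *
	select_healthy_replicas(List *nodeList)
	{
		bool		healthmap[MaxDataNodes];
		bool		retried = false;
		List	   *healthy;
		ListCell   *lc;
		int			i;

	retry:
		healthy = NIL;
		i = 0;
		PgxcNodeDnListHealth(nodeList, healthmap);
		foreach(lc, nodeList)
		{
			if (healthmap[i++])
				healthy = lappend_int(healthy, lfirst_int(lc));
		}

		if (healthy == NIL)
		{
			if (!retried)
			{
				PoolPingNodes();	/* one attempt to revive dead replicas */
				retried = true;
				goto retry;
			}
			elog(ERROR, "could not find healthy nodes for replicated table");
		}
		return healthy;
	}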
-rw-r--r--	src/backend/optimizer/path/pgxcpath.c	4
-rw-r--r--	src/backend/optimizer/util/pathnode.c	53
-rw-r--r--	src/backend/optimizer/util/pgxcship.c	55
-rw-r--r--	src/backend/pgxc/locator/locator.c	5
-rw-r--r--	src/backend/pgxc/nodemgr/nodemgr.c	193
-rw-r--r--	src/backend/pgxc/pool/pgxcnode.c	43
-rw-r--r--	src/backend/pgxc/pool/poolmgr.c	121
-rw-r--r--	src/include/pgxc/locator.h	1
-rw-r--r--	src/include/pgxc/nodemgr.h	12
-rw-r--r--	src/include/pgxc/pgxcnode.h	1
-rw-r--r--	src/include/pgxc/poolmgr.h	2
11 files changed, 480 insertions, 10 deletions
diff --git a/src/backend/optimizer/path/pgxcpath.c b/src/backend/optimizer/path/pgxcpath.c
index 7f62d6d64a..ff1ed1e4e8 100644
--- a/src/backend/optimizer/path/pgxcpath.c
+++ b/src/backend/optimizer/path/pgxcpath.c
@@ -112,6 +112,7 @@ create_plainrel_rqpath(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
 {
 	List	   *quals;
 	ExecNodes  *exec_nodes;
+	RelationLocInfo *rel_loc_info;
 
 	/*
 	 * If we are on the Coordinator, we always want to use
@@ -122,7 +123,8 @@ create_plainrel_rqpath(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
 		return false;
 
 	quals = extract_actual_clauses(rel->baserestrictinfo, false);
-	exec_nodes = GetRelationNodesByQuals(rte->relid, rel->relid,
+	rel_loc_info = GetRelationLocInfo(rte->relid);
+	exec_nodes = GetRelationNodesByQuals(rte->relid, rel_loc_info, rel->relid,
 										 (Node *)quals, RELATION_ACCESS_READ);
 
 	if (!exec_nodes)
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 027d28e261..5f07ac0611 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -67,6 +67,7 @@ static Path *redistribute_path(Path *subpath, char distributionType,
 		Node* distributionExpr);
 static void set_scanpath_distribution(PlannerInfo *root, RelOptInfo *rel, Path *pathnode);
 static List *set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode);
+extern void PoolPingNodes(void);
 #endif
 
 /*****************************************************************************
@@ -878,11 +879,57 @@ set_scanpath_distribution(PlannerInfo *root, RelOptInfo *rel, Path *pathnode)
 	if (rel_loc_info)
 	{
 		ListCell *lc;
+		bool retry = true;
 		Distribution *distribution = makeNode(Distribution);
 		distribution->distributionType = rel_loc_info->locatorType;
-		foreach(lc, rel_loc_info->nodeList)
-			distribution->nodes = bms_add_member(distribution->nodes,
-												 lfirst_int(lc));
+		/*
+		 * for LOCATOR_TYPE_REPLICATED distribution, check if
+		 * all of the mentioned nodes are hale and hearty. Remove
+		 * those which are not. Do this only for SELECT queries!
+		 */
+retry_pools:
+		if (root->parse->commandType == CMD_SELECT &&
+			distribution->distributionType == LOCATOR_TYPE_REPLICATED)
+		{
+			int i;
+			bool healthmap[MaxDataNodes];
+
+			PgxcNodeDnListHealth(rel_loc_info->nodeList, &healthmap);
+
+			i = 0;
+			foreach(lc, rel_loc_info->nodeList)
+			{
+				if (healthmap[i++] == true)
+					distribution->nodes = bms_add_member(distribution->nodes,
+														 lfirst_int(lc));
+			}
+
+			if (bms_is_empty(distribution->nodes))
+			{
+				/*
+				 * Try an on-demand pool maintenance just to see if some nodes
+				 * have come back.
+				 *
+				 * Try once and error out if datanodes are still down
+				 */
+				if (retry)
+				{
+					PoolPingNodes();
+					retry = false;
+					goto retry_pools;
+				}
+				else
+					elog(ERROR,
+						 "Could not find healthy nodes for replicated table. Exiting!");
+			}
+		}
+		else
+		{
+			foreach(lc, rel_loc_info->nodeList)
+				distribution->nodes = bms_add_member(distribution->nodes,
+													 lfirst_int(lc));
+		}
+
 		distribution->restrictNodes = NULL;
 
 		/*
 		 * Distribution expression of the base relation is Var representing
diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c
index e463160326..fc1d189c9b 100644
--- a/src/backend/optimizer/util/pgxcship.c
+++ b/src/backend/optimizer/util/pgxcship.c
@@ -36,6 +36,7 @@
 #include "parser/parse_type.h"
 #include "pgxc/locator.h"
 #include "pgxc/pgxcnode.h"
+#include "pgxc/nodemgr.h"
 #include "utils/lsyscache.h"
 #include "utils/rel.h"
 
@@ -100,6 +101,8 @@ typedef enum
 	SS_UPDATES_DISTRIBUTION_COLUMN	/* query updates the distribution column */
 } ShippabilityStat;
 
+extern void PoolPingNodes(void);
+
 /* Manipulation of shippability reason */
 static bool pgxc_test_shippability_reason(Shippability_context *context,
 										  ShippabilityStat reason);
@@ -437,6 +440,7 @@ pgxc_FQS_get_relation_nodes(RangeTblEntry *rte, Index varno, Query *query)
 	ExecNodes	*rel_exec_nodes;
 	RelationAccessType rel_access = RELATION_ACCESS_READ;
 	RelationLocInfo *rel_loc_info;
+	bool		retry = true;
 
 	Assert(rte == rt_fetch(varno, (query->rtable)));
 
@@ -471,13 +475,62 @@ pgxc_FQS_get_relation_nodes(RangeTblEntry *rte, Index varno, Query *query)
 
 	/*
 	 * Find out the datanodes to execute this query on.
+	 * But first if it's a replicated table, identify and remove
+	 * unhealthy nodes from the rel_loc_info. Only for SELECTs!
+	 *
 	 * PGXC_FQS_TODO: for now, we apply node reduction only when there is only
 	 * one relation involved in the query. If there are multiple distributed
 	 * tables in the query and we apply node reduction here, we may fail to ship
 	 * the entire join. We should apply node reduction transitively.
 	 */
+retry_pools:
+	if (command_type == CMD_SELECT &&
+		rel_loc_info->locatorType == LOCATOR_TYPE_REPLICATED)
+	{
+		int i;
+		List *newlist = NIL;
+		ListCell *lc;
+		bool healthmap[MaxDataNodes];
+
+		PgxcNodeDnListHealth(rel_loc_info->nodeList, &healthmap);
+
+		i = 0;
+		foreach(lc, rel_loc_info->nodeList)
+		{
+			if (!healthmap[i++])
+				newlist = lappend_int(newlist, lfirst_int(lc));
+		}
+
+		if (newlist != NIL)
+			rel_loc_info->nodeList = list_difference_int(rel_loc_info->nodeList,
+														 newlist);
+		/*
+		 * If all nodes are down, cannot do much, just return NULL here
+		 */
+		if (rel_loc_info->nodeList == NIL)
+		{
+			/*
+			 * Try an on-demand pool maintenance just to see if some nodes
+			 * have come back.
+			 *
+			 * Try once and error out if datanodes are still down
+			 */
+			if (retry)
+			{
+				rel_loc_info->nodeList = newlist;
+				newlist = NIL;
+				PoolPingNodes();
+				retry = false;
+				goto retry_pools;
+			}
+			else
+				elog(ERROR,
					 "Could not find healthy datanodes for replicated table. Exiting!");
+			return NULL;
+		}
+	}
 	if (list_length(query->rtable) == 1)
-		rel_exec_nodes = GetRelationNodesByQuals(rte->relid, varno,
+		rel_exec_nodes = GetRelationNodesByQuals(rte->relid, rel_loc_info, varno,
 												 query->jointree->quals, rel_access);
 	else
 		rel_exec_nodes = GetRelationNodes(rel_loc_info, (Datum) 0,
diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c
index 6a4fd155b8..e9c6cb8273 100644
--- a/src/backend/pgxc/locator/locator.c
+++ b/src/backend/pgxc/locator/locator.c
@@ -1646,10 +1646,9 @@ GetRelationNodes(RelationLocInfo *rel_loc_info, Datum valueForDistCol,
  * is made to see if that's correct.
  */
 ExecNodes *
-GetRelationNodesByQuals(Oid reloid, Index varno, Node *quals,
-						RelationAccessType relaccess)
+GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info,
+						Index varno, Node *quals, RelationAccessType relaccess)
 {
-	RelationLocInfo *rel_loc_info = GetRelationLocInfo(reloid);
 	Expr		*distcol_expr = NULL;
 	ExecNodes	*exec_nodes;
 	Datum		distcol_value;
diff --git a/src/backend/pgxc/nodemgr/nodemgr.c b/src/backend/pgxc/nodemgr/nodemgr.c
index d8f8dc006c..3d05921c42 100644
--- a/src/backend/pgxc/nodemgr/nodemgr.c
+++ b/src/backend/pgxc/nodemgr/nodemgr.c
@@ -65,6 +65,8 @@ void
 NodeTablesShmemInit(void)
 {
 	bool found;
+	int i;
+
 	/*
 	 * Initialize the table of Coordinators: first sizeof(int) bytes are to
 	 * store actual number of Coordinators, remaining data in the structure is
@@ -83,7 +85,12 @@ NodeTablesShmemInit(void)
 
 	/* Mark it empty upon creation */
 	if (!found)
+	{
 		*shmemNumCoords = 0;
+		/* Mark nodeishealthy true at init time for all */
+		for (i = 0; i < MaxCoords; i++)
+			coDefs[i].nodeishealthy = true;
+	}
 
 	/* Same for Datanodes */
 	shmemNumDataNodes = ShmemInitStruct("Datanode Table",
@@ -91,12 +98,17 @@ NodeTablesShmemInit(void)
 				sizeof(NodeDefinition) * MaxDataNodes,
 				&found);
 
-	/* Have coDefs pointing right behind shmemNumDataNodes */
+	/* Have dnDefs pointing right behind shmemNumDataNodes */
 	dnDefs = (NodeDefinition *) (shmemNumDataNodes + 1);
 
 	/* Mark it empty upon creation */
 	if (!found)
+	{
 		*shmemNumDataNodes = 0;
+		/* Mark nodeishealthy true at init time for all */
+		for (i = 0; i < MaxDataNodes; i++)
+			dnDefs[i].nodeishealthy = true;
+	}
 }
 
 
@@ -314,9 +326,39 @@ PgxcNodeListAndCount(void)
 	Relation rel;
 	HeapScanDesc scan;
 	HeapTuple   tuple;
+	NodeHealthStatus *nodehealth;
+	int numNodes = 0;
 
 	LWLockAcquire(NodeTableLock, LW_EXCLUSIVE);
 
+	/*
+	 * Save the existing health status values because nodes
+	 * might get added or deleted here. We will save
+	 * nodeoid, status. No need to differentiate between
+	 * coords and datanodes since oids will be unique anyways
+	 */
+	if (*shmemNumDataNodes != 0 || *shmemNumCoords != 0)
+	{
+		int i, j;
+
+		numNodes = *shmemNumCoords + *shmemNumDataNodes;
+		nodehealth = palloc0(
+				numNodes * sizeof(NodeHealthStatus));
+
+		for (i = 0; i < *shmemNumCoords; i++)
+		{
+			nodehealth[i].nodeoid = coDefs[i].nodeoid;
+			nodehealth[i].nodeishealthy = coDefs[i].nodeishealthy;
+		}
+
+		j = i;
+		for (i = 0; i < *shmemNumDataNodes; i++)
+		{
+			nodehealth[j].nodeoid = dnDefs[i].nodeoid;
+			nodehealth[j++].nodeishealthy = dnDefs[i].nodeishealthy;
+		}
+	}
+
 	*shmemNumCoords = 0;
 	*shmemNumDataNodes = 0;
 
@@ -333,6 +375,7 @@ PgxcNodeListAndCount(void)
 	{
 		Form_pgxc_node  nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
 		NodeDefinition *node;
+		int i;
 
 		/* Take definition for given node type */
 		switch (nodeForm->node_type)
@@ -353,6 +396,20 @@ PgxcNodeListAndCount(void)
 		node->nodeport = nodeForm->node_port;
 		node->nodeisprimary = nodeForm->nodeis_primary;
 		node->nodeispreferred = nodeForm->nodeis_preferred;
+		/*
+		 * Copy over the health status from above for nodes that
+		 * existed before and after the refresh. If we do not find
+		 * entry for a nodeoid, we mark it as healthy
+		 */
+		node->nodeishealthy = true;
+		for (i = 0; i < numNodes; i++)
+		{
+			if (nodehealth[i].nodeoid == node->nodeoid)
+			{
+				node->nodeishealthy = nodehealth[i].nodeishealthy;
+				break;
+			}
+		}
 	}
 	heap_endscan(scan);
 	heap_close(rel, AccessShareLock);
@@ -360,6 +417,9 @@ PgxcNodeListAndCount(void)
 	elog(DEBUG1, "Done pgxc_nodes scan: %d coordinators and %d datanodes",
 		 *shmemNumCoords, *shmemNumDataNodes);
 
+	if (numNodes)
+		pfree(nodehealth);
+
 	/* Finally sort the lists */
 	if (*shmemNumCoords > 1)
 		qsort(coDefs, *shmemNumCoords, sizeof(NodeDefinition), cmp_nodes);
@@ -436,6 +496,92 @@ PgxcNodeGetOids(Oid **coOids, Oid **dnOids,
 	LWLockRelease(NodeTableLock);
 }
 
+/*
+ * PgxcNodeGetHealthMap
+ *
+ * List into palloc'ed arrays Oids of Coordinators and Datanodes currently
+ * presented in the node table, as well as number of Coordinators and Datanodes.
+ * Any parameter may be NULL if caller is not interested in receiving
+ * appropriate results for either the Coordinators or Datanodes.
+ */
+void
+PgxcNodeGetHealthMap(Oid *coOids, Oid *dnOids,
+					 int *num_coords, int *num_dns, bool *coHealthMap,
+					 bool *dnHealthMap)
+{
+	elog(DEBUG1, "Get HealthMap from table: %d coordinators and %d datanodes",
+		 *shmemNumCoords, *shmemNumDataNodes);
+
+	LWLockAcquire(NodeTableLock, LW_SHARED);
+
+	if (num_coords)
+		*num_coords = *shmemNumCoords;
+	if (num_dns)
+		*num_dns = *shmemNumDataNodes;
+
+	if (coOids)
+	{
+		int i;
+		for (i = 0; i < *shmemNumCoords; i++)
+		{
+			coOids[i] = coDefs[i].nodeoid;
+			if (coHealthMap)
+				coHealthMap[i] = coDefs[i].nodeishealthy;
+		}
+	}
+
+	if (dnOids)
+	{
+		int i;
+
+		for (i = 0; i < *shmemNumDataNodes; i++)
+		{
+			dnOids[i] = dnDefs[i].nodeoid;
+			if (dnHealthMap)
+				dnHealthMap[i] = dnDefs[i].nodeishealthy;
+		}
+	}
+
+	LWLockRelease(NodeTableLock);
+}
+
+/*
+ * Consult the shared memory NodeDefinition structures and
+ * fetch the nodeishealthy value and return it back
+ *
+ * We will probably need a similar function for coordinators
+ * in the future..
+ */
+void
+PgxcNodeDnListHealth(List *nodeList, bool *healthmap)
+{
+	ListCell *lc;
+	int index = 0;
+
+	elog(DEBUG1, "Get healthmap from datanodeList");
+
+	if (!nodeList || !list_length(nodeList))
+		ereport(ERROR,
+				(errcode(ERRCODE_UNDEFINED_OBJECT),
+				 errmsg("NIL or empty nodeList passed")));
+
+	LWLockAcquire(NodeTableLock, LW_SHARED);
+	foreach(lc, nodeList)
+	{
+		int node = lfirst_int(lc);
+
+		if (node >= *shmemNumDataNodes)
+		{
+			LWLockRelease(NodeTableLock);
+			ereport(ERROR,
+					(errcode(ERRCODE_UNDEFINED_OBJECT),
+					 errmsg("PGXC health status not found for datanode with oid (%d)",
+							node)));
+		}
+		healthmap[index++] = dnDefs[node].nodeishealthy;
+	}
+	LWLockRelease(NodeTableLock);
+}
 
 /*
  * Find node definition in the shared memory node table.
@@ -484,6 +630,51 @@ PgxcNodeGetDefinition(Oid node)
 	return NULL;
 }
 
+/*
+ * Update health status of a node in the shared memory node table.
+ *
+ * We could try to optimize this by checking if the ishealthy value
+ * is already the same as the passed in one.. but if the cluster is
+ * impaired, dunno how much such optimizations are worth. So keeping
+ * it simple for now
+ */
+bool
+PgxcNodeUpdateHealth(Oid node, bool status)
+{
+	int i;
+
+	LWLockAcquire(NodeTableLock, LW_EXCLUSIVE);
+
+	/* search through the Datanodes first */
+	for (i = 0; i < *shmemNumDataNodes; i++)
+	{
+		if (dnDefs[i].nodeoid == node)
+		{
+			dnDefs[i].nodeishealthy = status;
+
+			LWLockRelease(NodeTableLock);
+
+			return true;
+		}
+	}
+
+	/* if not found, search through the Coordinators */
+	for (i = 0; i < *shmemNumCoords; i++)
+	{
+		if (coDefs[i].nodeoid == node)
+		{
+			coDefs[i].nodeishealthy = status;
+
+			LWLockRelease(NodeTableLock);
+
+			return true;
+		}
+	}
+
+	/* not found, return false */
+	LWLockRelease(NodeTableLock);
+	return false;
+}
 
 /*
  * PgxcNodeCreate
diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c
index 73ac6bb4af..05efd4e675 100644
--- a/src/backend/pgxc/pool/pgxcnode.c
+++ b/src/backend/pgxc/pool/pgxcnode.c
@@ -293,6 +293,19 @@ PGXCNodeConnect(char *connstr)
 	return (NODE_CONNECTION *) conn;
 }
 
+int PGXCNodePing(const char *connstr)
+{
+	if (connstr[0])
+	{
+		PGPing status = PQping(connstr);
+		if (status == PQPING_OK)
+			return 0;
+		else
+			return 1;
+	}
+	else
+		return -1;
+}
 
 /*
  * Close specified connection
@@ -531,6 +544,22 @@ retry:
 		conn->state = DN_CONNECTION_STATE_ERROR_FATAL;
 		add_error_message(conn, "unexpected EOF on datanode connection.");
 		elog(WARNING, "unexpected EOF on datanode oid connection: %d", conn->nodeoid);
+
+		/*
+		 * before returning, also update the shared health
+		 * status field to indicate that this node could be
+		 * possibly unavailable
+		 */
+		if (!PgxcNodeUpdateHealth(conn->nodeoid, false))
+			elog(WARNING, "Could not update health status of node %u",
+				 conn->nodeoid);
+		else
+			elog(WARNING, "Health map updated to reflect DOWN node (%u)",
+				 conn->nodeoid);
+
+		/* But ping once to see if the node is still available */
+		PoolPingNodes();
+
 		/* Should we read from the other connections before returning? */
 		return ERROR_OCCURED;
 	}
@@ -1599,6 +1628,20 @@ pgxc_node_flush(PGXCNodeHandle *handle)
 		if (send_some(handle, handle->outEnd) < 0)
 		{
 			add_error_message(handle, "failed to send data to datanode");
+
+			/*
+			 * before returning, also update the shared health
+			 * status field to indicate that this node is down
+			 */
+			if (!PgxcNodeUpdateHealth(handle->nodeoid, false))
+				elog(WARNING, "Could not update health status of node %u",
+					 handle->nodeoid);
+			else
+				elog(WARNING, "Health map updated to reflect DOWN node (%u)",
+					 handle->nodeoid);
+
+			/* But ping once to see if the node is still available */
+			PoolPingNodes();
 			return EOF;
 		}
 	}
diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c
index c61f421142..30dd985b2a 100644
--- a/src/backend/pgxc/pool/poolmgr.c
+++ b/src/backend/pgxc/pool/poolmgr.c
@@ -158,6 +158,8 @@ static void PoolManagerConnect(const char *database, const char *user_name,
 static void pooler_sighup(SIGNAL_ARGS);
 static bool shrink_pool(DatabasePool *pool);
 static void pools_maintenance(void);
+static void TryPingUnhealthyNode(Oid nodeoid);
+
 /*
  * Flags set by interrupt handlers for later service in the main loop.
 */
@@ -744,6 +746,97 @@ agent_destroy(PoolAgent *agent)
 	}
 }
 
+/*
+ * Ping an UNHEALTHY node and if it succeeds, update SHARED node
+ * information
+ */
+static void
+TryPingUnhealthyNode(Oid nodeoid)
+{
+	int status;
+	NodeDefinition *nodeDef;
+	char connstr[1024];
+
+	nodeDef = PgxcNodeGetDefinition(nodeoid);
+	if (nodeDef == NULL)
+	{
+		/* No such definition, node dropped? */
+		elog(DEBUG1, "Could not find node (%u) definition,"
+			 " skipping health check", nodeoid);
+		return;
+	}
+	if (nodeDef->nodeishealthy)
+	{
+		/* hmm, can this happen? */
+		elog(DEBUG1, "node (%u) healthy!"
+			 " skipping health check", nodeoid);
+		return;
+	}
+
+	elog(LOG, "node (%s:%u) down! Trying ping",
+		 NameStr(nodeDef->nodename), nodeoid);
+	sprintf(connstr,
+			"host=%s port=%d", NameStr(nodeDef->nodehost),
+			nodeDef->nodeport);
+	status = PGXCNodePing(connstr);
+	if (status != 0)
+	{
+		pfree(nodeDef);
+		return;
+	}
+
+	elog(DEBUG1, "Node (%s) back online!", NameStr(nodeDef->nodename));
+	if (!PgxcNodeUpdateHealth(nodeoid, true))
+		elog(WARNING, "Could not update health status of node (%s)",
+			 NameStr(nodeDef->nodename));
+	else
+		elog(LOG, "Health map updated to reflect HEALTHY node (%s)",
+			 NameStr(nodeDef->nodename));
+	pfree(nodeDef);
+
+	return;
+}
+
+/*
+ * Ping UNHEALTHY nodes as part of the maintenance window
+ */
+void
+PoolPingNodes()
+{
+	Oid coOids[MaxCoords];
+	Oid dnOids[MaxDataNodes];
+	bool coHealthMap[MaxCoords];
+	bool dnHealthMap[MaxDataNodes];
+	int numCo;
+	int numDn;
+	int i;
+
+	PgxcNodeGetHealthMap(coOids, dnOids, &numCo, &numDn,
+						 coHealthMap, dnHealthMap);
+
+	/*
+	 * Find unhealthy datanodes and try to re-ping them
+	 */
+	for (i = 0; i < numDn; i++)
+	{
+		if (!dnHealthMap[i])
+		{
+			Oid nodeoid = dnOids[i];
+			TryPingUnhealthyNode(nodeoid);
+		}
+	}
+	/*
+	 * Find unhealthy coordinators and try to re-ping them
+	 */
+	for (i = 0; i < numCo; i++)
+	{
+		if (!coHealthMap[i])
+		{
+			Oid nodeoid = coOids[i];
+			TryPingUnhealthyNode(nodeoid);
+		}
+	}
+}
 
 /*
  * Release handle to pool manager
@@ -1205,6 +1298,19 @@ agent_handle_input(PoolAgent * agent, StringInfo s)
 				/* First update all the pools */
 				reload_database_pools(agent);
 				break;
+			case 'P':			/* Ping connection info */
+				/*
+				 * Ping unhealthy nodes in the pools. If any of the
+				 * nodes come up, update SHARED memory to
+				 * indicate the same.
+				 */
+				pool_getmessage(&agent->port, s, 4);
+				pq_getmsgend(s);
+
+				/* Ping all the pools */
+				PoolPingNodes();
+
+				break;
 			case 'q':			/* Check connection info consistency */
 				pool_getmessage(&agent->port, s, 4);
 				pq_getmsgend(s);
@@ -1866,8 +1972,21 @@ acquire_connection(DatabasePool *dbPool, Oid node)
 	}
 
 	if (slot == NULL)
+	{
 		elog(WARNING, "can not connect to node %u", node);
 
+		/*
+		 * before returning, also update the shared health
+		 * status field to indicate that this node is down
+		 */
+		if (!PgxcNodeUpdateHealth(node, false))
+			elog(WARNING, "Could not update health status of node %u", node);
+		else
+			elog(WARNING, "Health map updated to reflect DOWN node (%u)", node);
+	}
+	else
+		PgxcNodeUpdateHealth(node, true);
+
 	return slot;
 }
 
@@ -2067,7 +2186,6 @@ PoolerLoop(void)
 	time_t		last_maintenance = (time_t) 0;
 	int			maintenance_timeout;
 	struct pollfd *pool_fd;
-	int 		i;
 
 #ifdef HAVE_UNIX_SOCKETS
 	if (Unix_socket_directories)
@@ -2244,6 +2362,7 @@ PoolerLoop(void)
 			{
 				/* maintenance timeout */
 				pools_maintenance();
+				PoolPingNodes();
 				last_maintenance = time(NULL);
 			}
 		}
diff --git a/src/include/pgxc/locator.h b/src/include/pgxc/locator.h
index c0fa7fc0c4..a5501ce0d4 100644
--- a/src/include/pgxc/locator.h
+++ b/src/include/pgxc/locator.h
@@ -169,6 +169,7 @@ extern ExecNodes *GetRelationNodes(RelationLocInfo *rel_loc_info,
 								   bool isValueNull,
 								   RelationAccessType accessType);
 extern ExecNodes *GetRelationNodesByQuals(Oid reloid,
+										  RelationLocInfo *rel_loc_info,
 										  Index varno, Node *quals,
 										  RelationAccessType relaccess);
diff --git a/src/include/pgxc/nodemgr.h b/src/include/pgxc/nodemgr.h
index 33c20d2c85..976384a4e8 100644
--- a/src/include/pgxc/nodemgr.h
+++ b/src/include/pgxc/nodemgr.h
@@ -38,8 +38,15 @@ typedef struct
 	int 		nodeport;
 	bool		nodeisprimary;
 	bool		nodeispreferred;
+	bool		nodeishealthy;
 } NodeDefinition;
 
+typedef struct
+{
+	Oid			nodeoid;
+	bool		nodeishealthy;
+} NodeHealthStatus;
+
 extern void NodeTablesShmemInit(void);
 extern Size NodeTablesShmemSize(void);
 
@@ -47,9 +54,14 @@ extern void PgxcNodeListAndCount(void);
 extern void PgxcNodeGetOids(Oid **coOids, Oid **dnOids,
 							int *num_coords, int *num_dns,
 							bool update_preferred);
+extern void PgxcNodeGetHealthMap(Oid *coOids, Oid *dnOids,
+								 int *num_coords, int *num_dns,
+								 bool *coHealthMap, bool *dnHealthMap);
 extern NodeDefinition *PgxcNodeGetDefinition(Oid node);
 extern void PgxcNodeAlter(AlterNodeStmt *stmt);
 extern void PgxcNodeCreate(CreateNodeStmt *stmt);
 extern void PgxcNodeRemove(DropNodeStmt *stmt);
+extern void PgxcNodeDnListHealth(List *nodeList, bool *dnhealth);
+extern bool PgxcNodeUpdateHealth(Oid node, bool status);
 
 #endif	/* NODEMGR_H */
diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h
index 78639f949c..789d7cecbc 100644
--- a/src/include/pgxc/pgxcnode.h
+++ b/src/include/pgxc/pgxcnode.h
@@ -116,6 +116,7 @@ extern void PGXCNodeClose(NODE_CONNECTION * conn);
 extern int PGXCNodeConnected(NODE_CONNECTION * conn);
 extern int PGXCNodeConnClean(NODE_CONNECTION * conn);
 extern void PGXCNodeCleanAndRelease(int code, Datum arg);
+extern int PGXCNodePing(const char *connstr);
 
 extern PGXCNodeHandle *get_any_handle(List *datanodelist);
 /* Look at information cached in node handles */
diff --git a/src/include/pgxc/poolmgr.h b/src/include/pgxc/poolmgr.h
index 58371573cd..5f13490503 100644
--- a/src/include/pgxc/poolmgr.h
+++ b/src/include/pgxc/poolmgr.h
@@ -139,4 +139,6 @@ extern void PoolManagerCancelQuery(int dn_count, int* dn_list, int co_count, int
 /* Lock/unlock pool manager */
 extern void PoolManagerLock(bool is_lock);
 
+/* Do pool health check activity */
+extern void PoolPingNodes(void);
 #endif
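
For completeness, the probe underneath PGXCNodePing() is libpq's PQping(),
which checks whether a server is reachable without authenticating or starting
a backend session. A minimal standalone check, with a placeholder host and
port, might look like this (link against libpq, e.g. cc ping.c -lpq):

	#include <stdio.h>
	#include <libpq-fe.h>

	int
	main(void)
	{
		/* hypothetical datanode address; substitute a real host/port */
		const char *connstr = "host=datanode1 port=15432";

		switch (PQping(connstr))
		{
			case PQPING_OK:
				printf("node is up and accepting connections\n");
				return 0;
			case PQPING_REJECT:
				printf("node is up but rejecting connections\n");
				return 1;
			case PQPING_NO_RESPONSE:
				printf("node did not respond; would be marked UNHEALTHY\n");
				return 1;
			default:	/* PQPING_NO_ATTEMPT: bad connection string */
				printf("no attempt made; check the connection string\n");
				return 2;
		}
	}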