author | Tomas Vondra | 2017-10-22 13:00:06 +0000
committer | Tomas Vondra | 2017-11-04 16:19:06 +0000
commit | d9f45c9018ec3ec1fc11e4be2be7f9728a1799b1 (patch)
tree | 0fff4f84acb8765159714ad0b404f4927fa8a9a4
parent | cca8700e364c1031eb360de3ec16eba45152e01c (diff)
Comments and cleanup in the connection pool manager
Similarly to a39b06b0c6, this does minor cleanup in the pool manager
code by removing unused functions and adding a lot of comments, both
at the file level (explaining the concepts and basic API methods)
and for individual functions.
-rw-r--r-- | src/backend/pgxc/pool/pgxcnode.c | 675
-rw-r--r-- | src/backend/pgxc/pool/poolmgr.c | 1289
-rw-r--r-- | src/include/pgxc/pgxcnode.h | 6
-rw-r--r-- | src/include/pgxc/poolmgr.h | 118
4 files changed, 1460 insertions, 628 deletions
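The new file-level comment in pgxcnode.c (first diff below) enumerates the backend-side handle API. As a rough illustration of how those pieces fit together, here is a minimal sketch of the acquire/use/release cycle a session goes through. This is not part of the commit: the PGXCNodeAllHandles field names (dn_conn_count, datanode_handles) are assumptions based on the usual Postgres-XL layout, and error/response handling is elided.

```c
/*
 * Illustrative sketch only. Assumes the usual Postgres-XL headers
 * (pgxc/pgxcnode.h) and that PGXCNodeAllHandles exposes dn_conn_count
 * and datanode_handles (field names not shown in this diff).
 */
static void
query_datanodes_sketch(List *datanodelist, const char *query)
{
	PGXCNodeAllHandles *handles;
	int			i;

	/* acquire pooled connections for the listed datanodes */
	handles = get_handles(datanodelist, NIL, false, true);

	/* send the query down each acquired handle (simple protocol, 'Q') */
	for (i = 0; i < handles->dn_conn_count; i++)
		(void) pgxc_node_send_query(handles->datanode_handles[i], query);

	/* ... pgxc_node_receive() and response handling would go here ... */

	/* return the connections to the pool and free the wrapper structure */
	release_handles();
	pfree_pgxc_all_handles(handles);
}
```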
diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 66b993f53b..a664cc22da 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -1,9 +1,103 @@ /*------------------------------------------------------------------------- * * pgxcnode.c + * Functions for communication with nodes through pooled connections. * - * Functions for the Coordinator communicating with the PGXC nodes: - * Datanodes and Coordinators + * This is mostly a backend-side counterpart to the pool manager. Each + * session acquires connections to remote nodes, and uses them to execute + * queries. + * + * Currently, we only allow a single connection to each remote node. If + * a query includes multiple nodes that communicate with a given remote + * node (e.g. Append with multiple RemoteSubquery children), then the + * connection may need to be buffered (see BufferConnection). + * + * Following is an overview of the basic methods for node management and + * communication over the handles. + * + * + * node handle management + * ---------------------- + * get_any_handle - acquire handle for replicated table + * get_handles - acquire handles to all specified nodes + * get_current_handles - return already acquired handles + * release_handles - release all connections (back to the pool) + * + * + * connection functions (TODO move to poolmgr.c) + * -------------------- + * PGXCNodeConnect - open libpq connection using connection string + * PGXCNodePing - ping node using connection string + * PGXCNodeClose - close libpq connection + * PGXCNodeConnected - verify connection status + * PGXCNodeConnStr - build connection string + * + * + * node identification + * ------------------- + * PGXCNodeGetNodeOid - OID for node by index in handle array + * PGXCNodeGetNodeIdFromName - determine index in handle array by name + * PGXCNodeGetNodeId - determine index in handle array from OID + * + * + * session/transaction parameters + * ------------------------------ + * PGXCNodeSetParam - add new parameter + * PGXCNodeResetParams - reset (local or session) parameters + * PGXCNodeGetTransactionParamStr - generate SET with transaction params + * PGXCNodeGetSessionParamStr - generate SET with session params + * + * + * low-level TCP buffer access + * --------------------------- + * pgxc_node_receive - receive data into input buffers for connections + * pgxc_node_read_data - read data for one particular connection + * get_message - read one complete message from a handle + * send_some - send a chunk of data to remote node + * + * + * send higher-level messages to remote node + * ----------------------------------------- + * pgxc_node_send_parse - sends PARSE (part of extended protocol) + * pgxc_node_send_bind - sends BIND (part of extended protocol) + * pgxc_node_send_describe - sends DESCRIBE (part of extended protocol) + * pgxc_node_send_execute - sends EXECUTE (part of extended protocol) + * pgxc_node_send_flush - sends FLUSH (part of extended protocol) + * pgxc_node_send_close - sends close (C) + * pgxc_node_send_sync - sends sync (S) + * pgxc_node_send_query - simple query protocol (Q) + * pgxc_node_send_rollback - simple query on failed connection (Q) + * pgxc_node_send_query_extended - extended query protocol (PARSE, ...)
+ * + * + * XL-specific messages to remote nodes + * ------------------------------------ + * pgxc_node_send_plan - sends plan to remote node (p) + * pgxc_node_send_gxid - sends GXID to remote node (g) + * pgxc_node_send_cmd_id - sends CommandId to remote node (M) + * pgxc_node_send_snapshot - sends snapshot to remote node (s) + * pgxc_node_send_timestamp - sends timestamp to remote node (t) + * + * + * misc functions + * -------------- + * pgxc_node_set_query - send SET by simple protocol, wait for "ready" + * pgxc_node_flush - flush all data from the output buffer + * + * + * XXX We should add the custom messages (gxid, snapshot, ...) to the SGML + * documentation describing message formats. + * + * XXX What about using a simple list, instead of the arrays? Or define a new + * structure grouping all the important parameters (buffer, size, maxsize). + * + * XXX The comments claim that dn_handles and co_handles are allocated in + * Transaction context, but in fact those are allocated in TopMemoryContext. + * Otherwise we wouldn't be able to use persistent connections, which keep + * connections open for the whole session. + * + * XXX The comment at pgxc_node_free mentions TopTransactionContext, so + * perhaps we should consider using that? * * * Portions Copyright (c) 2012-2014, TransLattice, Inc. @@ -11,8 +105,7 @@ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group * * IDENTIFICATION - * $$ - * + * src/backend/pgxc/pool/pgxcnode.c * *------------------------------------------------------------------------- */ @@ -31,24 +124,28 @@ #include <string.h> #include <unistd.h> #include <errno.h> + #include "access/gtm.h" #include "access/transam.h" #include "access/xact.h" #include "access/htup_details.h" #include "catalog/pg_type.h" +#include "catalog/pg_collation.h" +#include "catalog/pgxc_node.h" #include "commands/prepare.h" #include "gtm/gtm_c.h" +#include "miscadmin.h" #include "nodes/nodes.h" -#include "pgxc/pgxcnode.h" #include "pgxc/execRemote.h" -#include "catalog/pgxc_node.h" -#include "catalog/pg_collation.h" #include "pgxc/locator.h" #include "pgxc/nodemgr.h" +#include "pgxc/pause.h" #include "pgxc/pgxc.h" +#include "pgxc/pgxcnode.h" #include "pgxc/poolmgr.h" -#include "tcop/dest.h" +#include "storage/ipc.h" #include "storage/lwlock.h" +#include "tcop/dest.h" #include "utils/builtins.h" #include "utils/elog.h" #include "utils/memutils.h" @@ -57,14 +154,9 @@ #include "utils/syscache.h" #include "utils/lsyscache.h" #include "utils/formatting.h" +#include "utils/snapmgr.h" #include "utils/tqual.h" #include "../interfaces/libpq/libpq-fe.h" -#ifdef XCP -#include "miscadmin.h" -#include "storage/ipc.h" -#include "pgxc/pause.h" -#include "utils/snapmgr.h" -#endif #define CMD_ID_MSG_LEN 8 @@ -73,35 +165,29 @@ static int datanode_count = 0; static int coord_count = 0; /* - * Datanode handles saved in Transaction memory context - * when PostgresMain is launched. - * Those handles are used inside a transaction by Coordinator to Datanodes. - */ -static PGXCNodeHandle *dn_handles = NULL; - -/* - * Coordinator handles saved in Transaction memory context - * when PostgresMain is launched. - * Those handles are used inside a transaction by Coordinator to Coordinators + * Datanode and coordinator handles (sockets obtained from the pooler), + * initialized in TopMemoryContext. Those connections + * are used during query execution to communicate with the nodes.
+ * + * XXX At this point we have only a single connection to each node, and + * multiplex it for multiple cursors (see BufferConnection). */ -static PGXCNodeHandle *co_handles = NULL; +static PGXCNodeHandle *dn_handles = NULL; /* datanodes */ +static PGXCNodeHandle *co_handles = NULL; /* coordinators */ -/* Current size of dn_handles and co_handles */ +/* Current number of datanode and coordinator handles. */ int NumDataNodes; int NumCoords; - -#ifdef XCP volatile bool HandlesInvalidatePending = false; volatile bool HandlesRefreshPending = false; /* - * Session and transaction parameters need to to be set on newly connected - * remote nodes. + * Session/transaction parameters that need to be set on new connections. */ static List *session_param_list = NIL; static List *local_param_list = NIL; -static StringInfo session_params; +static StringInfo session_params; static StringInfo local_params; typedef struct @@ -114,14 +200,9 @@ typedef struct static bool DoInvalidateRemoteHandles(void); static bool DoRefreshRemoteHandles(void); -#endif -#ifdef XCP static void pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid); -#else -static void pgxc_node_init(PGXCNodeHandle *handle, int sock); -#endif static void pgxc_node_free(PGXCNodeHandle *handle); static void pgxc_node_all_free(void); @@ -130,7 +211,7 @@ static int get_char(PGXCNodeHandle * conn, char *out); /* - * Initialize PGXCNodeHandle struct + * Initialize empty PGXCNodeHandle struct */ static void init_pgxc_handle(PGXCNodeHandle *pgxc_handle) @@ -165,17 +246,21 @@ init_pgxc_handle(PGXCNodeHandle *pgxc_handle) /* - * Allocate and initialize memory to store Datanode and Coordinator handles. + * InitMultinodeExecutor + * Initialize datanode and coordinator handles. + * + * Acquires the list of nodes from the node manager, and initializes a + * handle for each one. + * + * Also determines PGXCNodeId, the index of this node in the proper array + * of handles (co_handles or dn_handles), depending on the type of this node. */ void InitMultinodeExecutor(bool is_force) { int count; Oid *coOids, *dnOids; -#ifdef XCP MemoryContext oldcontext; -#endif - /* Free all the existing information first */ if (is_force) @@ -192,13 +277,11 @@ InitMultinodeExecutor(bool is_force) /* Get classified list of node Oids */ PgxcNodeGetOids(&coOids, &dnOids, &NumCoords, &NumDataNodes, true); -#ifdef XCP /* * Coordinator and datanode handles should be available during all the * session lifetime */ oldcontext = MemoryContextSwitchTo(TopMemoryContext); -#endif /* Do proper initialization of handles */ if (NumDataNodes > 0) @@ -244,6 +327,15 @@ InitMultinodeExecutor(bool is_force) MemoryContextSwitchTo(oldcontext); + /* + * Determine the index of a handle representing this node, either in the + * coordinator or datanode handles, depending on the type of this + * node. The index gets stored in PGXCNodeId. + * + * XXX It's a bit confusing that this may point either to co_handles + * or dn_handles, and may easily lead to bugs when used with the + * incorrect array. + */ if (IS_PGXC_COORDINATOR) { for (count = 0; count < NumCoords; count++) @@ -265,7 +357,13 @@ InitMultinodeExecutor(bool is_force) } /* - * Builds up a connection string + * PGXCNodeConnStr + * Builds a connection string for the provided connection parameters. + * + * Aside from the usual connection parameters (host, port, ...) we also + * pass information about the parent node type and the remote node type. + * + * XXX Shouldn't this rather throw an ERROR instead of returning NULL?
*/ char * PGXCNodeConnStr(char *host, int port, char *dbname, @@ -278,6 +376,8 @@ PGXCNodeConnStr(char *host, int port, char *dbname, /* * Build up connection string * remote type can be Coordinator, Datanode or application. + * + * XXX What's the application remote type? */ num = snprintf(connstr, sizeof(connstr), "host=%s port=%d dbname=%s user=%s application_name='pgxc:%s' sslmode=disable options='-c remotetype=%s -c parentnode=%s %s'", @@ -299,7 +399,8 @@ PGXCNodeConnStr(char *host, int port, char *dbname, /* - * Connect to a Datanode using a connection string + * PGXCNodeConnect + * Connect to a Datanode using a constructed connection string. */ NODE_CONNECTION * PGXCNodeConnect(char *connstr) @@ -311,7 +412,12 @@ PGXCNodeConnect(char *connstr) return (NODE_CONNECTION *) conn; } -int PGXCNodePing(const char *connstr) +/* + * PGXCNodePing + * Check that a node (identified by the connstring) responds correctly. + */ +int +PGXCNodePing(const char *connstr) { if (connstr[0]) { @@ -326,22 +432,23 @@ int PGXCNodePing(const char *connstr) } /* - * Close specified connection + * PGXCNodeClose + * Close the connection. */ void PGXCNodeClose(NODE_CONNECTION *conn) { - /* Delegate call to the pglib */ + /* Delegate call to libpq */ PQfinish((PGconn *) conn); } /* - * Checks if connection active + * PGXCNodeConnected + * Check if the provided connection is open and valid. */ int PGXCNodeConnected(NODE_CONNECTION *conn) { - /* Delegate call to the pglib */ PGconn *pgconn = (PGconn *) conn; /* @@ -352,12 +459,13 @@ PGXCNodeConnected(NODE_CONNECTION *conn) } - -/* Close the socket handle (this process' copy) and free occupied memory +/* + * pgxc_node_free + * Close the socket handle (local copy) and free occupied memory. * - * Note that we do not free the handle and its members. This will be - * taken care of when the transaction ends, when TopTransactionContext - * is destroyed in xact.c. + * Note that this only closes the socket, but we do not free the handle + * and its members. This will be taken care of when the transaction ends, + * when TopTransactionContext is destroyed in xact.c. */ static void pgxc_node_free(PGXCNodeHandle *handle) @@ -368,7 +476,8 @@ pgxc_node_free(PGXCNodeHandle *handle) } /* - * Free all the node handles cached + * pgxc_node_all_free + * Free all the node handles cached in TopMemoryContext. */ static void pgxc_node_all_free(void) @@ -410,9 +519,11 @@ pgxc_node_all_free(void) } /* - * Create and initialise internal structure to communicate to - * Datanode via supplied socket descriptor. - * Structure stores state info and I/O buffers + * pgxc_node_init + * Initialize the handle to communicate with the node through the socket. + * + * Stores the PID of the remote backend, and if requested, sends the global + * session string to the remote node. */ static void pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid) @@ -435,9 +546,10 @@ pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid) handle->inEnd = 0; handle->inCursor = 0; handle->needSync = false; + /* * We got a new connection, set on the remote node the session parameters * if defined. The transaction parameter should be sent after BEGIN.
*/ if (global_session) { @@ -451,8 +563,9 @@ /* - * Wait while at least one of specified connections has data available and read - * the data into the buffer + * pgxc_node_receive + * Wait while at least one of the connections has data available, and + * read the data into the buffer. */ bool pgxc_node_receive(const int conn_count, @@ -605,28 +718,10 @@ retry: return NO_ERROR_OCCURED; } -/* - * Is there any data enqueued in the TCP input buffer waiting - * to be read sent by the PGXC node connection - */ - -int -pgxc_node_is_data_enqueued(PGXCNodeHandle *conn) -{ - int ret; - int enqueued; - - if (conn->sock < 0) - return 0; - ret = ioctl(conn->sock, FIONREAD, &enqueued); - if (ret != 0) - return 0; - - return enqueued; -} /* - * Read up incoming messages from the PGXC node connection + * pgxc_node_read_data + * Read incoming data from the node TCP connection. */ int pgxc_node_read_data(PGXCNodeHandle *conn, bool close_if_error) @@ -769,7 +864,10 @@ retry: /* - * Get one character from the connection buffer and advance cursor + * Get one character from the connection buffer and advance cursor. + * + * Returns 0 if enough data is available in the buffer (and the value is + * returned in the 'out' parameter). Otherwise the function returns EOF. */ static int get_char(PGXCNodeHandle * conn, char *out) @@ -783,7 +881,12 @@ get_char(PGXCNodeHandle * conn, char *out) } /* - * Read an integer from the connection buffer and advance cursor + * Try reading an integer from the connection buffer and advance cursor. + * + * Returns 0 if enough data is available in the buffer (and the value is + * returned in the 'out' parameter). Otherwise the function returns EOF. + * + * XXX We only ever call this once with len=4, so simplify the function. */ static int get_int(PGXCNodeHandle *conn, size_t len, int *out) @@ -791,6 +894,10 @@ get_int(PGXCNodeHandle *conn, size_t len, int *out) unsigned short tmp2; unsigned int tmp4; + /* + * XXX This seems somewhat inconsistent with get_char(). Perhaps this + * should use >= to behave in the same way? + */ if (conn->inCursor + len > conn->inEnd) return EOF; @@ -817,49 +924,70 @@ /* * get_message - * If connection has enough data read entire message from the connection buffer - * and returns message type. Message data and data length are returned as - * var parameters. - * If buffer does not have enough data leaves cursor unchanged, changes - * connection status to DN_CONNECTION_STATE_QUERY indicating it needs to - * receive more and returns \0 + * Attempt to read the whole message from the input buffer, if possible. + * + * If the entire message is in the input buffer of the connection, reads it + * into a buffer (len and msg parameters) and returns the message type. + * + * If the input buffer does not contain the whole message, the cursor is + * left unchanged, the connection status is set to DN_CONNECTION_STATE_QUERY + * indicating it needs to receive more data, and \0 is returned (instead of + * an actual message type). + * * conn - connection to read from * len - returned length of the data where msg is pointing to - * msg - returns pointer to memory in the incoming buffer. The buffer probably - * will be overwritten upon next receive, so if caller wants to refer it later - * it should make a copy.
+ * msg - returns pointer to position in the incoming buffer + * + * The buffer probably will be overwritten upon next receive, so if caller + * wants to refer to it later it should make a copy. */ char get_message(PGXCNodeHandle *conn, int *len, char **msg) { char msgtype; + /* + * Try reading the first char (message type) and integer (message length). + * + * Both functions return 0 (false) in case of success, and EOF (true) in + * case of failure. So we call get_char() first, and only if it succeeds + * the get_int() gets called. + */ if (get_char(conn, &msgtype) || get_int(conn, 4, len)) { - /* Successful get_char would move cursor, restore position */ + /* Successful get_char/get_int would move cursor, restore position. */ conn->inCursor = conn->inStart; return '\0'; } + /* The message length includes the length header too, so subtract it. */ *len -= 4; + /* + * If the whole message is not in the buffer, we need to read more data. + * + * Reading function will discard already consumed data in the buffer till + * conn->inCursor. To avoid extra read/handle cycles we need to fit the whole + * message (and not just a part of it) into the buffer. So let's ensure + * the buffer is large enough. + * + * We need 1 byte for the message type, 4 bytes for message length and + * the message itself (the length is currently in *len). The buffer may + * already be large enough, in which case ensure_in_buffer_capacity() + * will return immediately. + */ if (conn->inCursor + *len > conn->inEnd) { - /* - * Not enough data in the buffer, we should read more. - * Reading function will discard already consumed data in the buffer - * till conn->inBegin. Then we want the message that is partly in the - * buffer now has been read completely, to avoid extra read/handle - * cycles. The space needed is 1 byte for message type, 4 bytes for - * message length and message itself which size is currently in *len. - * The buffer may already be large enough, in this case the function - * ensure_in_buffer_capacity() will immediately return + /* ensure space for the whole message (including 5B header) + * + * FIXME Add a check of the return value. Non-zero value means failure. */ ensure_in_buffer_capacity(5 + (size_t) *len, conn); conn->inCursor = conn->inStart; return '\0'; } + /* Great, the whole message is in the buffer. */ *msg = conn->inBuffer + conn->inCursor; conn->inCursor += *len; conn->inStart = conn->inCursor; @@ -868,8 +996,8 @@ get_message(PGXCNodeHandle *conn, int *len, char **msg) /* - * Release all Datanode and Coordinator connections - * back to pool and release occupied memory + * release_handles + * Release all node connections back to pool and free the memory. */ void release_handles(void) @@ -887,6 +1015,7 @@ release_handles(void) if (cluster_ex_lock_held) return; + /* quick exit if we have no connections to release */ if (datanode_count == 0 && coord_count == 0) return; @@ -917,9 +1046,14 @@ release_handles(void) } } + /* + * XXX Not sure why coordinator connections are only released when on + * a coordinator. Perhaps we never acquire connections to coordinators on + * datanodes? Seems like a rather minor optimization anyway. + */ if (IS_PGXC_COORDINATOR) { - /* Collect Coordinator handles */ + /* Free Coordinator handles */ for (i = 0; i < NumCoords; i++) { PGXCNodeHandle *handle = &co_handles[i]; @@ -943,7 +1077,10 @@ release_handles(void) } } - /* And finally release all the connections on pooler */ + /* + * And finally release all the connections held by this backend back + * to the connection pool.
+ */ PoolManagerReleaseConnections(destroy); datanode_count = 0; @@ -951,15 +1088,20 @@ } /* - * Ensure that the supplied buffer has enough capacity and if not, it's - * extended to an appropriate size. + * ensure_buffer_capacity + * Ensure that the supplied buffer has at least the required capacity. + * + * currbuf - the currently allocated buffer + * currsize - size of the current buffer (in bytes) + * bytes_needed - required capacity (in bytes) + * + * We return the new buffer if allocated successfully, and set newsize_p + * to the size of the repalloc-ed buffer. * - * currbuf is the currently used buffer of currsize. bytes_needed is the - * minimum size required. We shall return the new buffer, if allocated - * successfully and set newsize_p to contain the size of the repalloced buffer. * If allocation fails, NULL is returned. * - * The function checks for requests beyond MaxAllocSize and throw an error. + * The function checks for requests beyond MaxAllocSize and throws an error + * if the request exceeds the limit. */ static char * ensure_buffer_capacity(char *currbuf, size_t currsize, size_t bytes_needed, size_t *newsize_p) @@ -967,6 +1109,7 @@ ensure_buffer_capacity(char *currbuf, size_t currsize, size_t bytes_needed, size char *newbuf; Size newsize = (Size) currsize; + /* XXX Perhaps use AllocSizeIsValid instead? */ if (((Size) bytes_needed) >= MaxAllocSize) ereport(ERROR, (ENOSPC, errdetail("Cannot enlarge buffer containing %ld bytes by %ld more bytes.", currsize, bytes_needed))); + /* if the buffer is already large enough, we're done */ if (bytes_needed <= newsize) { *newsize_p = currsize; @@ -1028,8 +1172,10 @@ ensure_buffer_capacity(char *currbuf, size_t currsize, size_t bytes_needed, size } /* - * Ensure specified amount of data can fit to the incoming buffer and - * increase it if necessary + * ensure_in_buffer_capacity + * Ensure specified amount of data can fit into the input buffer of a handle. + * + * Returns 0 in case of success, EOF otherwise. */ int ensure_in_buffer_capacity(size_t bytes_needed, PGXCNodeHandle *handle) @@ -1047,8 +1193,10 @@ ensure_in_buffer_capacity(size_t bytes_needed, PGXCNodeHandle *handle) } /* - * Ensure specified amount of data can fit to the outgoing buffer and - * increase it if necessary + * ensure_out_buffer_capacity + * Ensure specified amount of data can fit into the output buffer of a handle. + * + * Returns 0 in case of success, EOF otherwise. */ int ensure_out_buffer_capacity(size_t bytes_needed, PGXCNodeHandle *handle) @@ -1067,7 +1215,8 @@ /* - * Send specified amount of data from the outgoing buffer over the connection + * send_some + * Send specified amount of data from the output buffer over the handle. */ int send_some(PGXCNodeHandle *handle, int len) @@ -1195,11 +1344,12 @@ send_some(PGXCNodeHandle *handle, int len) } /* - * Send PARSE message with specified statement down to the Datanode + * pgxc_node_send_parse + * Send PARSE message with specified statement down to the datanode. */ int pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement, - const char *query, short num_params, Oid *param_types) + const char *query, short num_params, Oid *param_types) { /* statement name size (allow NULL) */ int stmtLen = statement ?
strlen(statement) + 1 : 1; @@ -1283,7 +1433,8 @@ pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement, } /* - * Send PLAN message down to the Data node + * pgxc_node_send_plan + * Send PLAN message down to the datanode. */ int pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement, @@ -1364,7 +1515,8 @@ pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement, } /* - * Send BIND message down to the Datanode + * pgxc_node_send_bind + * Send BIND message down to the datanode. */ int pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal, @@ -1446,7 +1598,8 @@ pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal, /* - * Send DESCRIBE message (portal or statement) down to the Datanode + * pgxc_node_send_describe + * Send DESCRIBE message (portal or statement) down to the datanode. */ int pgxc_node_send_describe(PGXCNodeHandle * handle, bool is_statement, @@ -1494,7 +1647,8 @@ pgxc_node_send_describe(PGXCNodeHandle * handle, bool is_statement, /* - * Send CLOSE message (portal or statement) down to the Datanode + * pgxc_node_send_close + * Send CLOSE message (portal or statement) down to the datanode. */ int pgxc_node_send_close(PGXCNodeHandle * handle, bool is_statement, @@ -1534,7 +1688,8 @@ pgxc_node_send_close(PGXCNodeHandle * handle, bool is_statement, } /* - * Send EXECUTE message down to the Datanode + * pgxc_node_send_execute + * Send EXECUTE message down to the datanode. */ int pgxc_node_send_execute(PGXCNodeHandle * handle, const char *portal, int fetch) @@ -1579,7 +1734,8 @@ pgxc_node_send_execute(PGXCNodeHandle * handle, const char *portal, int fetch) /* - * Send FLUSH message down to the Datanode + * pgxc_node_send_flush + * Send FLUSH message down to the datanode. */ int pgxc_node_send_flush(PGXCNodeHandle * handle) @@ -1606,7 +1762,8 @@ /* - * Send SYNC message down to the Datanode + * pgxc_node_send_sync + * Send SYNC message down to the datanode. */ int pgxc_node_send_sync(PGXCNodeHandle * handle) @@ -1635,7 +1792,8 @@ /* - * Send series of Extended Query protocol messages to the data node + * pgxc_node_send_query_extended + * Send a series of Extended Query protocol messages to the datanode. */ int pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *query, @@ -1664,8 +1822,11 @@ /* - * This method won't return until connection buffer is empty or error occurs - * To ensure all data are on the wire before waiting for response + * pgxc_node_flush + * Flush all data from the output buffer of a node handle. + * + * This method won't return until the connection buffer is empty or an error + * occurs, to ensure all data are on the wire before waiting for a response.
*/ int pgxc_node_flush(PGXCNodeHandle *handle) @@ -1693,39 +1854,10 @@ return 0; } -/* - * This method won't return until network buffer is empty or error occurs - * To ensure all data in network buffers is read and wasted - */ -void -pgxc_node_flush_read(PGXCNodeHandle *handle) -{ - bool is_ready; - int read_result; - - if (handle == NULL) - return; - - /* - * Before reading input send Sync to make sure - * we will eventually receive ReadyForQuery - */ - pgxc_node_send_sync(handle); - while(true) - { - read_result = pgxc_node_read_data(handle, false); - if (read_result < 0) - break; - - is_ready = is_data_node_ready(handle); - if (is_ready == true) - break; - - } -} /* - * Send specified statement down to the PGXC node + * pgxc_node_send_query_internal + * Send the statement down to the PGXC node. */ static int pgxc_node_send_query_internal(PGXCNodeHandle * handle, const char *query, @@ -1768,21 +1900,32 @@ pgxc_node_send_query_internal(PGXCNodeHandle * handle, const char *query, return pgxc_node_flush(handle); } +/* + * pgxc_node_send_rollback + * Send the rollback command to the remote node. + * + * XXX The only effect of the "rollback" is that we try sending the query + * even on invalid/failed connections (when everything else is prohibited). + */ int pgxc_node_send_rollback(PGXCNodeHandle *handle, const char *query) { return pgxc_node_send_query_internal(handle, query, true); } +/* + * pgxc_node_send_query + * Send the query to the remote node. + */ int pgxc_node_send_query(PGXCNodeHandle *handle, const char *query) { return pgxc_node_send_query_internal(handle, query, false); } - /* - * Send the GXID down to the PGXC node + * pgxc_node_send_gxid + * Send the GXID (global transaction ID) down to the remote node. */ int pgxc_node_send_gxid(PGXCNodeHandle *handle, GlobalTransactionId gxid) @@ -1812,7 +1955,8 @@ pgxc_node_send_gxid(PGXCNodeHandle *handle, GlobalTransactionId gxid) } /* - * Send the Command ID down to the PGXC node + * pgxc_node_send_cmd_id + * Send the Command ID down to the remote node. */ int pgxc_node_send_cmd_id(PGXCNodeHandle *handle, CommandId cid) @@ -1847,7 +1991,8 @@ pgxc_node_send_cmd_id(PGXCNodeHandle *handle, CommandId cid) } /* - * Send the snapshot down to the PGXC node + * pgxc_node_send_snapshot + * Send the snapshot down to the remote node. */ int pgxc_node_send_snapshot(PGXCNodeHandle *handle, Snapshot snapshot) @@ -1901,7 +2046,8 @@ pgxc_node_send_snapshot(PGXCNodeHandle *handle, Snapshot snapshot) } /* - * Send the timestamp down to the PGXC node + * pgxc_node_send_timestamp + * Send the timestamp down to the remote node. */ int pgxc_node_send_timestamp(PGXCNodeHandle *handle, TimestampTz timestamp) @@ -1947,8 +2093,9 @@ /* - * Add another message to the list of errors to be returned back to the client - * at the convenient time + * add_error_message + * Add a message to the list of errors to be returned to the client + * at a convenient time. */ void add_error_message(PGXCNodeHandle *handle, const char *message) @@ -1964,11 +2111,17 @@ handle->error = pstrdup(message); } +/* index of the last node returned by get_any_handle (round-robin) */ static int load_balancer = 0; + /* - * Get one of the specified nodes to query replicated data source. - * If session already owns one or more of the requested connection, - * the function returns existing one to avoid contacting pooler.
+ * get_any_handle + * Get one of the specified nodes to query a replicated data source. + * + * If the session already owns one or more of the requested datanode + * connections, the function returns one of those existing ones to avoid + * unnecessary pooler requests. + * * Performs basic load balancing. */ PGXCNodeHandle * @@ -1998,6 +2151,7 @@ get_any_handle(List *datanodelist) /* At the moment node is an index in the array, and we may need to wrap it */ if (node >= NumDataNodes) node -= NumDataNodes; + /* See if handle is already used */ if (dn_handles[node].sock != NO_SOCKET) { @@ -2079,13 +2233,16 @@ } /* - * for specified list return array of PGXCNodeHandles - * acquire from pool if needed. - * the lenth of returned array is the same as of nodelist - * For Datanodes, Special case is empty or NIL nodeList, in this case return all the nodes. - * The returned list should be pfree'd when no longer needed. - * For Coordinator, do not get a connection if Coordinator list is NIL, - * Coordinator fds is returned only if transaction uses a DDL + * get_handles + * Return array of node handles (PGXCNodeHandles) for requested nodes. + * + * If the session does not hold the requested handles yet, they are + * acquired from the pool. + * + * For datanodes, the specified list may be set to NIL, in which case we + * return handles for all datanodes. + * + * For coordinators, we do not acquire any handles when NIL list is used. + * Coordinator handles are needed only for transactions performing DDL. */ PGXCNodeAllHandles * get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool is_global_session) @@ -2360,6 +2517,10 @@ return result; } +/* + * get_current_handles + * Return currently acquired handles. + */ PGXCNodeAllHandles * get_current_handles(void) { @@ -2414,7 +2575,10 @@ get_current_handles(void) return result; } -/* Free PGXCNodeAllHandles structure */ +/* + * pfree_pgxc_all_handles + * Free memory allocated for the PGXCNodeAllHandles structure. + */ void pfree_pgxc_all_handles(PGXCNodeAllHandles *pgxc_handles) { @@ -2433,11 +2597,14 @@ pfree_pgxc_all_handles(PGXCNodeAllHandles *pgxc_handles) /* * PGXCNodeGetNodeId - * Look at the data cached for handles and return node position - * If node type is PGXC_NODE_COORDINATOR look only in coordinator list, - * if node type is PGXC_NODE_DATANODE look only in datanode list, - * if other (assume PGXC_NODE_NODE) search both, in last case return actual - * node type. + * Look up the index of the requested node (by OID) in the cached handles. + * + * Optionally, the node type may be restricted using the second parameter. + * If the type is PGXC_NODE_COORDINATOR, we only look in coordinator list. + * If the node is PGXC_NODE_DATANODE, we only look in datanode list. + * + * For other values (assume PGXC_NODE_NONE) we search for both node types, + * and then also return the actual node type in the second parameter. */ int PGXCNodeGetNodeId(Oid nodeoid, char *node_type) @@ -2478,7 +2645,9 @@ /* * PGXCNodeGetNodeOid - * Look at the data cached for handles and return node Oid + * Look at the data cached for handles and return node Oid. + * + * XXX Unlike PGXCNodeGetNodeId, this requires the node type parameter.
*/ Oid PGXCNodeGetNodeOid(int nodeid, char node_type) @@ -2504,8 +2673,7 @@ /* * pgxc_node_str - * - * get the name of the node + * get the name of the current node */ Datum pgxc_node_str(PG_FUNCTION_ARGS) @@ -2515,7 +2683,7 @@ /* * PGXCNodeGetNodeIdFromName - * Return node position in handles array + * Return position of the node (specified by name) in the handles array. */ int PGXCNodeGetNodeIdFromName(char *node_name, char *node_type) @@ -2544,42 +2712,48 @@ return PGXCNodeGetNodeId(nodeoid, node_type); } +/* + * paramlist_delete_param + * Delete parameter with the specified name from the parameter list. + */ static List * paramlist_delete_param(List *param_list, const char *name) { - ListCell *cur_item; - ListCell *prev_item; - - prev_item = NULL; - cur_item = list_head(param_list); - - while (cur_item != NULL) - { - ParamEntry *entry = (ParamEntry *) lfirst(cur_item); - - if (strcmp(NameStr(entry->name), name) == 0) - { - /* cur_item must be removed */ - param_list = list_delete_cell(param_list, cur_item, prev_item); - pfree(entry); - if (prev_item) - cur_item = lnext(prev_item); - else - cur_item = list_head(param_list); - } - else - { - prev_item = cur_item; - cur_item = lnext(prev_item); - } - } - - return param_list; + ListCell *cur_item; + ListCell *prev_item; + + prev_item = NULL; + cur_item = list_head(param_list); + + while (cur_item != NULL) + { + ParamEntry *entry = (ParamEntry *) lfirst(cur_item); + + if (strcmp(NameStr(entry->name), name) == 0) + { + /* cur_item must be removed */ + param_list = list_delete_cell(param_list, cur_item, prev_item); + pfree(entry); + if (prev_item) + cur_item = lnext(prev_item); + else + cur_item = list_head(param_list); + } + else + { + prev_item = cur_item; + cur_item = lnext(prev_item); + } + } + + return param_list; } /* - * Remember new value of a session or transaction parameter, and set same - * values on newly connected remote nodes. + * PGXCNodeSetParam + * Remember the new value of a session/transaction parameter. + * + * We'll set this parameter value for new connections to remote nodes. */ void PGXCNodeSetParam(bool local, const char *name, const char *value, int flags) @@ -2617,8 +2791,9 @@ /* * Special case for - * RESET SESSION AUTHORIZATION - * SET SESSION AUTHORIZATION TO DEFAULT + * + * RESET SESSION AUTHORIZATION + * SET SESSION AUTHORIZATION TO DEFAULT * * We must also forget any SET ROLE commands since RESET SESSION * AUTHORIZATION also resets current role to session default @@ -2636,8 +2811,8 @@ /* - * Forget all parameter values set either for transaction or both transaction - * and session. + * PGXCNodeResetParams + * Forget all transaction parameters (or session parameters too). */ void PGXCNodeResetParams(bool only_local) @@ -2662,6 +2837,10 @@ PGXCNodeResetParams(bool only_local) local_params = NULL; } +/* + * get_set_command + * Construct a command setting all parameters from a given list. + */ static void get_set_command(List *param_list, StringInfo command, bool local) { @@ -2687,22 +2866,29 @@ /* - * Returns SET commands needed to initialize remote session. - * The command may already be biult and valid, return it right away if the case. - * Otherwise build it up.
- * To support Distributed Session machinery coordinator should generate and - * send a distributed session identifier to remote nodes. Generate it here. + * PGXCNodeGetSessionParamStr + * Returns SET commands needed to initialize remote session. + * + * The SET command may already be built and valid (in the session_params), + * in which case we simply return it. Otherwise we build it from the session + * parameter list. + * + * To support "Distributed Session" machinery, the coordinator should + * generate and send a distributed session identifier to remote nodes. + * Generate it here (simply as nodename_PID). + * + * We always define a parameter with the PID of the parent process (which + * is this backend). */ char * PGXCNodeGetSessionParamStr(void) { /* - * If no session parameters are set and that is a coordinator we need to set - * global_session anyway, even if there were no other parameters. - * We do not want this string to disappear, so create it in the - * TopMemoryContext. However if we add first session parameter we will need - * to free the buffer and recreate it in the same context as the hash table - * to avoid memory leakage. + * If no session parameters are set and this is a coordinator node, we + * need to set global_session anyway, even if there are no other params. + * + * We do not want this string to simply disappear, so create it in the + * TopMemoryContext. */ if (session_params == NULL) { @@ -2711,7 +2897,7 @@ PGXCNodeGetSessionParamStr(void) MemoryContextSwitchTo(oldcontext); } - /* If the paramstr invalid build it up */ + /* If the parameter string is empty, build it up. */ if (session_params->len == 0) { if (IS_PGXC_COORDINATOR) @@ -2726,9 +2912,11 @@ /* - * Returns SET commands needed to initialize transaction on a remote session. - * The command may already be biult and valid, return it right away if the case. - * Otherwise build it up. + * PGXCNodeGetTransactionParamStr + * Returns SET commands needed to initialize transaction on a remote node. + * + * The command may already be built and valid (in local_params StringInfo), in + * which case we return it right away. Otherwise build it up. */ char * PGXCNodeGetTransactionParamStr(void) @@ -2738,7 +2926,7 @@ return NULL; /* - * If the paramstr invalid build it up. + * If the StringInfo is not allocated yet, do it in TopTransactionContext. */ if (local_params == NULL) { @@ -2746,25 +2934,30 @@ local_params = makeStringInfo(); MemoryContextSwitchTo(oldcontext); } + /* - * If parameter string exists it is valid, it is truncated when parameters - * are modified. + * If the parameter string is empty, it was reset in PGXCNodeSetParam. So + * recompute it, using the current local_param_list (we know it's not + * empty, otherwise we wouldn't get here through the first condition). */ if (local_params->len == 0) { get_set_command(local_param_list, local_params, true); } + return local_params->len == 0 ? NULL : local_params->data; } /* - * Send down specified query, read and discard all responses until ReadyForQuery + * pgxc_node_set_query + * Send down specified query, discard all responses until ReadyForQuery. */ void pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query) { pgxc_node_send_query(handle, set_query); + /* * Now read responses until ReadyForQuery. * XXX We may need to handle possible errors here.
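Before moving on to the pool manager side, here is a short sketch of how the session-parameter machinery above is meant to be used, based on the signatures visible in this diff. It is illustrative only; the flags value of 0 and the call sequence are assumptions, not something this commit prescribes.

```c
/*
 * Illustrative sketch (not part of the commit): replaying session-level
 * parameters on a freshly pooled connection. The flags argument to
 * PGXCNodeSetParam is passed as 0 here purely for illustration.
 */
static void
replay_session_params_sketch(PGXCNodeHandle *handle)
{
	char	   *params;

	/* remember a session-level parameter; it will apply to new connections */
	PGXCNodeSetParam(false, "DateStyle", "ISO", 0);

	/* build (or reuse) the cached SET command for all session parameters */
	params = PGXCNodeGetSessionParamStr();

	/* replay it on the new connection and wait for ReadyForQuery */
	if (params != NULL)
		pgxc_node_set_query(handle, params);
}
```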
diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 140907d872..3722e9e04d 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -2,34 +2,217 @@ * * poolmgr.c * - * Connection pool manager handles connections to Datanodes + * Connection pool manager handles connections to other nodes. * - * The pooler runs as a separate process and is forked off from a - * Coordinator postmaster. If the Coordinator needs a connection from a - * Datanode, it asks for one from the pooler, which maintains separate - * pools for each Datanode. A group of connections can be requested in - * a single request, and the pooler returns a list of file descriptors - * to use for the connections. * - * Note the current implementation does not yet shrink the pool over time - * as connections are idle. Also, it does not queue requests; if a - * connection is unavailable, it will simply fail. This should be implemented - * one day, although there is a chance for deadlocks. For now, limiting - * connections should be done between the application and Coordinator. - * Still, this is useful to avoid having to re-establish connections to the - * Datanodes all the time for multiple Coordinator backend sessions. + * During query execution, nodes in the cluster often need to communicate + * with other nodes. This applies both to coordinators (which generally + * delegate the query execution to the datanodes) and datanodes (that + * may need to exchange data with other datanodes, e.g. to redistribute + * one side of a join). * - * The term "agent" here refers to a session manager, one for each backend - * Coordinator connection to the pooler. It will contain a list of connections - * allocated to a session, at most one per Datanode. + * Opening a new connection every time would be very inefficient (and + * would quickly become a major bottleneck in OLTP workloads with short + * queries/transactions), so XL pools and reuses the connections. * + * The pool manager runs as a separate auxiliary process and is forked + * from the postmaster in AuxiliaryProcessMain(), similarly to other + * auxiliary processes (checkpointer, bgwriter, ...). + * + * When a backend needs a connection to another node, it does not open + * it on its own, but instead asks the pool manager. The pool manager + * maintains lists of connections for other nodes, so in most cases it + * can quickly provide an existing connection. + * + * Backends often need multiple connections at the same time (unless the + * query gets pushed to just a single node), so to reduce the overhead + * it's also possible to request multiple connections at once. In that + * case the pool manager handles all of them in one step, and returns file + * descriptors for all the nodes at once. + * + * + * Note: The connection requests are not queued; if a connection is + * unavailable (and can't be opened right away), the request will simply + * fail. This should be implemented one day, although there is a chance + * for deadlocks. For now, limiting connections should be done between + * the application and the coordinator. Still, this is useful to avoid + * having to re-establish connections to the datanodes all the time for + * multiple coordinator backend sessions. + * + * XXX Well, we try to do pools_maintenance(), which closes all old idle + * connections. But we try to do that only once, to prevent infinite + * loops.
+ * + * The term "pool agent" here refers to a session manager, one for each + * backend accessing the pooler. It manages a list of connections + * allocated to a session, at most one per datanode. + * + * + * entities of the pooler + * ====================== + * + * This section is an overview of the basic entities in the connection pool + * implementation. With the exception of PoolManager, all the entities + * are represented by a struct. + * + * + * PoolManager + * ----------- + * + * - The auxiliary process started by postmaster, managing all requests + * from sessions (from backend processes). + * + * - Requests arrive through PoolHandle (from sessions) and responses + * (back to sessions) are sent through PoolAgent. + * + * PoolHandle + * ---------- + * + * - Connection to PoolManager from sessions, i.e. when the session + * needs something from the pool manager (e.g. new connection), it + * sends a request through the handle (which pretty much + * represents a unix socket). + * + * - Created and initialized in the backend process. + * + * PoolAgent + * --------- + * + * - Represents a session in the connection pool manager process, and + * associates it with a database pool. + * + * - Tracks open connections to other nodes in the cluster, so that + * we can release or close them automatically if needed. + * + * DatabasePool + * ------------ + * + * - A connection pool for a particular database/user combination, or + * rather a collection of per-node connection pools, one for each + * node in the cluster. + * + * PGXCNodePool + * ------------ + * + * - A pool of connections for a particular node in the cluster, part + * of a DatabasePool (i.e. for a database/user combination). + * + * PGXCNodePoolSlot + * ---------------- + * + * - A pooled connection, tracked in PGXCNodePool. + * + * + * interaction with the pooler + * =========================== + * + * When a session needs to open connections to other nodes, this is very + * roughly what happens: + * + * 1) PoolManagerConnect (backend session) + * + * Initializes connection to the pool manager process (through the + * unix socket), so that the session can send messages to the pool. + * The connection is represented by "pool handle". + * + * Note: This is not called directly, but automatically from the + * functions that require a connection to the connection pool. + * + * 2) agent_create/agent_init (pool manager) + * + * Accepts the connection from the session, and opens a socket used + * to respond to the session (e.g. with pooled connections). + * + * Initializes the PoolAgent responsible for managing the pooled + * connections assigned to this session, and associates it with + * a database pool (dbname/user combination). + * + * 3) PoolManagerGetConnections (backend session) + * + * Sends a request to the pool manager (through the pool handle). + * The pool manager handles this in handle_get_connections(), and + * sends back a list of file descriptors (pooled connections). + * + * 4) PoolManagerReleaseConnections (backend session) + * + * Sends a request to the pool manager, notifying it that the + * connections can be returned to the shared connection pool (or + * have to be closed, in case of error). + * + * The pool manager handles this in agent_release_connections(). + * + * 5) PoolManagerDisconnect (backend session) + * + * Sends a 'disconnect' message to the pool manager, and resets + * the pool handle to NULL (if the session needs more connections, + * it'll reconnect and start from scratch).
+ * + * The pool manager handles the message by calling agent_destroy(), + * which releases all remaining connections associated with the + * agent, and then releases all the memory. + * + * + * public connection pool API + * ========================== + * + * The previous section briefly discussed the simplest interaction with + * the pool manager. This section provides a more complete overview of + * the pooler API, with some additional functions. + * + * These functions are meant to be used from the backends, and mostly + * "only" send requests to the pool manager (through the socket). The + * pool manager then processes those requests and does all the work. + * + * The primary use case (pooling) is handled by two functions: + * + * - PoolManagerGetConnections acquire connections from the pool + * - PoolManagerReleaseConnections release pooled connections back + * + * To cancel a query or abort a transaction in a distributed database, + * we need to forward the cancel/abort requests to all participating + * connections (tracked by PoolAgent). This is done by: + * + * - PoolManagerCancelQuery forward "query cancel" + * - PoolManagerAbortTransactions forward "abort transaction" + * + * The API also includes a number of 'maintenance' functions, which are + * useful e.g. when changing configuration of the cluster. + * + * - PoolManagerCleanConnection close all unused connections + * - PoolManagerCheckConnectionInfo check connection consistency + * - PoolManagerRefreshConnectionInfo close mismatching connections + * - PoolManagerReloadConnectionInfo close all connections + * + * There are a number of additional helper functions, but those are mostly + * internal and marked as static. + * + * + * XXX Why do we even need a separate connection pool manager? Can't we + * simply track the connections in shared memory, somehow? That should + * be fairly simple, and it would remove the need for a separate process + * managing requests from all backends, no? + * + * XXX Apparently there's no "max_db_connections" option that would + * limit the number of connections per node (similarly to what pgbouncer + * does for each DB pool, by grouping all per-user connections). + * + * XXX Make POOL_CHECK_SUCCESS and POOL_CHECK_FAILED an enum. + * + * XXX Some of the functions expect two separate lists of nodes, one for + * datanodes and one for coordinators. Not sure why that is necessary, + * and it makes the code more complicated. + * + * XXX The message types are hard-coded in the various methods as magic + * constants (e.g. PoolManagerAbortTransactions uses 'a'). Perhaps + * define them somewhere in a clear manner, e.g. as a #define. * * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 2010-2012 Postgres-XC Development Group * + * * IDENTIFICATION - * $$ + * src/backend/pgxc/pool/poolmgr.c * *------------------------------------------------------------------------- */ @@ -75,8 +258,7 @@ int PoolConnKeepAlive = 600; int PoolMaintenanceTimeout = 30; int MaxPoolSize = 100; int PoolerPort = 6667; - -bool PersistentConnections = false; +bool PersistentConnections = false; /* Flag to tell if we are Postgres-XC pooler process */ static bool am_pgxc_pooler = false; @@ -89,35 +271,66 @@ typedef struct { int port; } PGXCNodeConnectionInfo; -/* Handle to the pool manager (Session's side) */ +/* Handle to the pool manager (from each session) */ typedef struct { /* communication channel */ PoolPort port; } PoolHandle; -/* The root memory context */ +/* The pooler root memory context */ static MemoryContext PoolerMemoryContext = NULL; -/* - * Allocations of core objects: Datanode connections, upper level structures, - * connection strings, etc. - */ + +/* Core objects: connections, connection strings, etc. */ static MemoryContext PoolerCoreContext = NULL; -/* - * Memory to store Agents - */ + +/* Pool Agents */ static MemoryContext PoolerAgentContext = NULL; -/* Pool to all the databases (linked list) */ +/* + * A list of connection pools (one for each db/user combination). + * + * XXX The DatabasePools are organized in a simple linked list. That may + * be an issue with many databases/users, so perhaps we should consider + * organizing this in a hash table or something. But for now a linked + * list is good enough. + */ static DatabasePool *databasePools = NULL; -/* PoolAgents and the poll array*/ +/* + * An array of allocated PoolAgents (one for each session). + * + * There's a 1:1 mapping between sessions and agents, so the number of + * agents is limited by MaxConnections. Also, we can access the agents + * directly using MyBackendId, so there's not much point in building a + * more complicated structure here (like a hash table for example). + * + * XXX That however does not happen, because agent_create() simply adds + * the agents at the end of the poolAgents array. So PoolerLoop and + * agent_destroy have to loop through the agents, etc. Seems expensive. + * + * XXX We do know that there will never be more than MaxConnections + * agents, so we can simply pre-allocate all of them in PoolManagerInit, + * and then only flag them as 'used/unused' instead of palloc/pfree. + */ static int agentCount = 0; static PoolAgent **poolAgents; +/* + * A connection to the pool manager (essentially a PQ connection). + */ static PoolHandle *poolHandle = NULL; +/* + * PoolManager "lock" flag. The manager runs as a separate process, so + * we can use this very simple approach to locking. + */ static int is_pool_locked = false; + +/* + * File descriptor representing the pool manager UNIX socket. Sessions + * communicate with the pool manager through this file descriptor.
+ */ static int server_fd = -1; static int node_info_check(PoolAgent *agent); @@ -141,10 +354,18 @@ static PGXCNodePoolSlot *acquire_connection(DatabasePool *dbPool, Oid node); static void agent_release_connections(PoolAgent *agent, bool force_destroy); static void release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, Oid node, bool force_destroy); + static void destroy_slot(PGXCNodePoolSlot *slot); -static PGXCNodePool *grow_pool(DatabasePool *dbPool, Oid node); static void destroy_node_pool(PGXCNodePool *node_pool); + +static PGXCNodePool *grow_pool(DatabasePool *dbPool, Oid node); +static bool shrink_pool(DatabasePool *pool); +static void pools_maintenance(void); + static void PoolerLoop(void); +static void PoolManagerConnect(const char *database, const char *user_name, + const char *pgoptions); + static int clean_connection(List *node_discard, const char *database, const char *user_name); @@ -153,14 +374,12 @@ static int *abort_pids(int *count, const char *database, const char *user_name); static char *build_node_conn_str(Oid node, DatabasePool *dbPool); + /* Signal handlers */ static void pooler_die(SIGNAL_ARGS); static void pooler_quickdie(SIGNAL_ARGS); -static void PoolManagerConnect(const char *database, const char *user_name, - const char *pgoptions); static void pooler_sighup(SIGNAL_ARGS); -static bool shrink_pool(DatabasePool *pool); -static void pools_maintenance(void); + static void TryPingUnhealthyNode(Oid nodeoid); /* @@ -182,7 +401,7 @@ IsPGXCPoolerProcess(void) } /* - * Initialize internal structures + * Initialize internal PoolManager structures. */ int PoolManagerInit() @@ -208,7 +427,8 @@ PoolManagerInit() ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); - ForgetLockFiles(); + /* XXX Not sure what this is ... */ + ForgetLockFiles(); /* * Properly accept or ignore signals the postmaster might send us @@ -230,6 +450,7 @@ PoolManagerInit() /* Allocate pooler structures in the Pooler context */ MemoryContextSwitchTo(PoolerMemoryContext); + /* Allocate pool agents, one for each connection (session). */ poolAgents = (PoolAgent **) palloc(MaxConnections * sizeof(PoolAgent *)); if (poolAgents == NULL) { @@ -244,7 +465,11 @@ PoolManagerInit() /* - * Check connection info consistency with system catalogs + * node_info_check + * Check that connection info is consistent with system catalogs. + * + * Returns POOL_CHECK_SUCCESS when all the information (number of nodes, + * node OIDs and connection strings) matches. POOL_CHECK_FAILED otherwise. */ static int node_info_check(PoolAgent *agent) @@ -258,8 +483,9 @@ node_info_check(PoolAgent *agent) int numDn; /* - * First check if agent's node information matches to current content of the - * shared memory table. + * First check if agent's node information (number of node OIDs and + * the OID values) matches the current contents of the shared memory + * table (with authoritative node information). */ PgxcNodeGetOids(&coOids, &dnOids, &numCo, &numDn, false); @@ -274,8 +500,14 @@ node_info_check(PoolAgent *agent) pfree(dnOids); /* - * Iterate over all dbnode pools and check if connection strings - * are matching node definitions. + * Iterate over all database pools and check if connection strings + * (in all node pools) match node definitions from the node catalog. + * + * XXX Does this behave correctly with multiple database pools? We + * remember which nodes were already checked in a 'checked' list, + * so that we check each node just once.
But doesn't that mean we + * only really check the first DatabasePool and fail to check the + * following ones? */ while (res == POOL_CHECK_SUCCESS && dbPool) { @@ -287,22 +519,30 @@ { char *connstr_chk; - /* No need to check same Datanode twice */ + /* No need to check same node twice */ if (list_member_oid(checked, nodePool->nodeoid)) continue; + checked = lappend_oid(checked, nodePool->nodeoid); connstr_chk = build_node_conn_str(nodePool->nodeoid, dbPool); if (connstr_chk == NULL) { /* Problem of constructing connection string */ + ereport(INFO, + (errmsg("failed to construct connection string for node %d", + nodePool->nodeoid))); hash_seq_term(&hseq_status); res = POOL_CHECK_FAILED; break; } + /* return error if there is difference */ if (strcmp(connstr_chk, nodePool->connstr)) { + ereport(INFO, + (errmsg("mismatching connection string for node %d ('%s' != '%s')", + nodePool->nodeoid, nodePool->connstr, connstr_chk))); pfree(connstr_chk); hash_seq_term(&hseq_status); res = POOL_CHECK_FAILED; @@ -313,29 +553,21 @@ } dbPool = dbPool->next; } + list_free(checked); return res; } /* - * Destroy internal structures - */ -int -PoolManagerDestroy(void) -{ - int status = 0; - - if (PoolerMemoryContext) - { - MemoryContextDelete(PoolerMemoryContext); - PoolerMemoryContext = NULL; - } - - return status; -} - -/* - * Connect to the pooler process + * GetPoolManagerHandle + * Connect to pool manager (through a UNIX socket). + * + * We know the pooler always runs on the same system (as it's just an + * auxiliary process forked from postmaster), so we only support UNIX + * sockets. + * + * XXX Perhaps this should fail at compile time when HAVE_UNIX_SOCKETS + * is not defined? */ static void GetPoolManagerHandle(void) { PoolHandle *handle; int fdsock = -1; + /* do nothing if a session is already connected to pool manager */ if (poolHandle) - /* already connected */ return; #ifdef HAVE_UNIX_SOCKETS @@ -409,10 +641,11 @@ (errmsg("failed to connect to pool manager: %m"))); /* - * Allocate handle + * Allocate the handle + * + * XXX We may change malloc to palloc here, but first ensure that + * the CurrentMemoryContext is set properly. * - * XXX we may change malloc here to palloc but first ensure - * the CurrentMemoryContext is properly set. * The handle allocated just before new session is forked off and * inherited by the session process. It should remain valid for all * the session lifetime. */ @@ -432,7 +665,12 @@ } /* - * Create agent + * agent_create + * Create a PoolAgent for a new session. + * + * PoolAgent represents the session within the pool manager process. So when + * the session wants to communicate with the pool manager, it sends the + * data through PoolHandle, and pool manager responds through PoolAgent. */ static void agent_create(void) @@ -493,21 +731,22 @@ /* * session_options - * Returns the pgoptions string generated using a particular - * list of parameters that are required to be propagated to Datanodes. - * These parameters then become default values for the pooler sessions. + * Generates a pgoptions string to propagate to the other nodes. + * + * These parameters then become default values for the pooled sessions. * For e.g., a psql user sets PGDATESTYLE. This value should be set * as the default connection parameter in the pooler session that is - * connected to the Datanodes. 
There are various parameters which need to - * be analysed individually to determine whether these should be set on - * Datanodes. + * connected to the other nodes. * - * Note: These parameters values are the default values of the particular - * Coordinator backend session, and not the new values set by SET command. + * There are various parameters which need to be analysed individually + * to determine whether these should be tracked and propagated. * + * Note: These parameter values are the default values of each backend + * session, and not the new values set by SET command. We simply get + * the default value using GetConfigOptionResetString(). */ - -char *session_options(void) +char * +session_options(void) { int i; char *pgoptions[] = {"DateStyle", "timezone", "geqo", "intervalstyle", "lc_monetary"}; @@ -547,8 +786,20 @@ /* - * Associate session with specified database and respective connection pool - * Invoked from Session process + * PoolManagerConnect + * Connect session to a pool manager. + * + * Used from a backend to open a connection to the pool manager. The + * backends do not call this directly, though - it's called automatically + * from functions that need to communicate with the pool manager. + * + * Opens a communication channel by acquiring a "pool manager handle" + * (which opens a two-way connection through a UNIX socket), and then + * sends enough information (particularly dbname and username) to look up + * the right connection pool. + * + * This only sends the message to the pool manager, but does not wait + * for a response. */ static void PoolManagerConnect(const char *database, const char *user_name, @@ -561,7 +812,7 @@ int pgoptionslen = strlen(pgoptions); char atchar = ' '; - /* Connect to the pooler process if not yet connected */ + /* Make sure we're connected to the pool manager process. */ GetPoolManagerHandle(); if (poolHandle == NULL) ereport(ERROR, @@ -573,9 +824,10 @@ /* * Special handling for db_user_namespace=on + * * We need to handle per-db users and global users. The per-db users will * arrive with @dbname and global users just as username. Handle both of - * them appropriately + * them appropriately. */ if (strcmp(GetConfigOption("db_user_namespace", false, false), "on") == 0) { @@ -623,6 +875,7 @@ } else pool_putbytes(&poolHandle->port, user_name, unamelen); + pool_putbytes(&poolHandle->port, "\0", 1); /* Length of pgoptions string */ @@ -636,54 +889,11 @@ } /* - * Reconnect to pool manager - * It simply does a disconnection and a reconnection. - */ -void -PoolManagerReconnect(void) -{ - elog(DEBUG1, "Reconnecting to PoolManager"); - - /* Connected, disconnect */ - if (poolHandle) - PoolManagerDisconnect(); - - PoolManagerConnect(get_database_name(MyDatabaseId), GetClusterUserName(), - session_options()); -} - -/* - * Lock/unlock pool manager - * During locking, the only operations not permitted are abort, connection and - * connection obtention. 
- */ -void -PoolManagerLock(bool is_lock) -{ - char msgtype = 'o'; - int n32; - int msglen = 8; - if (poolHandle == NULL) - PoolManagerConnect(get_database_name(MyDatabaseId), - GetClusterUserName(), ""); - - elog(DEBUG1, "Locking PoolManager"); - - /* Message type */ - pool_putbytes(&poolHandle->port, &msgtype, 1); - - /* Message length */ - n32 = htonl(msglen); - pool_putbytes(&poolHandle->port, (char *) &n32, 4); - - /* Lock information */ - n32 = htonl((int) is_lock); - pool_putbytes(&poolHandle->port, (char *) &n32, 4); - pool_flush(&poolHandle->port); -} - -/* - * Init PoolAgent + * agent_init + * Initialize a PoolAgent instance (allocate memory, etc.). + * + * Allocates memory for coordinator and datanode connections (in the + * per-agent memory context), and links it to the correct database pool. */ static void agent_init(PoolAgent *agent, const char *database, const char *user_name, @@ -695,6 +905,9 @@ Assert(database); Assert(user_name); + elog(DEBUG1, "Initializing PoolAgent (user_name %s, database %s, " + "pgoptions %s)", user_name, database, pgoptions); + /* disconnect if we are still connected */ if (agent->pool) agent_release_connections(agent, false); @@ -709,20 +922,34 @@ palloc0(agent->num_coord_connections * sizeof(PGXCNodePoolSlot *)); agent->dn_connections = (PGXCNodePoolSlot **) palloc0(agent->num_dn_connections * sizeof(PGXCNodePoolSlot *)); - /* find database */ + + /* find the right database pool */ agent->pool = find_database_pool(database, user_name, pgoptions); /* create if not found */ if (agent->pool == NULL) agent->pool = create_database_pool(database, user_name, pgoptions); + Assert(agent->pool); + MemoryContextSwitchTo(oldcontext); return; } /* - * Destroy PoolAgent + * agent_destroy + * Close remaining connections, release agent's memory. + * + * Under normal conditions, all connections managed by the agent should + * have been closed by this point. If there are some connections still + * associated with the agent, something must have gone wrong (error), + * in which case we have no idea in what state the connections are and + * we have no reliable / cheap way to find out. So just close them. + * + * XXX This is one of the places where we have to loop through the array + * of agents to find the "current" one. Seems expensive, especially when + * there are many short-lived sessions (as typical in OLTP). */ static void agent_destroy(PoolAgent *agent) @@ -733,17 +960,17 @@ close(Socket(agent->port)); - /* Discard connections if any remaining */ + /* + * Release all connections the session might be still holding. + * + * If the session is disconnecting while still holding some open + * connections, we have no idea if those connections are clean + * or not. So force destroying them. + */ if (agent->pool) - { - /* - * If session is disconnecting while there are active connections - * we can not know if they clean or not, so force destroy them - */ agent_release_connections(agent, true); - } - /* find agent in the list */ + /* Remove the agent from the poolAgents array. */ for (i = 0; i < agentCount; i++) { if (poolAgents[i] == agent) @@ -762,8 +989,13 @@ } /* - * Ping an UNHEALTHY node and if it succeeds, update SHARED node - * information + * TryPingUnhealthyNode + * Try pinging a node marked as unhealthy, and update shared info. 
+ * + * Try pinging a node previously marked as UNHEALTHY, and if it succeeds + * then update the SHARED node information (marking it as healthy). + * + * XXX Perhaps this should track the timestamp of the last attempted ping? */ static void TryPingUnhealthyNode(Oid nodeoid) @@ -773,6 +1005,7 @@ char connstr[MAXPGPATH * 2 + 256]; nodeDef = PgxcNodeGetDefinition(nodeoid); + if (nodeDef == NULL) { /* No such definition, node dropped? */ elog(DEBUG1, "Could not find node (%u) definition," " skipping health check", nodeoid); return; } + + /* XXX This fails to release the nodeDef, which is a memory leak. */ if (nodeDef->nodeishealthy) { /* hmm, can this happen? */ @@ -790,9 +1025,11 @@ elog(LOG, "node (%s:%u) down! Trying ping", NameStr(nodeDef->nodename), nodeoid); + sprintf(connstr, "host=%s port=%d", NameStr(nodeDef->nodehost), nodeDef->nodeport); + status = PGXCNodePing(connstr); if (status != 0) { @@ -813,8 +1050,10 @@ } /* - * Check if a node is indeed down and if it is update its UNHEALTHY - * status + * PoolPingNodeRecheck + * Check if a node is down, and if it is then mark it as UNHEALTHY. + * + * XXX Move to pgxcnode.c (as static), it's not used anywhere else. */ void PoolPingNodeRecheck(Oid nodeoid) @@ -858,7 +1097,11 @@ } /* - * Ping UNHEALTHY nodes as part of the maintenance window + * PoolPingNodes + * Ping nodes currently marked as UNHEALTHY. + * + * XXX Perhaps we should fetch only the unhealthy nodes, instead of + * fetching everything and then looping over them. */ void PoolPingNodes() @@ -875,7 +1118,7 @@ coHealthMap, dnHealthMap); /* - * Find unhealthy datanodes and try to re-ping them + * Find unhealthy datanodes and try to re-ping them. */ for (i = 0; i < numDn; i++) { @@ -885,8 +1128,9 @@ TryPingUnhealthyNode(nodeoid); } } + /* - * Find unhealthy coordinators and try to re-ping them + * Find unhealthy coordinators and try to re-ping them. */ for (i = 0; i < numCo; i++) { @@ -898,8 +1142,18 @@ } } +/*********************************************************************** + * Communication with a pool manager (sending messages through socket). + **********************************************************************/ + + /* - * Release handle to pool manager + * PoolManagerDisconnect + * Close connection to the pool manager and reset it to NULL. + * + * When everything goes well, the session notifies the pool manager by + * sending an exit message ('d'), closes the port and releases all + * memory associated with it. */ void PoolManagerDisconnect(void) @@ -917,7 +1171,12 @@ /* - * Get pooled connections + * PoolManagerGetConnections + * Acquire connections for requested nodes, along with their PIDs. + * + * Acquires pooled connections for the specified nodes, and returns an + * array of file descriptors representing connections to the nodes. + * It also provides an array of PIDs of the backends (on remote nodes). */ int * PoolManagerGetConnections(List *datanodelist, List *coordlist, int **pids) @@ -926,20 +1185,27 @@ ListCell *nodelist_item; int *fds; int totlen = list_length(datanodelist) + list_length(coordlist); - int nodes[totlen + 2]; + int nodes[totlen + 2]; /* node OIDs + two node counts */ + /* Make sure we're connected to the pool manager. 
*/ if (poolHandle == NULL) PoolManagerConnect(get_database_name(MyDatabaseId), GetClusterUserName(), session_options()); /* - * Prepare end send message to pool manager. - * First with Datanode list. - * This list can be NULL for a query that does not need - * Datanode Connections (Sequence DDLs) + * Prepare a message we send to the pool manager. We build it in the + * nodes array, as all the fields are int-sized. + * + * - number of datanodes + * - datanode OIDs + * - number of coordinators + * - coordinator OIDs + * + * The datanode list may be empty when the query does not need to talk + * to datanodes (e.g. sequence DDL). */ - nodes[0] = htonl(list_length(datanodelist)); - i = 1; + i = 0; + nodes[i++] = htonl(list_length(datanodelist)); if (list_length(datanodelist) != 0) { foreach(nodelist_item, datanodelist) @@ -947,7 +1213,11 @@ nodes[i++] = htonl(lfirst_int(nodelist_item)); } } - /* Then with Coordinator list (can be nul) */ + + /* + * Similarly for coordinators, some queries don't need them and in + * that case the list may be NULL. + */ nodes[i++] = htonl(list_length(coordlist)); if (list_length(coordlist) != 0) { @@ -957,10 +1227,14 @@ } } + /* + * Send the encoded datanode/coordinator OIDs to the pool manager, + * flush the message and wait for the response. + */ pool_putmessage(&poolHandle->port, 'g', (char *) nodes, sizeof(int) * (totlen + 2)); pool_flush(&poolHandle->port); - /* Receive response */ + /* Allocate memory for file descriptors (node connections). */ fds = (int *) palloc(sizeof(int) * totlen); if (fds == NULL) { @@ -968,14 +1242,19 @@ (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); } + + /* receive file descriptors */ if (pool_recvfds(&poolHandle->port, fds, totlen)) { + elog(WARNING, "failed to receive file descriptors for connections"); pfree(fds); fds = NULL; } + /* receive PIDs for remote backends */ if (pool_recvpids(&poolHandle->port, pids) != totlen) { + elog(WARNING, "failed to receive PIDs of remote backends"); pfree(*pids); *pids = NULL; return NULL; @@ -984,16 +1263,26 @@ return fds; } + /* - * Abort active transactions using pooler. - * Take a lock forbidding access to Pooler for new transactions. + * PoolManagerAbortTransactions + * Abort active transactions on connections in a particular pool. + * + * Simply send an 'abort' message to the pool manager, which then aborts + * in-progress transactions on all connections in a matching DatabasePool + * (identified by dbname/username). + * + * Currently this only happens during CLEAN CONNECTION. + * + * An array of PIDs on which transactions were aborted is returned + * through the proc_pids argument; the number of PIDs is the return value. */ int PoolManagerAbortTransactions(char *dbname, char *username, int **proc_pids) { int num_proc_ids = 0; int n32, msglen; - char msgtype = 'a'; + char msgtype = 'a'; int dblen = dbname ? strlen(dbname) + 1 : 0; int userlen = username ? strlen(username) + 1 : 0; @@ -1039,10 +1328,12 @@ /* - * Clean up Pooled connections + * PoolManagerCleanConnection + * Performs a cleanup of pooled connections.
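+ *
+ * This is the backend-side half of the CLEAN CONNECTION command, so
+ * a typical trigger is something like (a sketch of the Postgres-XC
+ * syntax, with nodename and dbname as placeholders):
+ *
+ *	CLEAN CONNECTION TO NODE (nodename) FOR DATABASE dbname;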
*/ void -PoolManagerCleanConnection(List *datanodelist, List *coordlist, char *dbname, char *username) +PoolManagerCleanConnection(List *datanodelist, List *coordlist, + char *dbname, char *username) { int totlen = list_length(datanodelist) + list_length(coordlist); int nodes[totlen + 2]; @@ -1052,16 +1343,25 @@ int userlen = username ? strlen(username) + 1 : 0; int dblen = dbname ? strlen(dbname) + 1 : 0; - /* - * New connection may be established to clean connections to - * specified nodes and databases. - */ + /* Make sure we're connected to the pool manager. */ if (poolHandle == NULL) PoolManagerConnect(get_database_name(MyDatabaseId), GetClusterUserName(), session_options()); - nodes[0] = htonl(list_length(datanodelist)); - i = 1; + /* + * Prepare a message we send to the pool manager. We build it in the + * nodes array, as all the fields are int-sized. + * + * - number of datanodes + * - datanode OIDs + * - number of coordinators + * - coordinator OIDs + * + * The datanode list may be empty when the query does not need to talk + * to datanodes (e.g. sequence DDL). + */ + i = 0; + nodes[i++] = htonl(list_length(datanodelist)); if (list_length(datanodelist) != 0) { foreach(nodelist_item, datanodelist) @@ -1069,7 +1369,11 @@ nodes[i++] = htonl(lfirst_int(nodelist_item)); } } - /* Then with Coordinator list (can be nul) */ + + /* + * Similarly for coordinators, some queries don't need them and in + * that case the list may be NULL. + */ nodes[i++] = htonl(list_length(coordlist)); if (list_length(coordlist) != 0) { @@ -1117,21 +1421,32 @@ /* - * Check connection information consistency cached in pooler with catalog information + * PoolManagerCheckConnectionInfo + * Check that pool manager info is consistent with the node catalog. + * + * Check that information used by the pool manager (for open connections) + * is consistent with the system catalog. + * + * Returns 'true' when everything seems consistent, and 'false' in case + * of some inconsistency. */ bool PoolManagerCheckConnectionInfo(void) { int res; - /* - * New connection may be established to clean connections to - * specified nodes and databases. - */ + /* Make sure we're connected to the pool manager. */ if (poolHandle == NULL) PoolManagerConnect(get_database_name(MyDatabaseId), GetClusterUserName(), session_options()); + + /* + * The name is a bit misleading, but PgxcNodeListAndCount updates + * information about nodes in shared memory from system catalog. + */ PgxcNodeListAndCount(); + + /* Send message to the pool manager and wait for a response. */ pool_putmessage(&poolHandle->port, 'q', NULL, 0); pool_flush(&poolHandle->port); @@ -1145,7 +1460,8 @@ /* - * Reload connection data in pooler and drop all the existing connections of pooler + * PoolManagerReloadConnectionInfo + * Reload connection metadata and close all open connections. */ void PoolManagerReloadConnectionInfo(void) @@ -1156,11 +1472,14 @@ pool_flush(&poolHandle->port); } + /* - * Refresh connection data in pooler and drop connections for those nodes - * that have changed. 
Thus, this operation is less - * destructive as compared - * to PoolManagerReloadConnectionInfo and should typically be called when - * NODE ALTER has been performed + * PoolManagerRefreshConnectionInfo + * Refresh connection metadata and close stale connections. + * + * Unlike PoolManagerReloadConnectionInfo, this only closes connections + * to nodes where the metadata changed. Thus, this operation is less + * destructive, and should typically be called after NODE ALTER. */ int PoolManagerRefreshConnectionInfo(void) @@ -1180,6 +1499,17 @@ return false; } + +/*********************************************************************** + * Handling of messages sent to the pool manager (through the socket). + **********************************************************************/ + +/* + * handle_abort + * Handles 'abort transaction' action. + * + * The message is built and sent by PoolManagerAbortTransactions. + */ static void handle_abort(PoolAgent * agent, StringInfo s) { @@ -1206,6 +1536,15 @@ pfree(pids); } +/* + * handle_connect + * Initializes a PoolAgent object and associates it with a pool. + * + * Once the connection is established, the agent is associated with a database + * pool and can provide pooled connections. + * + * The message is built and sent by PoolManagerConnect. + */ static void handle_connect(PoolAgent * agent, StringInfo s) { @@ -1226,14 +1565,19 @@ len = pq_getmsgint(s, 4); pgoptions = pq_getmsgbytes(s, len); - /* - * Coordinator pool is not initialized. - * With that it would be impossible to create a Database by default. - */ + /* Initialize the agent - find the proper DatabasePool, etc. */ agent_init(agent, database, user_name, pgoptions); + + /* XXX Shouldn't this be before the agent_init? */ pq_getmsgend(s); } +/* + * handle_clean_connection + * Handles CLEAN CONNECTION command. + * + * The message is built and sent by PoolManagerCleanConnection. + */ static void handle_clean_connection(PoolAgent * agent, StringInfo s) { @@ -1275,15 +1619,21 @@ pq_getmsgend(s); - /* Clean up connections here */ + /* perform the actual connection cleanup */ res = clean_connection(nodelist, database, user_name); list_free(nodelist); - /* Send success result */ + /* send result (success/failure) back */ pool_sendres(&agent->port, res); } +/* + * handle_get_connections + * Acquire pooled connections to the specified nodes. + * + * The message is built and sent by PoolManagerGetConnections. 
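+ *
+ * As described there, the payload is a plain array of int32 values
+ * in network byte order, roughly:
+ *
+ *	int32	number of datanodes
+ *	int32	datanode index (one per datanode)
+ *	int32	number of coordinators
+ *	int32	coordinator index (one per coordinator)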
+ */ static void handle_get_connections(PoolAgent * agent, StringInfo s) { @@ -1294,22 +1644,26 @@ List *coordlist = NIL; /* - * Length of message is caused by: - * - Message header = 4bytes - * - List of Datanodes = NumPoolDataNodes * 4bytes (max) - * - List of Coordinators = NumPoolCoords * 4bytes (max) - * - Number of Datanodes sent = 4bytes - * - Number of Coordinators sent = 4bytes - * It is better to send in a same message the list of Co and Dn at the same - * time, this permits to reduce interactions between postmaster and pooler + * The message consists of: + * + * - Message header = 4B + * - Number of Datanodes sent = 4B + * - List of Datanodes = NumPoolDataNodes * 4B (max) + * - Number of Coordinators sent = 4B + * - List of Coordinators = NumPoolCoords * 4B (max) */ + pool_getmessage(&agent->port, s, 4 * agent->num_dn_connections + 4 * agent->num_coord_connections + 12); + /* decode the datanode OIDs */ datanodecount = pq_getmsgint(s, 4); for (i = 0; i < datanodecount; i++) datanodelist = lappend_int(datanodelist, pq_getmsgint(s, 4)); - /* It is possible that no Coordinators are involved in the transaction */ + /* + * decode the coordinator OIDs (there may be none, if no coordinators + * are involved in the transaction) + */ coordcount = pq_getmsgint(s, 4); for (i = 0; i < coordcount; i++) coordlist = lappend_int(coordlist, pq_getmsgint(s, 4)); @@ -1327,19 +1681,23 @@ list_free(datanodelist); list_free(coordlist); + /* Send the file descriptors back, along with the correct count. */ pool_sendfds(&agent->port, fds, fds ? datanodecount + coordcount : 0); if (fds) pfree(fds); - /* - * Also send the PIDs of the remote backend processes serving - * these connections - */ + /* Also send PIDs of the remote backends serving the connections. */ pool_sendpids(&agent->port, pids, pids ? datanodecount + coordcount : 0); if (pids) pfree(pids); } +/* + * handle_query_cancel + * Cancel query executed on connections associated with the agent. + * + * The message is built and sent by PoolManagerCancelQuery. + */ static void handle_query_cancel(PoolAgent * agent, StringInfo s) { @@ -1378,7 +1736,8 @@ } /* - * Handle messages to agent + * agent_handle_input + * Handle messages passed to the pool agent from PoolerLoop(). */ static void agent_handle_input(PoolAgent * agent, StringInfo s) @@ -1500,7 +1859,12 @@ } /* - * acquire connection + * agent_acquire_connections + * Acquire connections to specified nodes, associate them with agent. + * + * Returns an array of file descriptors representing the connections, with + * order matching the datanode/coordinator list. Also returns an array of + * PIDs of the backends handling those connections (on the remote nodes). */ static int * agent_acquire_connections(PoolAgent *agent, List *datanodelist, @@ -1526,12 +1890,17 @@ } /* - * Allocate memory - * File descriptors of Datanodes and Coordinators are saved in the same array, - * This array will be sent back to the postmaster. - * It has a length equal to the length of the Datanode list - * plus the length of the Coordinator list. - * Datanode fds are saved first, then Coordinator fds are saved. + * Allocate memory for the file descriptors and backend PIDs. 
+ * + * File descriptors of datanodes and coordinators are both saved in + * a single array, which is then sent back to the backend. Datanodes + * are stored first, coordinators second, and the order matches the + * order of input lists. + * + * And similarly for the PIDs - single array, datanodes first. + * + * XXX How expensive is it to do the list_length over and over? Maybe + * do the count once and then use the value elsewhere? */ result = (int *) palloc((list_length(datanodelist) + list_length(coordlist)) * sizeof(int)); if (result == NULL) @@ -1550,15 +1919,13 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist, } /* - * There are possible memory allocations in the core pooler, we want - * these allocations in the contect of the database pool + * Make sure the results (connections) are allocated in the memory + * context for the DatabasePool. */ oldcontext = MemoryContextSwitchTo(agent->pool->mcxt); - - /* Initialize result */ + /* first open connections to the datanodes */ i = 0; - /* Save in array fds of Datanodes first */ foreach(nodelist_item, datanodelist) { int node = lfirst_int(nodelist_item); @@ -1586,6 +1953,8 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist, * Update newly-acquired slot with session parameters. * Local parameters are fired only once BEGIN has been launched on * remote nodes. + * + * FIXME Perhaps we should be doing something here? */ } @@ -1593,7 +1962,10 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist, (*pids)[i++] = ((PGconn *) agent->dn_connections[node]->conn)->be_pid; } - /* Save then in the array fds for Coordinators */ + /* make sure we got the expected number of datanode connections */ + Assert(i == list_length(datanodelist)); + + /* and then the coordinators */ foreach(nodelist_item, coordlist) { int node = lfirst_int(nodelist_item); @@ -1620,6 +1992,8 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist, * Update newly-acquired slot with session parameters. * Local parameters are fired only once BEGIN has been launched on * remote nodes. + * + * FIXME Perhaps we should be doing something here? */ } @@ -1629,11 +2003,15 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist, MemoryContextSwitchTo(oldcontext); + /* make sure we got the expected total number of connections */ + Assert(i == list_length(datanodelist) + list_length(coordlist)); + return result; } /* - * Cancel query + * cancel_query_on_connections + * Cancel query running on connections managed by a PoolAgent. */ static int cancel_query_on_connections(PoolAgent *agent, List *datanodelist, List *coordlist) @@ -1706,7 +2084,8 @@ cancel_query_on_connections(PoolAgent *agent, List *datanodelist, List *coordlis } /* - * Return connections back to the pool + * PoolManagerReleaseConnections + * Return all connections back to the pool. */ void PoolManagerReleaseConnections(bool force) @@ -1715,7 +2094,10 @@ PoolManagerReleaseConnections(bool force) int n32; int msglen = 8; - /* If disconnected from pooler all the connections already released */ + /* + * If disconnected from the pool manager, all the connections were + * already released. + */ if (!poolHandle) return; @@ -1735,7 +2117,8 @@ PoolManagerReleaseConnections(bool force) } /* - * Cancel Query + * PoolManagerCancelQuery + * Cancel query on all nodes where it's running. 
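+ *
+ * It takes arrays of node indexes plus their counts; a minimal
+ * sketch of a call (the index values are hypothetical):
+ *
+ *	int dn_nodes[] = {0, 1};
+ *
+ *	PoolManagerCancelQuery(2, dn_nodes, 0, NULL);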
+ */ void PoolManagerCancelQuery(int dn_count, int* dn_list, int co_count, int* co_list) @@ -1794,7 +2177,10 @@ } /* - * Release connections for Datanodes and Coordinators + * agent_release_connections + * Release connections associated with a PoolAgent instance. */ static void agent_release_connections(PoolAgent *agent, bool force_destroy) @@ -1802,8 +2188,15 @@ MemoryContext oldcontext; int i; + /* If there are no open connections in the agent, we're done. */ if (!agent->dn_connections && !agent->coord_connections) return; + + /* + * In a PAUSED cluster (see src/backend/pgxc/cluster/pause.c) we can't + * return any connections to the connection pools; we can only close + * them, so we require 'force'. + */ if (!force_destroy && cluster_ex_lock_held) { elog(LOG, "Not releasing connection with cluster lock"); @@ -1811,29 +2204,33 @@ } /* - * There are possible memory allocations in the core pooler, we want - * these allocations in the contect of the database pool + * Make sure all allocations happen in the DatabasePool memory context + * (and not for example in the main pooler context, which would cause + * memory leaks, or in caller's context, likely causing crashes). */ oldcontext = MemoryContextSwitchTo(agent->pool->mcxt); /* - * Remaining connections are assumed to be clean. - * First clean up for Datanodes + * All currently open connections are assumed to be 'clean', so just + * return them to the pool (or close them, with force_destroy). + * First the datanodes, then coordinators. */ for (i = 0; i < agent->num_dn_connections; i++) { PGXCNodePoolSlot *slot = agent->dn_connections[i]; /* - * Release connection. + * Release the connection. + * * If connection has temporary objects on it, destroy connection slot. */ if (slot) release_connection(agent->pool, slot, agent->dn_conn_oids[i], force_destroy); + agent->dn_connections[i] = NULL; elog(DEBUG1, "Released connection to node %d", agent->dn_conn_oids[i]); } - /* Then clean up for Coordinator connections */ + for (i = 0; i < agent->num_coord_connections; i++) { PGXCNodePoolSlot *slot = agent->coord_connections[i]; @@ -1844,6 +2241,7 @@ */ if (slot) release_connection(agent->pool, slot, agent->coord_conn_oids[i], force_destroy); + agent->coord_connections[i] = NULL; elog(DEBUG1, "Released connection to node %d", agent->coord_conn_oids[i]); } @@ -1851,7 +2249,7 @@ /* * Released connections are now in the pool and we may want to close * them eventually. Update the oldest_idle value to reflect the latest - * last access time if not already updated.. + * last access time if not already updated. */ if (!force_destroy && agent->pool->oldest_idle == (time_t) 0) agent->pool->oldest_idle = time(NULL); @@ -1859,13 +2257,24 @@ MemoryContextSwitchTo(oldcontext); } + +/*********************************************************************** + * Pool Management + **********************************************************************/ + /* - * Create new empty pool for a database. - * By default Database Pools have a size null so as to avoid interactions - * between PGXC nodes in the cluster (Co/Co, Dn/Dn and Co/Dn). 
- * Pool is increased at the first GET_CONNECTION message received. - * Returns POOL_OK if operation succeed POOL_FAIL in case of OutOfMemory - * error and POOL_WEXIST if poll for this database already exist. + * create_database_pool + * Create new empty pool for a database/user combination. + * + * We only initialize the database pool and add it to the global list, + * but do not try to preallocate any connections. That only happens when + * the first request for a connection arrives. + * + * Returns a pointer to the new DatabasePool in case of success, NULL + * when something fails (out of memory, etc.) + * + * XXX Should we add some protection against duplicate pools? Probably + * not really necessary. */ static DatabasePool * create_database_pool(const char *database, const char *user_name, const char *pgoptions) @@ -1878,14 +2287,18 @@ elog(DEBUG1, "Creating a connection pool for database %s, user %s," " with pgoptions %s", database, user_name, pgoptions); + /* create a memory context for the database pool */ dbcontext = AllocSetContextCreate(PoolerCoreContext, - "DB Context", + "Database Pool Context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); + oldcontext = MemoryContextSwitchTo(dbcontext); - /* Allocate memory */ + + /* Allocate memory (already in the dbpool memory context) */ databasePool = (DatabasePool *) palloc(sizeof(DatabasePool)); + if (!databasePool) { /* out of memory */ @@ -1896,15 +2309,16 @@ } databasePool->mcxt = dbcontext; - /* Copy the database name */ + + /* copy the basic details about the pool */ databasePool->database = pstrdup(database); - /* Copy the user name */ databasePool->user_name = pstrdup(user_name); - /* Reset the oldest_idle value */ - databasePool->oldest_idle = (time_t) 0; - /* Copy the pgoptions */ databasePool->pgoptions = pstrdup(pgoptions); + /* reset the oldest_idle value */ + databasePool->oldest_idle = (time_t) 0; + + /* FIXME We should check all the parameters we just copied. */ if (!databasePool->database) { /* out of memory */ @@ -1931,7 +2345,7 @@ MemoryContextSwitchTo(oldcontext); - /* Insert into the list */ + /* insert the new database pool into the global list */ insert_database_pool(databasePool); return databasePool; } /* - * Destroy the pool and free memory + * destroy_database_pool + * Destroy a database pool for a user/dbname combination. + * + * When a matching database pool exists, we destroy all the node pools + * (which closes all the connections), and release the memory context. + * + * Returns 1 in case of success (when pool exists), 0 when a matching + * pool was not found. + * + * XXX Maybe return true/false instead? */ static int destroy_database_pool(const char *database, const char *user_name) @@ -1965,19 +2388,28 @@ MemoryContextDelete(databasePool->mcxt); return 1; } + + elog(DEBUG1, "Connection pool for database %s, user %s not found", + database, user_name); + return 0; } /* - * Insert new database pool to the list + * insert_database_pool + * Insert the newly created pool to the head of the global pool list. 
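+ *
+ * Since we always insert at the head, creating pools A, B and C (in
+ * that order) leaves the global list as C -> B -> A, i.e. the most
+ * recently created pool is found first by find_database_pool.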
*/ static void insert_database_pool(DatabasePool *databasePool) { Assert(databasePool); - /* Reference existing list or null the tail */ + /* + * Reference existing list or null the tail + * + * XXX The 'if' seems somewhat unnecessary I guess ... + */ if (databasePools) databasePool->next = databasePools; else @@ -1989,7 +2421,10 @@ /* * reload_database_pools - * rebuild connection information for all database pools + * Rebuild connection information for all database pools. + * + * Connection information reload applies to all database pools (not + * just the one associated with the current pool agent). * * A database pool is reloaded as follows for each remote node: * @@ -1999,21 +2434,22 @@ * - node pool is deleted if its port or host information is changed. * Subsequently all its connections are dropped. * - * - node pool is kept unchanged with existing connection information - * is not changed. However its index position in node pool is changed - * according to the alphabetical order of the node name in new - * cluster configuration. + * - node pool is kept unchanged if the connection information has not + * changed. However its index position in node pool changes according + * to the alphabetical order of the node name in the new configuration. * * Backend sessions are responsible to reconnect to the pooler to update * their agent with newest connection information. * - * The session invocating connection information reload is reconnected - * and uploaded automatically after database pool reload. Other server - * sessions are signaled to reconnect to pooler and update their - * connection information separately. + * The session that triggered the connection metadata reload reconnects + * automatically after the reload. Other server sessions are signaled + * to reconnect to pooler and update their connection info separately. * * During reload process done internally on pooler, pooler is locked * to forbid new connection requests. + * + * XXX Where does the locking happen? + * XXX Where do we signal the other sessions? */ static void reload_database_pools(PoolAgent *agent) @@ -2023,26 +2459,29 @@ elog(DEBUG1, "Reloading database pools"); /* - * Release node connections if any held. It is not guaranteed client session - * does the same so don't ever try to return them to pool and reuse + * Release node connections if any held. It is not guaranteed client + * session does the same so we don't ever try to return them to pool + * for reuse, and instead just close them. */ agent_release_connections(agent, true); /* Forget previously allocated node info */ MemoryContextReset(agent->mcxt); - /* and allocate new */ + /* And allocate a blank copy. */ PgxcNodeGetOids(&agent->coord_conn_oids, &agent->dn_conn_oids, - &agent->num_coord_connections, &agent->num_dn_connections, false); + &agent->num_coord_connections, &agent->num_dn_connections, + false); agent->coord_connections = (PGXCNodePoolSlot **) palloc0(agent->num_coord_connections * sizeof(PGXCNodePoolSlot *)); + agent->dn_connections = (PGXCNodePoolSlot **) palloc0(agent->num_dn_connections * sizeof(PGXCNodePoolSlot *)); /* - * Scan the list and destroy any altered pool. They will be recreated - * upon subsequent connection acquisition. + * Scan the list of database pools and destroy any altered pool. The + * pools will be recreated upon subsequent connection acquisition. 
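+ *
+ * A pool counts as "altered" when the connection string stored in a
+ * node pool no longer matches what build_node_conn_str produces from
+ * the current catalog (e.g. after ALTER NODE changed host or port).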
*/ databasePool = databasePools; while (databasePool) @@ -2074,18 +2513,19 @@ /* * refresh_database_pools - * refresh information for all database pools + * Refresh information for all database pools. + * + * Connection information refresh applies to all database pools (not + * just the one associated with the current pool agent). * - * Connection information refresh concerns all the database pools. * A database pool is refreshed as follows for each remote node: * * - node pool is deleted if its port or host information is changed. * Subsequently all its connections are dropped. * - * If any other type of activity is found, we error out. - * - * XXX I don't see any cases that would error out. Isn't the comment - * simply obsolete? + * If any other type of activity is found (e.g. an added or deleted node) + * we error out (and return POOL_REFRESH_FAILED). In case of success we + * return POOL_REFRESH_SUCCESS. */ static int refresh_database_pools(PoolAgent *agent) @@ -2117,7 +2557,7 @@ /* * Scan the list and destroy any altered pool. They will be recreated - * upon subsequent connection acquisition. + * automatically upon subsequent connection acquisition. */ databasePool = databasePools; while (res == POOL_REFRESH_SUCCESS && databasePool) @@ -2132,7 +2572,10 @@ /* * Since we re-checked the numbers above, we should not get - * the case of an ADDED or a DELETED node here.. + * the case of an ADDED or a DELETED node here. + * + * Newly added nodes are detected indirectly (same node count + * and no deleted nodes means no added nodes either). */ if (connstr_chk == NULL) { @@ -2145,10 +2588,10 @@ if (strcmp(connstr_chk, nodePool->connstr)) { elog(LOG, "Found an altered node (%u)", nodePool->nodeoid); + /* - * Node has been altered. First remove - * all references to this node from ALL the - * agents before destroying it.. + * Node has been altered. First remove all references to + * this node from ALL the agents before destroying it. */ if (!remove_all_agent_references(nodePool->nodeoid)) { @@ -2156,6 +2599,7 @@ break; } + /* And now destroy the node pool. */ destroy_node_pool(nodePool); hash_search(databasePool->nodePools, &nodePool->nodeoid, HASH_REMOVE, NULL); @@ -2167,9 +2611,17 @@ databasePool = databasePool->next; } + return res; } +/* + * remove_all_agent_references + * Remove all references to a specified node from all PoolAgents. + * + * XXX This is yet another place unnecessarily complicated by keeping + * datanodes and coordinators separate. + */ static bool remove_all_agent_references(Oid nodeoid) { @@ -2177,8 +2629,7 @@ bool res = true; /* - * Identify if it's a coordinator or datanode first - * and get its index + * Identify if it's a coordinator or datanode first and get its index. */ for (i = 1; i <= agentCount; i++) { @@ -2228,14 +2679,20 @@ } /* - * Find pool for specified database and username in the list + * find_database_pool + * Find a DatabasePool for specified database/username combination. + * + * Returns a pointer to the database pool if it exists, NULL otherwise. 
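+ *
+ * Callers generally use this in a find-or-create pattern, as in
+ * agent_init:
+ *
+ *	pool = find_database_pool(database, user_name, pgoptions);
+ *	if (pool == NULL)
+ *		pool = create_database_pool(database, user_name, pgoptions);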
*/ static DatabasePool * -find_database_pool(const char *database, const char *user_name, const char *pgoptions) +find_database_pool(const char *database, const char *user_name, + const char *pgoptions) { DatabasePool *databasePool; - /* Scan the list */ + Assert(database && user_name && pgoptions); + + /* scan the list */ databasePool = databasePools; while (databasePool) { @@ -2243,14 +2700,21 @@ find_database_pool(const char *database, const char *user_name, const char *pgop strcmp(user_name, databasePool->user_name) == 0 && strcmp(pgoptions, databasePool->pgoptions) == 0) break; + databasePool = databasePool->next; } + return databasePool; } /* - * Remove pool for specified database from the list + * remove_database_pool + * Remove database pool for database/username combination from the list. + * + * Only removes the pool from the global list, but does not destroy it. + * This allows doing additional maintenance on the database pool (e.g. + * destroy all the node pools, etc.) */ static DatabasePool * remove_database_pool(const char *database, const char *user_name) @@ -2258,21 +2722,24 @@ remove_database_pool(const char *database, const char *user_name) DatabasePool *databasePool, *prev; + Assert(database && user_name); + /* Scan the list */ databasePool = databasePools; prev = NULL; while (databasePool) { - /* if match break the loop and return */ + /* if the pool matches, break the loop */ if (strcmp(database, databasePool->database) == 0 && strcmp(user_name, databasePool->user_name) == 0) break; + prev = databasePool; databasePool = databasePool->next; } - /* if found */ + /* if found a matching pool, remove it from the list */ if (databasePool) { @@ -2285,11 +2752,29 @@ remove_database_pool(const char *database, const char *user_name) databasePool->next = NULL; } + else + elog(LOG, "database pool for %s/%s not found", + database, user_name); + + return databasePool; } /* - * Acquire connection + * acquire_connection + * Acquire connection to a given node from a specified pool. + * + * The node connection is acquired in one of two ways: + * + * (a) By reusing a connection already available in the connection pool. + * + * (b) By opening a fresh connection (when freeSize==0). + * + * Returns a PGXCNodePoolSlot pointer in case of success, NULL when the + * connection can't be obtained. + * + * Also updates node health information in the shared memory, both in + * case of success (healthy) or failure (unhealthy). */ static PGXCNodePoolSlot * acquire_connection(DatabasePool *dbPool, Oid node) @@ -2298,21 +2783,25 @@ acquire_connection(DatabasePool *dbPool, Oid node) PGXCNodePoolSlot *slot; Assert(dbPool); + Assert(OidIsValid(node)); - nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node, HASH_FIND, - NULL); + /* see if we have pool for the node */ + nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node, + HASH_FIND, NULL); /* - * When a Coordinator pool is initialized by a Coordinator Postmaster, - * it has a NULL size and is below minimum size that is 1 - * This is to avoid problems of connections between Coordinators - * when creating or dropping Databases. + * If there are no free connections in the node pool, grow it. + * + * Coordinator pools initialized by a coordinator postmaster are + * initially empty. This is to avoid problems of connections between + * coordinators when creating or dropping databases. 
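+ *
+ * Either way, the first acquisition for a node goes through
+ * grow_pool, which creates the node pool if it's missing and opens
+ * a fresh connection if none is free.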
*/ if (nodePool == NULL || nodePool->freeSize == 0) nodePool = grow_pool(dbPool, node); slot = NULL; - /* Check available connections */ + + /* check available connections */ while (nodePool && nodePool->freeSize > 0) { int poll_result; @@ -2323,14 +2812,26 @@ acquire_connection(DatabasePool *dbPool, Oid node) if (PQsocket((PGconn *) slot->conn) > 0) { /* - * Make sure connection is ok, destroy connection slot if there is a - * problem. + * Check if the connection is ok, destroy the connection + * slot if there is a problem. + * + * XXX Not sure how expensive this is, but perhaps we should + * check the connections differently (not in the hot path + * when requesting the connection, when every instruction + * makes a difference). This seems particularly pointless + * when the connection was just opened by grow_pool(). + * + * XXX Perhaps we can do this only when the connection is + * old enough (e.g. using slot->released)? */ poll_result = pqReadReady((PGconn *) slot->conn); + /* ok, no data - we have a working connection */ if (poll_result == 0) - break; /* ok, no data */ - else if (poll_result < 0) + break; + + /* something went wrong - retry, if possible */ + if (poll_result < 0) { if (errno == EAGAIN || errno == EINTR) goto retry; @@ -2346,6 +2847,7 @@ acquire_connection(DatabasePool *dbPool, Oid node) /* Decrement current max pool size */ (nodePool->size)--; + /* Ensure we are not below minimum size */ nodePool = grow_pool(dbPool, node); } @@ -2355,8 +2857,8 @@ acquire_connection(DatabasePool *dbPool, Oid node) elog(WARNING, "can not connect to node %u", node); /* - * before returning, also update the shared health - * status field to indicate that this node is down + * Before returning, update the node health status in shared + * memory to indicate this node is down. */ if (!PgxcNodeUpdateHealth(node, false)) elog(WARNING, "Could not update health status of node %u", node); @@ -2364,6 +2866,10 @@ acquire_connection(DatabasePool *dbPool, Oid node) elog(WARNING, "Health map updated to reflect DOWN node (%u)", node); } else + /* + * XXX Is this necessary? Isn't this just another source of latency + * in the connection-acquisition path? + */ PgxcNodeUpdateHealth(node, true); return slot; @@ -2371,7 +2877,13 @@ acquire_connection(DatabasePool *dbPool, Oid node) /* - * release connection from specified pool and slot + * release_connection + * Return a connection to a pool, or close it entirely. + * + * Release a connection - either return it back to the database pool + * (or more precisely to the node pool in that database pool), or force + * closing it (necessary for example when the session fails and we are + * not sure whether the connection is in consistent state). */ static void release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, @@ -2381,40 +2893,61 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, Assert(dbPool); Assert(slot); + Assert(OidIsValid(node)); + + nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node, + HASH_FIND, NULL); - nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node, HASH_FIND, - NULL); + /* + * When the node pool does not exist, the node was probably either + * dropped or altered. In both cases the connection is no longer + * valid, so just close it. + */ if (nodePool == NULL) { - /* - * The node may be altered or dropped. - * In any case the slot is no longer valid. 
- */ + elog(WARNING, "Node pool (%d) does not exist anymore, closing connection", + node); + destroy_slot(slot); return; } - /* return or discard */ + /* + * The node pool exists, but we've been asked to forcefully close + * the connection, so do as asked. + */ - if (!force_destroy) + if (force_destroy) { - /* Insert the slot into the array and increase pool size */ - nodePool->slot[(nodePool->freeSize)++] = slot; - slot->released = time(NULL); - } - else - { - elog(DEBUG1, "Cleaning up connection from pool %s, closing", nodePool->connstr); + elog(DEBUG1, "Cleaning up connection from pool %s (node %d), closing", + nodePool->connstr, node); + destroy_slot(slot); + /* Decrement pool size */ (nodePool->size)--; + /* Ensure we are not below minimum size */ grow_pool(dbPool, node); + + return; } + + /* + * Everything peachy, so just insert the connection (slot) into the + * array and increase the number of free connections in the pool. + * Also note the timestamp when the connection was released. + */ + nodePool->slot[(nodePool->freeSize)++] = slot; + slot->released = time(NULL); } /* - * Increase database pool size, create new if does not exist + * grow_pool + * Increase size of a pool for a particular node if needed. + * + * If the node pool (for the specified node) does not exist, it will be + * created automatically. */ static PGXCNodePool * grow_pool(DatabasePool *dbPool, Oid node) @@ -2425,10 +2958,18 @@ bool found; Assert(dbPool); + Assert(OidIsValid(node)); + /* lookup node pool, create it if needed */ nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node, HASH_ENTER, &found); + + /* + * XXX Aren't we calling this even when the connstr already exists? + * Seems a bit wasteful, I guess. + */ nodePool->connstr = build_node_conn_str(node, dbPool); + if (!nodePool->connstr) { ereport(ERROR, @@ -2436,6 +2977,10 @@ errmsg("could not build connection string for node %u", node))); } + /* + * XXX Shouldn't this really be called right after the hash_search + * (and before we do the build_node_conn_str)? + */ if (!found) { nodePool->slot = (PGXCNodePoolSlot **) palloc0(MaxPoolSize * sizeof(PGXCNodePoolSlot *)); @@ -2449,6 +2994,11 @@ nodePool->size = 0; } + /* + * If there are no free connections, try to create one. But do not + * exceed MaxPoolSize, i.e. the maximum number of connections in + * a node pool. + */ while (nodePool->freeSize == 0 && nodePool->size < MaxPoolSize) { PGXCNodePoolSlot *slot; @@ -2475,16 +3025,22 @@ " connection error (%s)", nodePool->connstr, PQerrorMessage((PGconn*) slot->conn)))); + destroy_slot(slot); + /* - * If we failed to connect probably number of connections on the - * target node reached max_connections. Try and release idle - * connections and try again. - * We do not want to enter endless loop here and run maintenance - * procedure only once. - * It is not safe to run the maintenance procedure if no connections - * from that pool currently in use - the node pool may be destroyed - * in that case. + * If we failed to connect, probably the number of connections on + * the target node reached max_connections. Release idle connections + * from this node, and retry. + * + * We do not want to enter an endless loop here, so we only try + * releasing idle connections once. + * + * It is not safe to run the maintenance from a pool with no + * active connections, as the maintenance might kill the pool. 
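+ *
+ * In short, the retry logic below is: if the connection attempt
+ * failed and this pool still has some connections checked out, run
+ * pools_maintenance() once and try connecting again.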
+ * + * XXX Maybe temporarily marking the pool, so that it does not + * get removed (pinned=true) would do the trick? */ if (tryagain && nodePool->size > nodePool->freeSize) { @@ -2497,24 +3053,34 @@ slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn); slot->released = time(NULL); + + /* + * No need to compare the oldest_idle here, as every existing + * idle connection is automatically older than the new one. Only + * if there are no other idle connections this one is the oldest. + */ if (dbPool->oldest_idle == (time_t) 0) dbPool->oldest_idle = slot->released; - /* Insert at the end of the pool */ + /* Insert the new slot to the last place in the node pool. */ nodePool->slot[(nodePool->freeSize)++] = slot; - /* Increase count of pool size */ + /* Increase the size of the node pool. */ (nodePool->size)++; + + elog(DEBUG1, "Pooler: increased pool size to %d for pool %s (%u)", nodePool->size, - nodePool->connstr); + nodePool->connstr, + node); } + return nodePool; } /* - * Destroy pool slot + * destroy_slot + * Destroy a connection slot (free cancel info and the slot itself). */ static void destroy_slot(PGXCNodePoolSlot *slot) @@ -2529,7 +3095,10 @@ /* - * Destroy node pool + * destroy_node_pool + * Close any remaining connections to the node and destroy the slots. + * + * XXX This does not release the node_pool itself. Not sure if correct. */ static void destroy_node_pool(PGXCNodePool *node_pool) @@ -2546,6 +3115,7 @@ */ elog(DEBUG1, "About to destroy node pool %s, current size is %d, %d connections are in use", node_pool->connstr, node_pool->freeSize, node_pool->size - node_pool->freeSize); + if (node_pool->connstr) pfree(node_pool->connstr); @@ -2553,13 +3123,21 @@ { for (i = 0; i < node_pool->freeSize; i++) destroy_slot(node_pool->slot[i]); + pfree(node_pool->slot); } } /* - * Main handling loop + * PoolerLoop + * Main handling loop of the pool manager. + * + * Has three main responsibilities: + * + * - triggering regular pool maintenance + * - responding to postmaster events (e.g. shutdown) + * - forwarding messages to pool agents (which handle them) */ static void PoolerLoop(void) @@ -2725,7 +3303,7 @@ /* * Agent may be removed from the array while processing * and trailing items are shifted, so scroll downward - * to avoid problem + * to avoid problems. */ for (i = agentCount - 1; agentCount > 0 && i >= 0; i--) { @@ -2737,6 +3315,7 @@ agent_handle_input(agent, &input_message); } + /* New session without an existing agent. */ if (pool_fd[0].revents & POLLIN) agent_create(); } @@ -2751,9 +3330,18 @@ } /* - * Clean Connection in all Database Pools for given Datanode and Coordinator list + * clean_connection + * Clean connections for specified nodes in matching database pools. + * + * The function closes all unused connections to nodes specified in the + * node_discard list, in all database pools for the dbname/username + * combination. There may be multiple matching pools, with different + * pgoptions values. + * + * XXX The code handles NULL values in database/username, but not sure + * if that's really needed? 
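+ *
+ * (A NULL database or user_name matches everything, so for example
+ * clean_connection(nodes, NULL, NULL) would discard the unused
+ * connections to the listed nodes in every database pool.)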
@@ -2751,9 +3330,18 @@
 }

 /*
- * Clean Connection in all Database Pools for given Datanode and Coordinator list
+ * clean_connection
+ *		Clean connections for specified nodes in matching database pools.
+ *
+ * The function closes all unused connections to nodes specified in the
+ * node_discard list, in all database pools for the dbname/username
+ * combination. There may be multiple matching pools, with different
+ * pgoptions values.
+ *
+ * XXX The code handles NULL values in database/username, but not sure
+ * if that's really needed?
 */
-int
+static int
 clean_connection(List *node_discard, const char *database, const char *user_name)
 {
 	DatabasePool *databasePool;
@@ -2766,7 +3354,7 @@ clean_connection(List *node_discard, const char *database, const char *user_name)
 		ListCell *lc;

 		if ((database && strcmp(database, databasePool->database)) ||
-			(user_name && strcmp(user_name, databasePool->user_name)))
+			(user_name && strcmp(user_name, databasePool->user_name)))
 		{
 			/* The pool does not match to request, skip */
 			databasePool = databasePool->next;
@@ -2789,12 +3377,12 @@ clean_connection(List *node_discard, const char *database, const char *user_name)
 			/* Check if connections are in use */
 			if (nodePool->freeSize < nodePool->size)
 			{
-				elog(WARNING, "Pool of Database %s is using Datanode %u connections",
+				elog(WARNING, "Pool of database %s has connections to node %u in use",
 					 databasePool->database, node);
 				res = CLEAN_CONNECTION_NOT_COMPLETED;
 			}

-			/* Destroy connections currently in Node Pool */
+			/* Destroy unused connections in this node pool */
 			if (nodePool->slot)
 			{
 				int i;
@@ -2806,6 +3394,7 @@ clean_connection(List *node_discard, const char *database, const char *user_name)
 			}
 		}

+		/* XXX Can there be multiple matching database pools? */
 		databasePool = databasePool->next;
 	}

@@ -2815,11 +3404,14 @@ clean_connection(List *node_discard, const char *database, const char *user_name)
 }

 /*
- * Take a Lock on Pooler.
- * Abort PIDs registered with the agents for the given database.
- * Send back to client list of PIDs signaled to watch them.
+ * abort_pids
+ *		Abort backends associated with agents for the given database/user.
+ *
+ * Ignores the current backend (otherwise it might cancel itself), and
+ * returns an array of the PIDs that were actually signalled, so that
+ * the client can watch them. The number of PIDs is returned in 'len'.
 */
-int *
+static int *
 abort_pids(int *len, int pid, const char *database, const char *user_name)
 {
 	int *pids = NULL;
@@ -2858,7 +3450,7 @@ abort_pids(int *len, int pid, const char *database, const char *user_name)
 }

 /*
- *
+ * Request shutdown of the pooler.
 */
 static void
 pooler_die(SIGNAL_ARGS)
@@ -2868,7 +3460,7 @@ pooler_die(SIGNAL_ARGS)

 /*
- *
+ * Request quick (immediate) shutdown of the pooler.
 */
 static void
 pooler_quickdie(SIGNAL_ARGS)
@@ -2877,7 +3469,9 @@ pooler_quickdie(SIGNAL_ARGS)
 	exit(2);
 }

-
+/*
+ * Note that the pooler received a SIGHUP signal.
+ */
 static void
 pooler_sighup(SIGNAL_ARGS)
 {
@@ -2885,8 +3479,13 @@ pooler_sighup(SIGNAL_ARGS)
 }

 /*
- * Given node identifier, dbname and user name build connection string.
- * Get node connection details from the shared memory node table
+ * build_node_conn_str
+ *		Construct a connection string for the specified node.
+ *
+ * Given a node OID and a pool (which includes the dbname and username
+ * strings), build the node connection string.
+ *
+ * May return NULL, e.g. if the node has been deleted.
 */
 static char *
 build_node_conn_str(Oid node, DatabasePool *dbPool)
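What build_node_conn_str produces is an ordinary libpq keyword/value string. A minimal sketch of that shape follows; make_conn_str and all values are hypothetical, and the real code additionally pulls host/port from the shared-memory node table, escapes values, and appends the pool's pgoptions:

    #include <stdio.h>

    /*
     * Build a libpq keyword/value connection string into buf.
     * Sketch only: no escaping of values, no pgoptions handling.
     */
    static int
    make_conn_str(char *buf, size_t buflen, const char *host, int port,
                  const char *dbname, const char *user)
    {
        return snprintf(buf, buflen, "host=%s port=%d dbname=%s user=%s",
                        host, port, dbname, user);
    }

A result such as "host=10.1.2.3 port=15432 dbname=db1 user=app" is then suitable for opening the connection (presumably via PQconnectdb, as the PQerrorMessage calls in the grow_pool hunk suggest).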
@@ -2914,10 +3513,13 @@
 }

 /*
- * Check all pooled connections, and close which have been released more then
- * PooledConnKeepAlive seconds ago.
- * Return true if shrink operation closed all the connections and pool can be
- * ddestroyed, false if there are still connections or pool is in use.
+ * shrink_pool
+ *		Close connections unused for more than PoolConnKeepAlive seconds.
+ *
+ * Returns true if the shrink operation closed all the connections and
+ * the whole database pool can be destroyed, false if there are still
+ * open connections (in at least one node pool) or if the pool is in
+ * use (that is, if there are pool agents still referencing this pool).
 */
 static bool
 shrink_pool(DatabasePool *pool)
@@ -2991,8 +3593,13 @@ shrink_pool(DatabasePool *pool)

 /*
- * Scan connection pools and release connections which are idle for long.
- * If pool gets empty after releasing connections it is destroyed.
+ * pools_maintenance
+ *		Perform regular maintenance of the connection pools.
+ *
+ * Scan connection pools and close connections which have been idle for
+ * too long (longer than PoolConnKeepAlive). If a node pool becomes
+ * empty after releasing idle connections, it is destroyed (but only
+ * if it is not used by any pool agent).
 */
 static void
 pools_maintenance(void)
diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h
index 7ad15c7c6a..13b52e802c 100644
--- a/src/include/pgxc/pgxcnode.h
+++ b/src/include/pgxc/pgxcnode.h
@@ -175,11 +175,9 @@ extern int pgxc_node_send_timestamp(PGXCNodeHandle * handle, TimestampTz timesta
 extern bool pgxc_node_receive(const int conn_count,
 				PGXCNodeHandle ** connections, struct timeval * timeout);
 extern int pgxc_node_read_data(PGXCNodeHandle * conn, bool close_if_error);
-extern int pgxc_node_is_data_enqueued(PGXCNodeHandle *conn);

 extern int send_some(PGXCNodeHandle * handle, int len);
 extern int pgxc_node_flush(PGXCNodeHandle *handle);
-extern void pgxc_node_flush_read(PGXCNodeHandle *handle);

 extern char get_message(PGXCNodeHandle *conn, int *len, char **msg);

@@ -202,4 +200,8 @@ extern bool PgxcNodeDiffBackendHandles(List **nodes_alter,
 				List **nodes_delete, List **nodes_add);
 extern void PgxcNodeRefreshBackendHandlesShmem(List *nodes_alter);
 extern void HandlePoolerMessages(void);
+
+/* Check health of nodes in the connection pool. */
+extern void PoolPingNodeRecheck(Oid nodeoid);
+
 #endif /* PGXCNODE_H */
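The shrink rule the comments above describe can be spelled out concretely. A hedged sketch of the idle sweep, with stand-in types and names (the real shrink_pool also compacts the slot array differently, updates oldest_idle, and coordinates with pools_maintenance):

    #include <time.h>

    typedef struct Slot { time_t released; } Slot;
    typedef struct Pool { Slot *slot[64]; int freeSize; int size; } Pool;

    extern int  pool_conn_keepalive;     /* seconds, like the PoolConnKeepAlive GUC */
    extern void slot_destroy(Slot *slot);

    /* Close free connections that have been idle longer than the keepalive. */
    static void
    pool_shrink(Pool *pool, time_t now)
    {
        int i = 0;

        while (i < pool->freeSize)
        {
            if (now - pool->slot[i]->released > pool_conn_keepalive)
            {
                slot_destroy(pool->slot[i]);

                /* move the last free slot into the hole, keep the array dense */
                pool->slot[i] = pool->slot[--pool->freeSize];
                pool->size--;
            }
            else
                i++;
        }
    }

A pool whose size drops to zero this way becomes a candidate for destruction, which is exactly the true/false contract shrink_pool reports to its caller.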
diff --git a/src/include/pgxc/poolmgr.h b/src/include/pgxc/poolmgr.h
index 47a54c67b2..3c2d1f4eb2 100644
--- a/src/include/pgxc/poolmgr.h
+++ b/src/include/pgxc/poolmgr.h
@@ -1,15 +1,24 @@
 /*-------------------------------------------------------------------------
  *
  * poolmgr.h
- *
- * Definitions for the Datanode connection pool.
+ *		Definitions for the built-in Postgres-XL connection pool.
  *
  *
  * Portions Copyright (c) 2012-2014, TransLattice, Inc.
  * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
  * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
  *
- * src/include/pgxc/poolmgr.h
+ *
+ * XXX Some functions take a list of nodes, others accept an array and
+ * the number of items. We should make this more consistent.
+ *
+ * XXX PoolPingNodes is declared in a number of places, including some
+ * .c files. We should declare it in one place (pgxcnode.h?) and then
+ * include that header wherever needed.
+ *
+ *
+ * IDENTIFICATION
+ *	  src/include/pgxc/poolmgr.h
  *
  *-------------------------------------------------------------------------
 */
@@ -26,7 +35,14 @@

 #define MAX_IDLE_TIME 60

-/* Connection pool entry */
+/*
+ * One connection in the pool (to a datanode or coordinator).
+ *
+ * Essentially a PGconn+PGcancel, so that we can talk to the remote node
+ * and also forward a cancel request if needed.
+ *
+ * XXX rename to PooledConnection.
+ */
 typedef struct
 {
 	time_t		released;
@@ -34,33 +50,55 @@ typedef struct
 	NODE_CANCEL *xc_cancelConn;
 } PGXCNodePoolSlot;

-/* Pool of connections to specified pgxc node */
+/*
+ * Pool of open connections to a single node (datanode or coordinator).
+ *
+ * All the connections share the same connection string, and are tracked
+ * in a simple array of connections.
+ *
+ * XXX rename to NodePool.
+ * XXX not sure if "size" means "valid entries" or "maximum entries".
+ * XXX use FLEXIBLE_ARRAY_MEMBER
+ * XXX or maybe use simple lists of available/free connections instead?
+ */
 typedef struct
 {
-	Oid			nodeoid;	/* Node Oid related to this pool */
-	char	   *connstr;
+	Oid			nodeoid;	/* node Oid related to this pool */
+	char	   *connstr;	/* connection string for all the connections */
 	int			freeSize;	/* available connections */
-	int			size;		/* total pool size */
+	int			size;		/* total pool size (available slots) */
+
+	/* array of open connections (with freeSize available connections) */
 	PGXCNodePoolSlot **slot;
 } PGXCNodePool;

-/* All pools for specified database */
+/*
+ * A group of per-node connection pools (PGXCNodePool), for a particular
+ * database/user combination. We have one PGXCNodePool for each remote
+ * node (datanode or coordinator).
+ *
+ * If there are multiple such combinations (e.g. when there are multiple
+ * users accessing the same database), there will be multiple DatabasePool
+ * entries, organized in a linked list.
+ */
 typedef struct databasepool
 {
 	char	   *database;
 	char	   *user_name;
 	char	   *pgoptions;	/* Connection options */
-	HTAB	   *nodePools;	/* Hashtable of PGXCNodePool, one entry for each
-							 * Coordinator or DataNode */
+	HTAB	   *nodePools;	/* hashtable, one entry per remote node */
 	MemoryContext mcxt;
 	struct databasepool *next;	/* Reference to next to organize linked list */
 	time_t		oldest_idle;
 } DatabasePool;

 /*
- * Agent of client session (Pool Manager side)
- * Acts as a session manager, grouping connections together
- * and managing session parameters
+ * Agent, managing a single client session on the PoolManager side.
+ *
+ * It is responsible for:
+ *
+ * - tracking which connections are assigned to the session
+ * - managing parameters (GUCs) set in the session
 */
 typedef struct
 {
@@ -74,20 +112,17 @@ typedef struct
 	int			num_coord_connections;
 	Oid		   *dn_conn_oids;	/* one for each Datanode */
 	Oid		   *coord_conn_oids;	/* one for each Coordinator */
-	PGXCNodePoolSlot **dn_connections; /* one for each Datanode */
+	PGXCNodePoolSlot **dn_connections;	/* one for each Datanode */
 	PGXCNodePoolSlot **coord_connections; /* one for each Coordinator */
 } PoolAgent;

+
 /*
- * Helper to poll for all pooler sockets
+ * Configuration parameters (GUCs).
 */
-typedef struct pollfd Pollfd;
-
-
 extern int	PoolConnKeepAlive;
 extern int	PoolMaintenanceTimeout;
 extern int	MaxPoolSize;
 extern int	PoolerPort;
-
 extern bool PersistentConnections;

 /* Status inquiry functions */
@@ -97,53 +132,48 @@ extern bool IsPGXCPoolerProcess(void);
 /* Initialize internal structures */
 extern int	PoolManagerInit(void);

-/* Destroy internal structures */
-extern int	PoolManagerDestroy(void);
-
 /*
- * Gracefully close connection to the PoolManager
+ * Gracefully close the PoolManager connection.
 */
 extern void PoolManagerDisconnect(void);

-extern char *session_options(void);
 /*
- * Reconnect to pool manager
- * This simply does a disconnection followed by a reconnection.
+ * Returns the list of options to be propagated to the remote node(s).
 */
-extern void PoolManagerReconnect(void);
+extern char *session_options(void);
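The nodePools hashtable above is what grow_pool consults on every acquisition. Roughly, as a condensed restatement of the poolmgr.c logic (lookup_node_pool is a hypothetical name; this uses the real dynahash API but elides the connection-string and error handling shown in the earlier hunks):

    #include "postgres.h"
    #include "utils/hsearch.h"
    #include "pgxc/poolmgr.h"

    /* Look up the per-node pool, creating an empty entry if it is missing. */
    static PGXCNodePool *
    lookup_node_pool(DatabasePool *dbPool, Oid node)
    {
        bool          found;
        PGXCNodePool *nodePool;

        nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node,
                                                HASH_ENTER, &found);
        if (!found)
        {
            /* fresh entry: no connection string, no slots, no connections */
            nodePool->connstr = NULL;
            nodePool->slot = NULL;
            nodePool->freeSize = 0;
            nodePool->size = 0;
        }

        return nodePool;
    }

Since nodeoid is the first field of PGXCNodePool and also the hash key, dynahash fills it in automatically on HASH_ENTER; only the remaining fields need initialization.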
 */

-/* Get pooled connections */
+/* Get pooled connections to the specified nodes */
 extern int *PoolManagerGetConnections(List *datanodelist, List *coordlist,
 					int **pids);

-/* Clean pool connections */
-extern void PoolManagerCleanConnection(List *datanodelist, List *coordlist, char *dbname, char *username);
+/* Clean connections for the specified nodes (for dbname/user). */
+extern void PoolManagerCleanConnection(List *datanodelist, List *coordlist,
+					char *dbname, char *username);

-/* Check consistency of connection information cached in pooler with catalogs */
+/* Check that connections cached in the connection pool match the catalogs. */
 extern bool PoolManagerCheckConnectionInfo(void);

-/* Reload connection data in pooler and drop all the existing connections of pooler */
+/* Reload connection data in the pooler (and close all existing connections). */
 extern void PoolManagerReloadConnectionInfo(void);

-/* Refresh connection data in pooler and drop connections of altered nodes in pooler */
+/* Reload connection data in the pooler (and close connections to modified nodes). */
 extern int PoolManagerRefreshConnectionInfo(void);

-/* Send Abort signal to transactions being run */
-extern int PoolManagerAbortTransactions(char *dbname, char *username, int **proc_pids);
-
-/* Return connections back to the pool, for both Coordinator and Datanode connections */
+/* Return all connections (for the session) back to the pool. */
 extern void PoolManagerReleaseConnections(bool destroy);

-/* Cancel a running query on Datanodes as well as on other Coordinators */
-extern void PoolManagerCancelQuery(int dn_count, int* dn_list, int co_count, int* co_list);
+/* Send "abort transaction" signal to transactions being run */
+extern int PoolManagerAbortTransactions(char *dbname, char *username,
+					int **proc_pids);

-/* Lock/unlock pool manager */
-extern void PoolManagerLock(bool is_lock);
+/* Cancel a running query on all participating nodes (pg_cancel_backend). */
+extern void PoolManagerCancelQuery(int dn_count, int* dn_list,
+					int co_count, int* co_list);

-/* Do pool health check activity */
+/* Check health of nodes in the connection pool. */
 extern void PoolPingNodes(void);
-extern void PoolPingNodeRecheck(Oid nodeoid);

 extern bool check_persistent_connections(bool *newval, void **extra,
 					GucSource source);
+
 #endif
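Taken together, the sizing rules in this patch reduce to one invariant: open a new connection only when none are free and the per-node cap has not been hit. A final stand-alone sketch (hypothetical names and simplified types; the real grow_pool additionally retries once after releasing idle connections, as its comments explain):

    #include <stddef.h>
    #include <time.h>

    typedef struct Slot { time_t released; } Slot;
    typedef struct Pool { Slot *slot[64]; int freeSize; int size; int maxSize; } Pool;

    extern Slot *open_connection(void);    /* stand-in for connecting via libpq */

    /* Hand out a free connection, opening a new one only when allowed. */
    static Slot *
    pool_acquire(Pool *pool)
    {
        while (pool->freeSize == 0 && pool->size < pool->maxSize)
        {
            Slot *slot = open_connection();

            if (slot == NULL)
                return NULL;             /* connect failed; the real code retries once */

            slot->released = time(NULL);
            pool->slot[pool->freeSize++] = slot;
            pool->size++;
        }

        if (pool->freeSize == 0)
            return NULL;                 /* cap (MaxPoolSize) reached, all in use */

        /* take the most recently released free connection */
        return pool->slot[--pool->freeSize];
    }

Here maxSize plays the role of MaxPoolSize, and the loop mirrors the "while (freeSize == 0 && size < MaxPoolSize)" structure in grow_pool above.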