author	Tomas Vondra	2017-10-22 13:00:06 +0000
committer	Tomas Vondra	2017-11-04 16:19:06 +0000
commit	d9f45c9018ec3ec1fc11e4be2be7f9728a1799b1 (patch)
tree	0fff4f84acb8765159714ad0b404f4927fa8a9a4
parent	cca8700e364c1031eb360de3ec16eba45152e01c (diff)
Comments and cleanup in the connection pool manager
Similarly to a39b06b0c6, this does minor cleanup in the pool manager code by removing unused functions and adding a lot of comments, both at the file level (explaining the concepts and basic API methods) and for individual functions.
-rw-r--r--	src/backend/pgxc/pool/pgxcnode.c	 675
-rw-r--r--	src/backend/pgxc/pool/poolmgr.c	1289
-rw-r--r--	src/include/pgxc/pgxcnode.h	   6
-rw-r--r--	src/include/pgxc/poolmgr.h	 118
4 files changed, 1460 insertions(+), 628 deletions(-)
diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c
index 66b993f53b..a664cc22da 100644
--- a/src/backend/pgxc/pool/pgxcnode.c
+++ b/src/backend/pgxc/pool/pgxcnode.c
@@ -1,9 +1,103 @@
/*-------------------------------------------------------------------------
*
* pgxcnode.c
+ * Functions for communication with nodes through pooled connections.
*
- * Functions for the Coordinator communicating with the PGXC nodes:
- * Datanodes and Coordinators
+ * This is mostly a backend-side counterpart to the pool manager. Each
+ * session acquires connections to remote nodes, and uses them to execute
+ * queries.
+ *
+ * Currently, we only allow a single connection to each remote node. If
+ * a query includes multiple plan nodes that communicate with a given
+ * remote node (e.g. Append with multiple RemoteSubquery children), then
+ * the connection may need to be buffered (see BufferConnection).
+ *
+ * Following is an overview of the basic methods for node management and
+ * communication over the handles.
+ *
+ *
+ * node handle management
+ * ----------------------
+ * get_any_handle - acquire handle for replicated table
+ * get_handles - acquire handles to all specified nodes
+ * get_current_handles - return already acquired handles
+ * release_handles - release all connections (back to pool)
+ *
+ *
+ * connection functions (TODO move to poolmgr.c)
+ * --------------------
+ * PGXCNodeConnect - open libpq connection using connection string
+ * PGXCNodePing - ping node using connection string
+ * PGXCNodeClose - close libpq connection
+ * PGXCNodeConnected - verify connection status
+ * PGXCNodeConnStr - build connection string
+ *
+ *
+ * node handle lookup
+ * ------------------
+ * PGXCNodeGetNodeOid - OID for node by index in handle array
+ * PGXCNodeGetNodeIdFromName - determine index in handle array by name
+ * PGXCNodeGetNodeId - determine index in handle array from OID
+ *
+ *
+ * session/transaction parameters
+ * ------------------------------
+ * PGXCNodeSetParam - add new parameter
+ * PGXCNodeResetParams - reset (local or session) parameters
+ * PGXCNodeGetTransactionParamStr - generate SET with transaction params
+ * PGXCNodeGetSessionParamStr - generate SET with session params
+ *
+ *
+ * low-level TCP buffer access
+ * ---------------------------
+ * pgxc_node_receive - receive data into input buffers for connections
+ * pgxc_node_read_data - read data for one particular connection
+ * get_message - read one complete message from a handle
+ * send_some - send a chunk of data to remote node
+ *
+ *
+ * send higher-level messages to remote node
+ * -----------------------------------------
+ * pgxc_node_send_parse - sends PARSE (part of extended protocol)
+ * pgxc_node_send_bind - sends BIND (part of extended protocol)
+ * pgxc_node_send_describe - sends DESCRIBE (part of extended protocol)
+ * pgxc_node_send_execute - sends EXECUTE (part of extended protocol)
+ * pgxc_node_send_flush - sends FLUSH (part of extended protocol)
+ * pgxc_node_send_close - sends CLOSE (C)
+ * pgxc_node_send_sync - sends SYNC (S)
+ * pgxc_node_send_query - simple query protocol (Q)
+ * pgxc_node_send_rollback - simple query on failed connection (Q)
+ * pgxc_node_send_query_extended - extended query protocol (PARSE, ...)
+ *
+ *
+ * XL-specific messages to remote nodes
+ * ------------------------------------
+ * pgxc_node_send_plan - sends plan to remote node (p)
+ * pgxc_node_send_gxid - sends GXID to remote node (g)
+ * pgxc_node_send_cmd_id - sends CommandId to remote node (M)
+ * pgxc_node_send_snapshot - sends snapshot to remote node (s)
+ * pgxc_node_send_timestamp - sends timestamp to remote node (t)
+ *
+ *
+ * misc functions
+ * --------------
+ * pgxc_node_set_query - send SET by simple protocol, wait for "ready"
+ * pgxc_node_flush - flush all data from the output buffer
+ *
+ *
+ * XXX We should add the custom messages (gxid, snapshot, ...) to the SGML
+ * documentation describing message formats.
+ *
+ * XXX What about using a simple list instead of the arrays? Or define a
+ * new structure grouping all the important parameters (buffer, size,
+ * maxsize).
+ *
+ * XXX The comments claim that dn_handles and co_handles are allocated in
+ * Transaction context, but in fact those are allocated in TopMemoryContext.
+ * Otherwise we wouldn't be able to use persistent connections, which keep
+ * connections open for the whole session.
+ *
+ * XXX The comment at pgxc_node_free mentions TopTransactionContext, so
+ * perhaps we should consider using that?
*
*
* Portions Copyright (c) 2012-2014, TransLattice, Inc.
@@ -11,8 +105,7 @@
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* IDENTIFICATION
- * $$
- *
+ * src/backend/pgxc/pool/pgxcnode.c
*
*-------------------------------------------------------------------------
*/
@@ -31,24 +124,28 @@
#include <string.h>
#include <unistd.h>
#include <errno.h>
+
#include "access/gtm.h"
#include "access/transam.h"
#include "access/xact.h"
#include "access/htup_details.h"
#include "catalog/pg_type.h"
+#include "catalog/pg_collation.h"
+#include "catalog/pgxc_node.h"
#include "commands/prepare.h"
#include "gtm/gtm_c.h"
+#include "miscadmin.h"
#include "nodes/nodes.h"
-#include "pgxc/pgxcnode.h"
#include "pgxc/execRemote.h"
-#include "catalog/pgxc_node.h"
-#include "catalog/pg_collation.h"
#include "pgxc/locator.h"
#include "pgxc/nodemgr.h"
+#include "pgxc/pause.h"
#include "pgxc/pgxc.h"
+#include "pgxc/pgxcnode.h"
#include "pgxc/poolmgr.h"
-#include "tcop/dest.h"
+#include "storage/ipc.h"
#include "storage/lwlock.h"
+#include "tcop/dest.h"
#include "utils/builtins.h"
#include "utils/elog.h"
#include "utils/memutils.h"
@@ -57,14 +154,9 @@
#include "utils/syscache.h"
#include "utils/lsyscache.h"
#include "utils/formatting.h"
+#include "utils/snapmgr.h"
#include "utils/tqual.h"
#include "../interfaces/libpq/libpq-fe.h"
-#ifdef XCP
-#include "miscadmin.h"
-#include "storage/ipc.h"
-#include "pgxc/pause.h"
-#include "utils/snapmgr.h"
-#endif
#define CMD_ID_MSG_LEN 8
@@ -73,35 +165,29 @@ static int datanode_count = 0;
static int coord_count = 0;
/*
- * Datanode handles saved in Transaction memory context
- * when PostgresMain is launched.
- * Those handles are used inside a transaction by Coordinator to Datanodes.
- */
-static PGXCNodeHandle *dn_handles = NULL;
-
-/*
- * Coordinator handles saved in Transaction memory context
- * when PostgresMain is launched.
- * Those handles are used inside a transaction by Coordinator to Coordinators
+ * Datanode and coordinator handles (sockets obtained from the pooler),
+ * initialized in the TopMemoryContext memory context. Those connections
+ * are used during query execution to communicate with the nodes.
+ *
+ * XXX At this point we have only a single connection to each node, and
+ * multiplex it for multiple cursors (see BufferConnection).
*/
-static PGXCNodeHandle *co_handles = NULL;
+static PGXCNodeHandle *dn_handles = NULL; /* datanodes */
+static PGXCNodeHandle *co_handles = NULL; /* coordinators */
-/* Current size of dn_handles and co_handles */
+/* Current number of datanode and coordinator handles. */
int NumDataNodes;
int NumCoords;
-
-#ifdef XCP
volatile bool HandlesInvalidatePending = false;
volatile bool HandlesRefreshPending = false;
/*
- * Session and transaction parameters need to to be set on newly connected
- * remote nodes.
+ * Session/transaction parameters that need to be set on new connections.
*/
static List *session_param_list = NIL;
static List *local_param_list = NIL;
-static StringInfo session_params;
+static StringInfo session_params;
static StringInfo local_params;
typedef struct
@@ -114,14 +200,9 @@ typedef struct
static bool DoInvalidateRemoteHandles(void);
static bool DoRefreshRemoteHandles(void);
-#endif
-#ifdef XCP
static void pgxc_node_init(PGXCNodeHandle *handle, int sock,
bool global_session, int pid);
-#else
-static void pgxc_node_init(PGXCNodeHandle *handle, int sock);
-#endif
static void pgxc_node_free(PGXCNodeHandle *handle);
static void pgxc_node_all_free(void);
@@ -130,7 +211,7 @@ static int get_char(PGXCNodeHandle * conn, char *out);
/*
- * Initialize PGXCNodeHandle struct
+ * Initialize an empty PGXCNodeHandle struct
*/
static void
init_pgxc_handle(PGXCNodeHandle *pgxc_handle)
@@ -165,17 +246,21 @@ init_pgxc_handle(PGXCNodeHandle *pgxc_handle)
/*
- * Allocate and initialize memory to store Datanode and Coordinator handles.
+ * InitMultinodeExecutor
+ * Initialize datanode and coordinator handles.
+ *
+ * Acquires the list of nodes from the node manager, and initializes a
+ * handle for each one.
+ *
+ * Also determines PGXCNodeId, the index into the proper array of handles
+ * (co_handles or dn_handles), depending on the type of this node.
*/
void
InitMultinodeExecutor(bool is_force)
{
int count;
Oid *coOids, *dnOids;
-#ifdef XCP
MemoryContext oldcontext;
-#endif
-
/* Free all the existing information first */
if (is_force)
@@ -192,13 +277,11 @@ InitMultinodeExecutor(bool is_force)
/* Get classified list of node Oids */
PgxcNodeGetOids(&coOids, &dnOids, &NumCoords, &NumDataNodes, true);
-#ifdef XCP
/*
* Coordinator and datanode handles should be available during all the
* session lifetime
*/
oldcontext = MemoryContextSwitchTo(TopMemoryContext);
-#endif
/* Do proper initialization of handles */
if (NumDataNodes > 0)
@@ -244,6 +327,15 @@ InitMultinodeExecutor(bool is_force)
MemoryContextSwitchTo(oldcontext);
+ /*
+ * Determine index of a handle representing this node, either in the
+ * coordinator or datanode handles, depending on the type of this
+ * node. The index gets stored in PGXCNodeId.
+ *
+ * XXX It's a bit confusing that this may point either to co_handles
+ * or dn_handles, and may easily lead to bugs when used with the
+ * incorrect array.
+ */
if (IS_PGXC_COORDINATOR)
{
for (count = 0; count < NumCoords; count++)
@@ -265,7 +357,13 @@ InitMultinodeExecutor(bool is_force)
}
/*
- * Builds up a connection string
+ * PGXCNodeConnStr
+ * Builds a connection string for the provided connection parameters.
+ *
+ * Aside from the usual connection parameters (host, port, ...) we also
+ * pass information about the parent node and the remote node type.
+ *
+ * XXX Shouldn't this rather throw an ERROR instead of returning NULL?
*/
char *
PGXCNodeConnStr(char *host, int port, char *dbname,
@@ -278,6 +376,8 @@ PGXCNodeConnStr(char *host, int port, char *dbname,
/*
* Build up connection string
* remote type can be Coordinator, Datanode or application.
+ *
+ * XXX What's the 'application' remote type?
*/
num = snprintf(connstr, sizeof(connstr),
"host=%s port=%d dbname=%s user=%s application_name='pgxc:%s' sslmode=disable options='-c remotetype=%s -c parentnode=%s %s'",
@@ -299,7 +399,8 @@ PGXCNodeConnStr(char *host, int port, char *dbname,
/*
- * Connect to a Datanode using a connection string
+ * PGXCNodeConnect
+ * Connect to a Datanode using a constructed connection string.
*/
NODE_CONNECTION *
PGXCNodeConnect(char *connstr)
@@ -311,7 +412,12 @@ PGXCNodeConnect(char *connstr)
return (NODE_CONNECTION *) conn;
}
-int PGXCNodePing(const char *connstr)
+/*
+ * PGXCNodePing
+ * Check that a node (identified by the connstring) responds correctly.
+ */
+int
+PGXCNodePing(const char *connstr)
{
if (connstr[0])
{
@@ -326,22 +432,23 @@ int PGXCNodePing(const char *connstr)
}
/*
- * Close specified connection
+ * PGXCNodeClose
+ * Close the connection.
*/
void
PGXCNodeClose(NODE_CONNECTION *conn)
{
- /* Delegate call to the pglib */
+ /* Delegate the call to libpq */
PQfinish((PGconn *) conn);
}
/*
- * Checks if connection active
+ * PGXCNodeConnected
+ * Check if the provided connection is open and valid.
*/
int
PGXCNodeConnected(NODE_CONNECTION *conn)
{
- /* Delegate call to the pglib */
PGconn *pgconn = (PGconn *) conn;
/*
@@ -352,12 +459,13 @@ PGXCNodeConnected(NODE_CONNECTION *conn)
}
-
-/* Close the socket handle (this process' copy) and free occupied memory
+/*
+ * pgxc_node_free
+ * Close the socket handle (local copy) and free occupied memory.
*
- * Note that we do not free the handle and its members. This will be
- * taken care of when the transaction ends, when TopTransactionContext
- * is destroyed in xact.c.
+ * Note that this only closes the socket, but we do not free the handle
+ * and its members. This will be taken care of when the transaction ends,
+ * when TopTransactionContext is destroyed in xact.c.
*/
static void
pgxc_node_free(PGXCNodeHandle *handle)
@@ -368,7 +476,8 @@ pgxc_node_free(PGXCNodeHandle *handle)
}
/*
- * Free all the node handles cached
+ * pgxc_node_all_free
+ * Free all the node handles cached in TopMemoryContext.
*/
static void
pgxc_node_all_free(void)
@@ -410,9 +519,11 @@ pgxc_node_all_free(void)
}
/*
- * Create and initialise internal structure to communicate to
- * Datanode via supplied socket descriptor.
- * Structure stores state info and I/O buffers
+ * pgxc_node_init
+ * Initialize the handle to communicate with the node through the socket.
+ *
+ * Stores the PID of the remote backend and, if requested, sends the
+ * global session string to the remote node.
*/
static void
pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid)
@@ -435,9 +546,10 @@ pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid)
handle->inEnd = 0;
handle->inCursor = 0;
handle->needSync = false;
+
/*
* We got a new connection, set on the remote node the session parameters
- * if defined. The transaction parameter should be sent after BEGIN
+ * if defined. The transaction parameter should be sent after BEGIN.
*/
if (global_session)
{
@@ -451,8 +563,9 @@ pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid)
/*
- * Wait while at least one of specified connections has data available and read
- * the data into the buffer
+ * pgxc_node_receive
+ * Wait while at least one of the connections has data available, and
+ * read the data into the buffer.
*/
bool
pgxc_node_receive(const int conn_count,
@@ -605,28 +718,10 @@ retry:
return NO_ERROR_OCCURED;
}
-/*
- * Is there any data enqueued in the TCP input buffer waiting
- * to be read sent by the PGXC node connection
- */
-
-int
-pgxc_node_is_data_enqueued(PGXCNodeHandle *conn)
-{
- int ret;
- int enqueued;
-
- if (conn->sock < 0)
- return 0;
- ret = ioctl(conn->sock, FIONREAD, &enqueued);
- if (ret != 0)
- return 0;
-
- return enqueued;
-}
/*
- * Read up incoming messages from the PGXC node connection
+ * pgxc_node_read_data
+ * Read incoming data from the node TCP connection.
*/
int
pgxc_node_read_data(PGXCNodeHandle *conn, bool close_if_error)
@@ -769,7 +864,10 @@ retry:
/*
- * Get one character from the connection buffer and advance cursor
+ * Get one character from the connection buffer and advance cursor.
+ *
+ * Returns 0 if enough data is available in the buffer (and the value is
+ * returned in the 'out' parameter). Otherwise the function returns EOF.
*/
static int
get_char(PGXCNodeHandle * conn, char *out)
@@ -783,7 +881,12 @@ get_char(PGXCNodeHandle * conn, char *out)
}
/*
- * Read an integer from the connection buffer and advance cursor
+ * Try reading an integer from the connection buffer and advance cursor.
+ *
+ * Returns 0 if enough data is available in the buffer (and the value is
+ * returned in the 'out' parameter). Otherwise the function returns EOF.
+ *
+ * XXX We only ever call this once with len=4, so simplify the function.
*/
static int
get_int(PGXCNodeHandle *conn, size_t len, int *out)
@@ -791,6 +894,10 @@ get_int(PGXCNodeHandle *conn, size_t len, int *out)
unsigned short tmp2;
unsigned int tmp4;
+ /*
+ * XXX This seems somewhat inconsistent with get_char(). Perhaps this
+ * should use >= to behave in the same way?
+ */
if (conn->inCursor + len > conn->inEnd)
return EOF;
@@ -817,49 +924,70 @@ get_int(PGXCNodeHandle *conn, size_t len, int *out)
/*
* get_message
- * If connection has enough data read entire message from the connection buffer
- * and returns message type. Message data and data length are returned as
- * var parameters.
- * If buffer does not have enough data leaves cursor unchanged, changes
- * connection status to DN_CONNECTION_STATE_QUERY indicating it needs to
- * receive more and returns \0
+ * Attempt to read the whole message from the input buffer, if possible.
+ *
+ * If the entire message is in the input buffer of the connection, reads it
+ * into a buffer (len and msg parameters) and returns the message type.
+ *
+ * If the input buffer does not contain the whole message, the cursor is
+ * left unchanged, the connection status is set to DN_CONNECTION_STATE_QUERY
+ * indicating it needs to receive more data, and \0 is returned (instead of
+ * an actual message type).
+ *
* conn - connection to read from
* len - returned length of the data where msg is pointing to
- * msg - returns pointer to memory in the incoming buffer. The buffer probably
- * will be overwritten upon next receive, so if caller wants to refer it later
- * it should make a copy.
+ * msg - returns pointer to position in the incoming buffer
+ *
+ * The buffer will probably be overwritten upon the next receive, so if
+ * the caller wants to refer to it later it should make a copy.
*/
char
get_message(PGXCNodeHandle *conn, int *len, char **msg)
{
char msgtype;
+ /*
+ * Try reading the first char (message type) and integer (message length).
+ *
+ * Both functions return 0 (false) in case of success, and EOF (true) in
+ * case of failure. So we call get_char() first, and only if it succeeds
+ * the get_int() gets called.
+ */
if (get_char(conn, &msgtype) || get_int(conn, 4, len))
{
- /* Successful get_char would move cursor, restore position */
+ /* Successful get_char/get_int would move cursor, restore position. */
conn->inCursor = conn->inStart;
return '\0';
}
+ /* The message length includes the length header too, so subtract it. */
*len -= 4;
+ /*
+ * If the whole message is not in the buffer, we need to read more data.
+ *
+ * The reading function will discard already consumed data in the buffer
+ * up to conn->inCursor. To avoid extra read/handle cycles we need to fit
+ * the whole message (and not just a part of it) into the buffer. So
+ * let's ensure the buffer is large enough.
+ *
+ * We need 1 byte for the message type, 4 bytes for the message length
+ * and the message itself (the length is currently in *len). The buffer
+ * may already be large enough, in which case ensure_in_buffer_capacity()
+ * will return immediately.
+ */
if (conn->inCursor + *len > conn->inEnd)
{
- /*
- * Not enough data in the buffer, we should read more.
- * Reading function will discard already consumed data in the buffer
- * till conn->inBegin. Then we want the message that is partly in the
- * buffer now has been read completely, to avoid extra read/handle
- * cycles. The space needed is 1 byte for message type, 4 bytes for
- * message length and message itself which size is currently in *len.
- * The buffer may already be large enough, in this case the function
- * ensure_in_buffer_capacity() will immediately return
+ /*
+ * Ensure space for the whole message (including the 5-byte header).
+ *
+ * FIXME Check the return value here; a non-zero value means failure.
+ */
ensure_in_buffer_capacity(5 + (size_t) *len, conn);
conn->inCursor = conn->inStart;
return '\0';
}
+ /* Great, the whole message is in the buffer. */
*msg = conn->inBuffer + conn->inCursor;
conn->inCursor += *len;
conn->inStart = conn->inCursor;
@@ -868,8 +996,8 @@ get_message(PGXCNodeHandle *conn, int *len, char **msg)
/*
- * Release all Datanode and Coordinator connections
- * back to pool and release occupied memory
+ * release_handles
+ * Release all node connections back to pool and free the memory.
*/
void
release_handles(void)
@@ -887,6 +1015,7 @@ release_handles(void)
if (cluster_ex_lock_held)
return;
+ /* quick exit if we have no connections to release */
if (datanode_count == 0 && coord_count == 0)
return;
@@ -917,9 +1046,14 @@ release_handles(void)
}
}
+ /*
+ * XXX Not sure why coordinator connections are only released when on
+ * a coordinator. Perhaps we never acquire connections to coordinators on
+ * datanodes? Seems like a rather minor optimization anyway.
+ */
if (IS_PGXC_COORDINATOR)
{
- /* Collect Coordinator handles */
+ /* Free Coordinator handles */
for (i = 0; i < NumCoords; i++)
{
PGXCNodeHandle *handle = &co_handles[i];
@@ -943,7 +1077,10 @@ release_handles(void)
}
}
- /* And finally release all the connections on pooler */
+ /*
+ * And finally release all the connections held by this backend back
+ * to the connection pool.
+ */
PoolManagerReleaseConnections(destroy);
datanode_count = 0;
@@ -951,15 +1088,20 @@ release_handles(void)
}
/*
- * Ensure that the supplied buffer has enough capacity and if not, it's
- * extended to an appropriate size.
+ * ensure_buffer_capacity
+ * Ensure that the supplied buffer has at least the required capacity.
+ *
+ * currbuf - the currently allocated buffer
+ * currsize - size of the current buffer (in bytes)
+ * bytes_needed - required capacity (in bytes)
+ *
+ * Returns the new buffer if allocated successfully, and sets newsize_p
+ * to the size of the repalloc'ed buffer.
*
- * currbuf is the currently used buffer of currsize. bytes_needed is the
- * minimum size required. We shall return the new buffer, if allocated
- * successfully and set newsize_p to contain the size of the repalloced buffer.
* If allocation fails, NULL is returned.
*
- * The function checks for requests beyond MaxAllocSize and throw an error.
+ * The function checks for requests beyond MaxAllocSize and throws an error
+ * if the request exceeds the limit.
*/
static char *
ensure_buffer_capacity(char *currbuf, size_t currsize, size_t bytes_needed, size_t *newsize_p)
@@ -967,6 +1109,7 @@ ensure_buffer_capacity(char *currbuf, size_t currsize, size_t bytes_needed, size
char *newbuf;
Size newsize = (Size) currsize;
+ /* XXX Perhaps use AllocSizeIsValid instead? */
if (((Size) bytes_needed) >= MaxAllocSize)
ereport(ERROR,
(ENOSPC,
@@ -974,6 +1117,7 @@ ensure_buffer_capacity(char *currbuf, size_t currsize, size_t bytes_needed, size
errdetail("Cannot enlarge buffer containing %ld bytes by %ld more bytes.",
currsize, bytes_needed)));
+ /* if the buffer is already large enough, we're done */
if (bytes_needed <= newsize)
{
*newsize_p = currsize;
@@ -1028,8 +1172,10 @@ ensure_buffer_capacity(char *currbuf, size_t currsize, size_t bytes_needed, size
}
/*
- * Ensure specified amount of data can fit to the incoming buffer and
- * increase it if necessary
+ * ensure_in_buffer_capacity
+ * Ensure specified amount of data can fit into the input buffer of a handle.
+ *
+ * Returns 0 in case of success, EOF otherwise.
*/
int
ensure_in_buffer_capacity(size_t bytes_needed, PGXCNodeHandle *handle)
@@ -1047,8 +1193,10 @@ ensure_in_buffer_capacity(size_t bytes_needed, PGXCNodeHandle *handle)
}
/*
- * Ensure specified amount of data can fit to the outgoing buffer and
- * increase it if necessary
+ * ensure_out_buffer_capacity
+ * Ensure specified amount of data can fit into the output buffer of a handle.
+ *
+ * Returns 0 in case of success, EOF otherwise.
*/
int
ensure_out_buffer_capacity(size_t bytes_needed, PGXCNodeHandle *handle)
@@ -1067,7 +1215,8 @@ ensure_out_buffer_capacity(size_t bytes_needed, PGXCNodeHandle *handle)
/*
- * Send specified amount of data from the outgoing buffer over the connection
+ * send_some
+ * Send specified amount of data from the output buffer over the handle.
*/
int
send_some(PGXCNodeHandle *handle, int len)
@@ -1195,11 +1344,12 @@ send_some(PGXCNodeHandle *handle, int len)
}
/*
- * Send PARSE message with specified statement down to the Datanode
+ * pgxc_node_send_parse
+ * Send PARSE message with specified statement down to the datanode.
*/
int
pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement,
- const char *query, short num_params, Oid *param_types)
+ const char *query, short num_params, Oid *param_types)
{
/* statement name size (allow NULL) */
int stmtLen = statement ? strlen(statement) + 1 : 1;
@@ -1283,7 +1433,8 @@ pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement,
}
/*
- * Send PLAN message down to the Data node
+ * pgxc_node_send_plan
+ * Send PLAN message down to the datanode.
*/
int
pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement,
@@ -1364,7 +1515,8 @@ pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement,
}
/*
- * Send BIND message down to the Datanode
+ * pgxc_node_send_bind
+ * Send BIND message down to the datanode.
*/
int
pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal,
@@ -1446,7 +1598,8 @@ pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal,
/*
- * Send DESCRIBE message (portal or statement) down to the Datanode
+ * pgxc_node_send_describe
+ * Send DESCRIBE message (portal or statement) down to the datanode.
*/
int
pgxc_node_send_describe(PGXCNodeHandle * handle, bool is_statement,
@@ -1494,7 +1647,8 @@ pgxc_node_send_describe(PGXCNodeHandle * handle, bool is_statement,
/*
- * Send CLOSE message (portal or statement) down to the Datanode
+ * pgxc_node_send_close
+ * Send CLOSE message (portal or statement) down to the datanode.
*/
int
pgxc_node_send_close(PGXCNodeHandle * handle, bool is_statement,
@@ -1534,7 +1688,8 @@ pgxc_node_send_close(PGXCNodeHandle * handle, bool is_statement,
}
/*
- * Send EXECUTE message down to the Datanode
+ * pgxc_node_send_execute
+ * Send EXECUTE message down to the datanode.
*/
int
pgxc_node_send_execute(PGXCNodeHandle * handle, const char *portal, int fetch)
@@ -1579,7 +1734,8 @@ pgxc_node_send_execute(PGXCNodeHandle * handle, const char *portal, int fetch)
/*
- * Send FLUSH message down to the Datanode
+ * pgxc_node_send_flush
+ * Send FLUSH message down to the datanode.
*/
int
pgxc_node_send_flush(PGXCNodeHandle * handle)
@@ -1606,7 +1762,8 @@ pgxc_node_send_flush(PGXCNodeHandle * handle)
/*
- * Send SYNC message down to the Datanode
+ * pgxc_node_send_sync
+ * Send SYNC message down to the datanode.
*/
int
pgxc_node_send_sync(PGXCNodeHandle * handle)
@@ -1635,7 +1792,8 @@ pgxc_node_send_sync(PGXCNodeHandle * handle)
/*
- * Send series of Extended Query protocol messages to the data node
+ * pgxc_node_send_query_extended
+ * Send series of Extended Query protocol messages to the datanode.
*/
int
pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *query,
@@ -1664,8 +1822,11 @@ pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *query,
/*
- * This method won't return until connection buffer is empty or error occurs
- * To ensure all data are on the wire before waiting for response
+ * pgxc_node_flush
+ * Flush all data from the output buffer of a node handle.
+ *
+ * This method won't return until the connection buffer is empty or an
+ * error occurs, to ensure all data is on the wire before waiting for a
+ * response.
*/
int
pgxc_node_flush(PGXCNodeHandle *handle)
@@ -1693,39 +1854,10 @@ pgxc_node_flush(PGXCNodeHandle *handle)
return 0;
}
-/*
- * This method won't return until network buffer is empty or error occurs
- * To ensure all data in network buffers is read and wasted
- */
-void
-pgxc_node_flush_read(PGXCNodeHandle *handle)
-{
- bool is_ready;
- int read_result;
-
- if (handle == NULL)
- return;
-
- /*
- * Before reading input send Sync to make sure
- * we will eventually receive ReadyForQuery
- */
- pgxc_node_send_sync(handle);
- while(true)
- {
- read_result = pgxc_node_read_data(handle, false);
- if (read_result < 0)
- break;
-
- is_ready = is_data_node_ready(handle);
- if (is_ready == true)
- break;
-
- }
-}
/*
- * Send specified statement down to the PGXC node
+ * pgxc_node_send_query_internal
+ * Send the statement down to the PGXC node.
*/
static int
pgxc_node_send_query_internal(PGXCNodeHandle * handle, const char *query,
@@ -1768,21 +1900,32 @@ pgxc_node_send_query_internal(PGXCNodeHandle * handle, const char *query,
return pgxc_node_flush(handle);
}
+/*
+ * pgxc_node_send_rollback
+ * Send the rollback command to the remote node.
+ *
+ * XXX The only effect of the "rollback" is that we try sending the query
+ * even on invalid/failed connections (when everything else is prohibited).
+ */
int
pgxc_node_send_rollback(PGXCNodeHandle *handle, const char *query)
{
return pgxc_node_send_query_internal(handle, query, true);
}
+/*
+ * pgxc_node_send_query
+ * Send the query to the remote node.
+ */
int
pgxc_node_send_query(PGXCNodeHandle *handle, const char *query)
{
return pgxc_node_send_query_internal(handle, query, false);
}
-
/*
- * Send the GXID down to the PGXC node
+ * pgxc_node_send_gxid
+ * Send the GXID (global transaction ID) down to the remote node.
*/
int
pgxc_node_send_gxid(PGXCNodeHandle *handle, GlobalTransactionId gxid)
@@ -1812,7 +1955,8 @@ pgxc_node_send_gxid(PGXCNodeHandle *handle, GlobalTransactionId gxid)
}
/*
- * Send the Command ID down to the PGXC node
+ * pgxc_node_send_cmd_id
+ * Send the Command ID down to the remote node.
*/
int
pgxc_node_send_cmd_id(PGXCNodeHandle *handle, CommandId cid)
@@ -1847,7 +1991,8 @@ pgxc_node_send_cmd_id(PGXCNodeHandle *handle, CommandId cid)
}
/*
- * Send the snapshot down to the PGXC node
+ * pgxc_node_send_snapshot
+ * Send the snapshot down to the remote node.
*/
int
pgxc_node_send_snapshot(PGXCNodeHandle *handle, Snapshot snapshot)
@@ -1901,7 +2046,8 @@ pgxc_node_send_snapshot(PGXCNodeHandle *handle, Snapshot snapshot)
}
/*
- * Send the timestamp down to the PGXC node
+ * pgxc_node_send_timestamp
+ * Send the timestamp down to the remote node.
*/
int
pgxc_node_send_timestamp(PGXCNodeHandle *handle, TimestampTz timestamp)
@@ -1947,8 +2093,9 @@ pgxc_node_send_timestamp(PGXCNodeHandle *handle, TimestampTz timestamp)
/*
- * Add another message to the list of errors to be returned back to the client
- * at the convenient time
+ * add_error_message
+ * Add a message to the list of errors to be returned back to the client
+ * at a convenient time.
*/
void
add_error_message(PGXCNodeHandle *handle, const char *message)
@@ -1964,11 +2111,17 @@ add_error_message(PGXCNodeHandle *handle, const char *message)
handle->error = pstrdup(message);
}
+/* index of the last node returned by get_any_handle (round-robin) */
static int load_balancer = 0;
+
/*
- * Get one of the specified nodes to query replicated data source.
- * If session already owns one or more of the requested connection,
- * the function returns existing one to avoid contacting pooler.
+ * get_any_handle
+ * Get one of the specified nodes to query replicated data source.
+ *
+ * If the session already owns one or more of the requested datanode
+ * connections, the function returns one of those existing ones to avoid
+ * unnecessary pooler requests.
+ *
* Performs basic load balancing.
*/
PGXCNodeHandle *
@@ -1998,6 +2151,7 @@ get_any_handle(List *datanodelist)
/* At the moment node is an index in the array, and we may need to wrap it */
if (node >= NumDataNodes)
node -= NumDataNodes;
+
/* See if handle is already used */
if (dn_handles[node].sock != NO_SOCKET)
{
@@ -2079,13 +2233,16 @@ get_any_handle(List *datanodelist)
}
/*
- * for specified list return array of PGXCNodeHandles
- * acquire from pool if needed.
- * the lenth of returned array is the same as of nodelist
- * For Datanodes, Special case is empty or NIL nodeList, in this case return all the nodes.
- * The returned list should be pfree'd when no longer needed.
- * For Coordinator, do not get a connection if Coordinator list is NIL,
- * Coordinator fds is returned only if transaction uses a DDL
+ * get_handles
+ * Return array of node handles (PGXCNodeHandles) for requested nodes.
+ *
+ * If the session does not already hold the handles, they are acquired
+ * from the pool.
+ *
+ * For datanodes, the specified list may be set to NIL, in which case we
+ * return handles for all datanodes.
+ *
+ * For coordinators, we do not acquire any handles when NIL list is used.
+ * Coordinator handles are needed only for transactions performing DDL.
*/
PGXCNodeAllHandles *
get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool is_global_session)
@@ -2360,6 +2517,10 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool
return result;
}
+/*
+ * get_current_handles
+ * Return currently acquired handles.
+ */
PGXCNodeAllHandles *
get_current_handles(void)
{
@@ -2414,7 +2575,10 @@ get_current_handles(void)
return result;
}
-/* Free PGXCNodeAllHandles structure */
+/*
+ * pfree_pgxc_all_handles
+ * Free memory allocated for the PGXCNodeAllHandles structure.
+ */
void
pfree_pgxc_all_handles(PGXCNodeAllHandles *pgxc_handles)
{
@@ -2433,11 +2597,14 @@ pfree_pgxc_all_handles(PGXCNodeAllHandles *pgxc_handles)
/*
* PGXCNodeGetNodeId
- * Look at the data cached for handles and return node position
- * If node type is PGXC_NODE_COORDINATOR look only in coordinator list,
- * if node type is PGXC_NODE_DATANODE look only in datanode list,
- * if other (assume PGXC_NODE_NODE) search both, in last case return actual
- * node type.
+ * Lookup index of the requested node (by OID) in the cached handles.
+ *
+ * Optionally, the node type may be restricted using the second parameter.
+ * If the type is PGXC_NODE_COORDINATOR, we only look in coordinator list.
+ * If the node is PGXC_NODE_DATANODE, we only look in datanode list.
+ *
+ * For other values (assume PGXC_NODE_NONE) we search for both node types,
+ * and then also return the actual node type in the second parameter.
*/
int
PGXCNodeGetNodeId(Oid nodeoid, char *node_type)
@@ -2478,7 +2645,9 @@ PGXCNodeGetNodeId(Oid nodeoid, char *node_type)
/*
* PGXCNodeGetNodeOid
- * Look at the data cached for handles and return node Oid
+ * Look at the data cached for handles and return node Oid.
+ *
+ * XXX Unlike PGXCNodeGetNodeId, this requires a node type parameter.
*/
Oid
PGXCNodeGetNodeOid(int nodeid, char node_type)
@@ -2504,8 +2673,7 @@ PGXCNodeGetNodeOid(int nodeid, char node_type)
/*
* pgxc_node_str
- *
- * get the name of the node
+ * get the name of the current node
*/
Datum
pgxc_node_str(PG_FUNCTION_ARGS)
@@ -2515,7 +2683,7 @@ pgxc_node_str(PG_FUNCTION_ARGS)
/*
* PGXCNodeGetNodeIdFromName
- * Return node position in handles array
+ * Return position of the node (specified by name) in handles array.
*/
int
PGXCNodeGetNodeIdFromName(char *node_name, char *node_type)
@@ -2544,42 +2712,48 @@ PGXCNodeGetNodeIdFromName(char *node_name, char *node_type)
return PGXCNodeGetNodeId(nodeoid, node_type);
}
+/*
+ * paramlist_delete_param
+ * Delete parameter with the specified name from the parameter list.
+ */
static List *
paramlist_delete_param(List *param_list, const char *name)
{
- ListCell *cur_item;
- ListCell *prev_item;
-
- prev_item = NULL;
- cur_item = list_head(param_list);
-
- while (cur_item != NULL)
- {
- ParamEntry *entry = (ParamEntry *) lfirst(cur_item);
-
- if (strcmp(NameStr(entry->name), name) == 0)
- {
- /* cur_item must be removed */
- param_list = list_delete_cell(param_list, cur_item, prev_item);
- pfree(entry);
- if (prev_item)
- cur_item = lnext(prev_item);
- else
- cur_item = list_head(param_list);
- }
- else
- {
- prev_item = cur_item;
- cur_item = lnext(prev_item);
- }
- }
-
- return param_list;
+ ListCell *cur_item;
+ ListCell *prev_item;
+
+ prev_item = NULL;
+ cur_item = list_head(param_list);
+
+ while (cur_item != NULL)
+ {
+ ParamEntry *entry = (ParamEntry *) lfirst(cur_item);
+
+ if (strcmp(NameStr(entry->name), name) == 0)
+ {
+ /* cur_item must be removed */
+ param_list = list_delete_cell(param_list, cur_item, prev_item);
+ pfree(entry);
+ if (prev_item)
+ cur_item = lnext(prev_item);
+ else
+ cur_item = list_head(param_list);
+ }
+ else
+ {
+ prev_item = cur_item;
+ cur_item = lnext(prev_item);
+ }
+ }
+
+ return param_list;
}
/*
- * Remember new value of a session or transaction parameter, and set same
- * values on newly connected remote nodes.
+ * PGXCNodeSetParam
+ * Remember new value of a session/transaction parameter.
+ *
+ * We'll set this parameter value for new connections to remote nodes.
*/
void
PGXCNodeSetParam(bool local, const char *name, const char *value, int flags)
@@ -2617,8 +2791,9 @@ PGXCNodeSetParam(bool local, const char *name, const char *value, int flags)
/*
* Special case for
- * RESET SESSION AUTHORIZATION
- * SET SESSION AUTHORIZATION TO DEFAULT
+ *
+ * RESET SESSION AUTHORIZATION
+ * SET SESSION AUTHORIZATION TO DEFAULT
*
* We must also forget any SET ROLE commands since RESET SESSION
* AUTHORIZATION also resets current role to session default
@@ -2636,8 +2811,8 @@ PGXCNodeSetParam(bool local, const char *name, const char *value, int flags)
/*
- * Forget all parameter values set either for transaction or both transaction
- * and session.
+ * PGXCNodeResetParams
+ * Forget all transaction (or session too) parameters.
*/
void
PGXCNodeResetParams(bool only_local)
@@ -2662,6 +2837,10 @@ PGXCNodeResetParams(bool only_local)
local_params = NULL;
}
+/*
+ * get_set_command
+ * Construct a command setting all parameters from a given list.
+ */
static void
get_set_command(List *param_list, StringInfo command, bool local)
{
@@ -2687,22 +2866,29 @@ get_set_command(List *param_list, StringInfo command, bool local)
/*
- * Returns SET commands needed to initialize remote session.
- * The command may already be biult and valid, return it right away if the case.
- * Otherwise build it up.
- * To support Distributed Session machinery coordinator should generate and
- * send a distributed session identifier to remote nodes. Generate it here.
+ * PGXCNodeGetSessionParamStr
+ * Returns SET commands needed to initialize remote session.
+ *
+ * The SET command may already be built and valid (in the session_params),
+ * in which case we simply return it. Otherwise we build it from the
+ * session parameter list.
+ *
+ * To support "Distributed Session" machinery, the coordinator should
+ * generate and send a distributed session identifier to remote nodes.
+ * Generate it here (simply as nodename_PID).
+ *
+ * We always define a parameter with the PID of the parent process (which is
+ * this backend).
*/
char *
PGXCNodeGetSessionParamStr(void)
{
/*
- * If no session parameters are set and that is a coordinator we need to set
- * global_session anyway, even if there were no other parameters.
- * We do not want this string to disappear, so create it in the
- * TopMemoryContext. However if we add first session parameter we will need
- * to free the buffer and recreate it in the same context as the hash table
- * to avoid memory leakage.
+ * If no session parameters are set and this is a coordinator node, we
+ * need to set global_session anyway, even if there are no other params.
+ *
+ * We do not want this string to simply disappear, so create it in the
+ * TopMemoryContext.
*/
if (session_params == NULL)
{
@@ -2711,7 +2897,7 @@ PGXCNodeGetSessionParamStr(void)
MemoryContextSwitchTo(oldcontext);
}
- /* If the paramstr invalid build it up */
+ /* If the parameter string is empty, build it up. */
if (session_params->len == 0)
{
if (IS_PGXC_COORDINATOR)
@@ -2726,9 +2912,11 @@ PGXCNodeGetSessionParamStr(void)
/*
- * Returns SET commands needed to initialize transaction on a remote session.
- * The command may already be biult and valid, return it right away if the case.
- * Otherwise build it up.
+ * PGXCNodeGetTransactionParamStr
+ * Returns SET commands needed to initialize transaction on a remote node.
+ *
+ * The command may already be built and valid (in local_params StringInfo), in
+ * which case we return it right away. Otherwise build it up.
*/
char *
PGXCNodeGetTransactionParamStr(void)
@@ -2738,7 +2926,7 @@ PGXCNodeGetTransactionParamStr(void)
return NULL;
/*
- * If the paramstr invalid build it up.
+ * If the StringInfo is not allocated yet, do it in TopTransactionContext.
*/
if (local_params == NULL)
{
@@ -2746,25 +2934,30 @@ PGXCNodeGetTransactionParamStr(void)
local_params = makeStringInfo();
MemoryContextSwitchTo(oldcontext);
}
+
/*
- * If parameter string exists it is valid, it is truncated when parameters
- * are modified.
+ * If the parameter string is empty, it was reset in PGXCNodeSetParam. So
+ * recompute it, using the current local_param_list (we know it's not
+ * empty, otherwise we wouldn't get here through the first condition).
*/
if (local_params->len == 0)
{
get_set_command(local_param_list, local_params, true);
}
+
return local_params->len == 0 ? NULL : local_params->data;
}
/*
- * Send down specified query, read and discard all responses until ReadyForQuery
+ * pgxc_node_set_query
+ * Send down specified query, discard all responses until ReadyForQuery.
*/
void
pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query)
{
pgxc_node_send_query(handle, set_query);
+
/*
* Now read responses until ReadyForQuery.
* XXX We may need to handle possible errors here.
diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c
index 140907d872..3722e9e04d 100644
--- a/src/backend/pgxc/pool/poolmgr.c
+++ b/src/backend/pgxc/pool/poolmgr.c
@@ -2,34 +2,217 @@
*
* poolmgr.c
*
- * Connection pool manager handles connections to Datanodes
+ * Connection pool manager handles connections to other nodes.
*
- * The pooler runs as a separate process and is forked off from a
- * Coordinator postmaster. If the Coordinator needs a connection from a
- * Datanode, it asks for one from the pooler, which maintains separate
- * pools for each Datanode. A group of connections can be requested in
- * a single request, and the pooler returns a list of file descriptors
- * to use for the connections.
*
- * Note the current implementation does not yet shrink the pool over time
- * as connections are idle. Also, it does not queue requests; if a
- * connection is unavailable, it will simply fail. This should be implemented
- * one day, although there is a chance for deadlocks. For now, limiting
- * connections should be done between the application and Coordinator.
- * Still, this is useful to avoid having to re-establish connections to the
- * Datanodes all the time for multiple Coordinator backend sessions.
+ * During query execution, nodes in the cluster often need to communicate
+ * with other nodes. This applies both to coordinators (which generally
+ * delegate the query execution to the datanodes) and datanodes (that
+ * may need to exchange data with other datanodes, e.g. to redistribute
+ * one side of a join).
*
- * The term "agent" here refers to a session manager, one for each backend
- * Coordinator connection to the pooler. It will contain a list of connections
- * allocated to a session, at most one per Datanode.
+ * Opening a new connection every time would be very inefficient (and
+ * would quickly become a major bottleneck in OLTP workloads with short
+ * queries/transactions), so XL pools and reuses the connections.
*
+ * The pool manager runs as a separate auxiliary process and is forked
+ * from the postmaster in AuxiliaryProcessMain(), similarly to other
+ * auxiliary processes (checkpointer, bgwriter, ...).
+ *
+ * When a backend needs a connection to another node, it does not open
+ * it on it's own, but instead asks the pool manager. The pool manager
+ * maintains lists of connections for other nodes, so in most cases it
+ * can quickly provide an existing connection.
+ *
+ * Backends often need multiple connections at the same time (unless the
+ * query gets pushed to just a single node), so to reduce the overhead
+ * it's also possible to request multiple connections at once. In that
+ * case the pool manager handles all of them at once, and returns file
+ * descriptors for all the nodes at once.
+ *
+ *
+ * Note: The connection requests are not queued; if a connection is not
+ * available (and can't be opened right away), the request will simply
+ * fail. This should be implemented one day, although there is a chance
+ * for deadlocks. For now, limiting connections should be done between
+ * the application and the coordinator. Still, this is useful to avoid
+ * having to re-establish connections to the datanodes all the time for
+ * multiple coordinator backend sessions.
+ *
+ * XXX Well, we try to do pools_maintenance(), which closes all old idle
+ * connections. But we try to do that only once, to prevent infinite
+ * loops.
+ *
+ * The term "pool agent" here refers to a session manager, one for each
+ * backend accessing the pooler. It manages a list of connections
+ * allocated to a session, at most one per datanode.
+ *
+ *
+ * entities of the pooler
+ * ======================
+ *
+ * This section is an overview of basic entities in the connection pool
+ * implementation. With the exception of PoolManager, all the entities
+ * are represented by a struct.
+ *
+ *
+ * PoolManager
+ * -----------
+ *
+ * - The auxiliary process started by postmaster, managing all requests
+ * from sessions (from backend processes).
+ *
+ * - Requests arrive through PoolHandle (from sessions) and responses
+ * (back to sessions) are sent through PoolAgent.
+ *
+ * PoolHandle
+ * ----------
+ *
+ * - Connection to PoolManager from sessions, i.e. when a session
+ *   needs something from the pool manager (e.g. a new connection), it
+ *   sends a request through the handle (which pretty much
+ * represents a unix socket).
+ *
+ * - Created and initialized in the backend process.
+ *
+ * PoolAgent
+ * ---------
+ *
+ * - Represents a session in the connection pool manager process, and
+ * associates it with a database pool.
+ *
+ * - Tracks open connections to other nodes in the cluster, so that
+ * we can release or close them automatically if needed.
+ *
+ * DatabasePool
+ * ------------
+ *
+ * - A connection pool for a particular database/user combination, or
+ * rather a collection of per-node connection pools, one for each
+ * node in the cluster.
+ *
+ * PGXCNodePool
+ * ------------
+ *
+ * - A pool of connections for a particular node in the cluster, part
+ * of a DatabasePool (i.e. for a database/user combination).
+ *
+ * PGXCNodePoolSlot
+ * ----------------
+ *
+ * - A pooled connection, tracked in PGXCNodePool.
+ *
+ *
+ * interaction with the pooler
+ * ===========================
+ *
+ * When a session needs to open connections to other nodes, this is very
+ * roughly what happens:
+ *
+ * 1) PoolManagerConnect (backend session)
+ *
+ * Initializes connection to the pool manager process (through the
+ * unix socket), so that the session can send messages to the pool.
+ * The connection is represented by "pool handle".
+ *
+ * Note: This is not called directly, but automatically from the
+ *      functions that require a connection to the connection pool.
+ *
+ * 2) agent_create/agent_init (pool manager)
+ *
+ * Accepts the connection from the session, and opens a socket used
+ * to respond to the session (e.g. with pooled connections).
+ *
+ * Initializes the PoolAgent responsible for managing the pooled
+ * connections assigned to this session, and associates it with
+ * a database pool (dbname/user combination).
+ *
+ * 3) PoolManagerGetConnections (backend session)
+ *
+ * Sends a request to the pool manager (through the pool handle).
+ * The pool manager handles this in handle_get_connections(), and
+ * sends back a list of file descriptors (pooled connections).
+ *
+ * 4) PoolManagerReleaseConnections (backend session)
+ *
+ * Sends a request to the pool manager, notifying it that the
+ * connections can be returned to the shared connection pool (or
+ * have to be closed, in case of error).
+ *
+ * The pool manager handles this in agent_release_connections().
+ *
+ * 5) PoolManagerDisconnect (backend session)
+ *
+ * Sends a 'disconnect' message to the pool manager, and resets
+ * the pool handle to NULL (if the session needs more connections,
+ * it'll reconnect and start from scratch).
+ *
+ * The pool manager handles the message by calling agent_destroy(),
+ * which releases all remaining connections associated with the
+ * agent, and then releases all the memory.
+ *
+ *
+ * public connection pool API
+ * ==========================
+ *
+ * The previous section briefly discussed the simplest interaction with
+ * the pool manager. This section provides a more complete overview of
+ * the pooler API, with some additional functions.
+ *
+ * These functions are meant to be used from the backends, and mostly
+ * "only" send requests to the pool manager (through the socket). The
+ * pool manager then processes those requests and does all the work.
+ *
+ * The primary use case (pooling) is handled by two functions:
+ *
+ * - PoolManagerGetConnections acquire connection from the pool
+ * - PoolManagerReleaseConnections release pooled connections back
+ *
+ * To cancel a query or abort a transaction in a distributed database,
+ * we need to forward the cancel/abort requests to all participating
+ * connections (tracked by PoolAgent). This is done by:
+ *
+ * - PoolManagerCancelQuery forward "query cancel"
+ * - PoolManagerAbortTransactions forward "abort transaction"
+ *
+ * The API also includes a number of 'maintenance' functions, which are
+ * useful e.g. when changing the configuration of the cluster.
+ *
+ * - PoolManagerCleanConnection close all unused connections
+ * - PoolManagerCheckConnectionInfo check connection consistency
+ * - PoolManagerRefreshConnectionInfo close mismatching connections
+ * - PoolManagerReloadConnectionInfo close all connections
+ *
+ * There's a number of additional helper functions, but those are mostly
+ * internal and marked as static.
+ *
+ *
+ * XXX Why do we even need a separate connection pool manager? Can't we
+ * simply track the connections in a shared memory, somehow? That should
+ * be fairly simple, and it would remove the need for a separate process
+ * managing requests from all backends, no?
+ *
+ * XXX Apparently there's no "max_db_connections" option, that would
+ * limit the number of connections per node (similarly to what pgbouncer
+ * does for each DB pool, by grouping all per-user connections).
+ *
+ * XXX Make POOL_CHECK_SUCCESS and POOL_CHECK_FAILED an enum.
+ *
+ * XXX Some of the functions expect two separate lists of nodes, one for
+ * datanodes and one for coordinators. Not sure why that is necessary,
+ * and it makes the code more complicated.
+ *
+ * XXX The message types are hard-coded in the various methods as magic
+ * constants (e.g. PoolManagerAbortTransactions uses 'a'). Perhaps
+ * define these somewhere in a clear manner, e.g. as #defines.
*
* Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
+ *
* IDENTIFICATION
- * $$
+ * src/backend/pgxc/pool/poolmgr.c
*
*-------------------------------------------------------------------------
*/
@@ -75,8 +258,7 @@ int PoolConnKeepAlive = 600;
int PoolMaintenanceTimeout = 30;
int MaxPoolSize = 100;
int PoolerPort = 6667;
-
-bool PersistentConnections = false;
+bool PersistentConnections = false;
/* Flag to tell if we are Postgres-XC pooler process */
static bool am_pgxc_pooler = false;
@@ -89,35 +271,66 @@ typedef struct
int port;
} PGXCNodeConnectionInfo;
-/* Handle to the pool manager (Session's side) */
+/* Handle to the pool manager (from each session) */
typedef struct
{
/* communication channel */
PoolPort port;
} PoolHandle;
-/* The root memory context */
+/* The pooler root memory context */
static MemoryContext PoolerMemoryContext = NULL;
-/*
- * Allocations of core objects: Datanode connections, upper level structures,
- * connection strings, etc.
- */
+
+/* Core objects: connections, connection strings, etc. */
static MemoryContext PoolerCoreContext = NULL;
-/*
- * Memory to store Agents
- */
+
+/* Pool Agents */
static MemoryContext PoolerAgentContext = NULL;
-/* Pool to all the databases (linked list) */
+/*
+ * A list of connection pools (one for each db/user combination).
+ *
+ * XXX The DatabasePools are organized in a simple linked list. That may
+ * be an issue with many databases/users, so perhaps we should consider
+ * organizing this in a hash table or something. But for now a linked
+ * list is good enough.
+ */
static DatabasePool *databasePools = NULL;
-/* PoolAgents and the poll array*/
+/*
+ * An array of allocated PoolAgents (one for each session).
+ *
+ * There's a 1:1 mapping between sessions and agents, so the number of
+ * agents is limited by MaxConnections. Also, we can access the agents
+ * directly using MyBackendId, so there's not much point in building a
+ * more complicated structure here (like a hash table for example).
+ *
+ * XXX That however does not happen, because agent_create() simply adds
+ * the agents at the end of the poolAgents array. So PoolerLoop and
+ * agent_destroy have to loop through the agents, etc. Seems expensive.
+ *
+ * XXX We do know that there will never be more than MaxConnections
+ * agents, so we can simply pre-allocate all of them in PoolManagerInit,
+ * and then only flag them as 'used/unused' instead of palloc/pfree.
+ */
static int agentCount = 0;
static PoolAgent **poolAgents;
+/*
+ * A connection to the pool manager (essentially a PQ connection).
+ */
static PoolHandle *poolHandle = NULL;
+/*
+ * PoolManager "lock" flag. The manager runs as a separate process, so
+ * we can use this very simple approach to locking.
+ */
static int is_pool_locked = false;
+
+/*
+ * File descriptor representing the pool manager UNIX socket. Sessions
+ * communicate with the pool manager through this file descriptor.
+ */
static int server_fd = -1;
static int node_info_check(PoolAgent *agent);
@@ -141,10 +354,18 @@ static PGXCNodePoolSlot *acquire_connection(DatabasePool *dbPool, Oid node);
static void agent_release_connections(PoolAgent *agent, bool force_destroy);
static void release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot,
Oid node, bool force_destroy);
+
static void destroy_slot(PGXCNodePoolSlot *slot);
-static PGXCNodePool *grow_pool(DatabasePool *dbPool, Oid node);
static void destroy_node_pool(PGXCNodePool *node_pool);
+
+static PGXCNodePool *grow_pool(DatabasePool *dbPool, Oid node);
+static bool shrink_pool(DatabasePool *pool);
+static void pools_maintenance(void);
+
static void PoolerLoop(void);
+static void PoolManagerConnect(const char *database, const char *user_name,
+ const char *pgoptions);
+
static int clean_connection(List *node_discard,
const char *database,
const char *user_name);
@@ -153,14 +374,12 @@ static int *abort_pids(int *count,
const char *database,
const char *user_name);
static char *build_node_conn_str(Oid node, DatabasePool *dbPool);
+
/* Signal handlers */
static void pooler_die(SIGNAL_ARGS);
static void pooler_quickdie(SIGNAL_ARGS);
-static void PoolManagerConnect(const char *database, const char *user_name,
- const char *pgoptions);
static void pooler_sighup(SIGNAL_ARGS);
-static bool shrink_pool(DatabasePool *pool);
-static void pools_maintenance(void);
+
static void TryPingUnhealthyNode(Oid nodeoid);
/*
@@ -182,7 +401,7 @@ IsPGXCPoolerProcess(void)
}
/*
- * Initialize internal structures
+ * Initialize internal PoolManager structures.
*/
int
PoolManagerInit()
@@ -208,7 +427,8 @@ PoolManagerInit()
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
- ForgetLockFiles();
+ /* XXX Not sure what this is ... */
+ ForgetLockFiles();
/*
* Properly accept or ignore signals the postmaster might send us
@@ -230,6 +450,7 @@ PoolManagerInit()
/* Allocate pooler structures in the Pooler context */
MemoryContextSwitchTo(PoolerMemoryContext);
+ /* Allocate pool agents, one for each connection (session). */
poolAgents = (PoolAgent **) palloc(MaxConnections * sizeof(PoolAgent *));
if (poolAgents == NULL)
{
@@ -244,7 +465,11 @@ PoolManagerInit()
/*
- * Check connection info consistency with system catalogs
+ * node_info_check
+ * Check that connection info is consistent with system catalogs.
+ *
+ * Returns POOL_CHECK_SUCCESS when all the information (number of nodes,
+ * node OIDs and connection strings) matches, POOL_CHECK_FAILED otherwise.
*/
static int
node_info_check(PoolAgent *agent)
@@ -258,8 +483,9 @@ node_info_check(PoolAgent *agent)
int numDn;
/*
- * First check if agent's node information matches to current content of the
- * shared memory table.
+ * First check if agent's node information (number of node OIDs and
+ * the OID values) matches the current contents of the shared memory
+ * table (with authoritative node information).
*/
PgxcNodeGetOids(&coOids, &dnOids, &numCo, &numDn, false);
@@ -274,8 +500,14 @@ node_info_check(PoolAgent *agent)
pfree(dnOids);
/*
- * Iterate over all dbnode pools and check if connection strings
- * are matching node definitions.
+ * Iterate over all database pools and check if connection strings
+ * (in all node pools) match node definitions from node catalog.
+ *
+ * XXX Does this behave correctly with multiple database pools? We
+ * remember which nodes were already checked in a 'checked' list,
+ * so that we check each node just once. But doesn't that mean we
+ * only really check the first DatabasePool and fail to check the
+ * following ones?
*/
while (res == POOL_CHECK_SUCCESS && dbPool)
{
@@ -287,22 +519,30 @@ node_info_check(PoolAgent *agent)
{
char *connstr_chk;
- /* No need to check same Datanode twice */
+ /* No need to check same node twice */
if (list_member_oid(checked, nodePool->nodeoid))
continue;
+
checked = lappend_oid(checked, nodePool->nodeoid);
connstr_chk = build_node_conn_str(nodePool->nodeoid, dbPool);
if (connstr_chk == NULL)
{
/* Problem of constructing connection string */
+ ereport(INFO,
+ (errmsg("failed to construct connection string for node %d",
+ nodePool->nodeoid)));
hash_seq_term(&hseq_status);
res = POOL_CHECK_FAILED;
break;
}
+
/* return error if there is difference */
if (strcmp(connstr_chk, nodePool->connstr))
{
+ ereport(INFO,
+ (errmsg("mismatching connection string for node %d ('%s' != '%s')",
+ nodePool->nodeoid, nodePool->connstr, connstr_chk)));
pfree(connstr_chk);
hash_seq_term(&hseq_status);
res = POOL_CHECK_FAILED;
@@ -313,29 +553,21 @@ node_info_check(PoolAgent *agent)
}
dbPool = dbPool->next;
}
+
list_free(checked);
return res;
}
/*
- * Destroy internal structures
- */
-int
-PoolManagerDestroy(void)
-{
- int status = 0;
-
- if (PoolerMemoryContext)
- {
- MemoryContextDelete(PoolerMemoryContext);
- PoolerMemoryContext = NULL;
- }
-
- return status;
-}
-
-/*
- * Connect to the pooler process
+ * GetPoolManagerHandle
+ * Connect to pool manager (through a UNIX socket).
+ *
+ * We know the pooler always runs on the same system (as it's just an
+ * auxiliary process forked from postmaster), so we only support UNIX
+ * sockets.
+ *
+ * XXX Perhaps this should fail at compile time when HAVE_UNIX_SOCKETS
+ * is not defined?
*/
static void
GetPoolManagerHandle(void)
@@ -343,8 +575,8 @@ GetPoolManagerHandle(void)
PoolHandle *handle;
int fdsock = -1;
+ /* do nothing if a session is already connected to pool manager */
if (poolHandle)
- /* already connected */
return;
#ifdef HAVE_UNIX_SOCKETS
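The elided block follows the usual UNIX-socket client sequence; roughly like this (a sketch only, with sock_path standing in for the actual pooler socket path):

    struct sockaddr_un remote;

    fdsock = socket(AF_UNIX, SOCK_STREAM, 0);
    memset(&remote, 0, sizeof(remote));
    remote.sun_family = AF_UNIX;
    strlcpy(remote.sun_path, sock_path, sizeof(remote.sun_path));

    if (connect(fdsock, (struct sockaddr *) &remote, sizeof(remote)) < 0)
        fdsock = -1;    /* reported by the error message below */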
@@ -409,10 +641,11 @@ GetPoolManagerHandle(void)
(errmsg("failed to connect to pool manager: %m")));
/*
- * Allocate handle
+ * Allocate the handle
+ *
+ * XXX We may change malloc to palloc here, but first ensure that
+ * the CurrentMemoryContext is set properly.
*
- * XXX we may change malloc here to palloc but first ensure
- * the CurrentMemoryContext is properly set.
* The handle allocated just before new session is forked off and
* inherited by the session process. It should remain valid for all
* the session lifetime.
@@ -432,7 +665,12 @@ GetPoolManagerHandle(void)
}
/*
- * Create agent
+ * agent_create
+ * Create a PoolAgent for a new session.
+ *
+ * PoolAgent represents the session within pool manager process. So when
+ * the session wants to communicate with the pool manager, it sends the
+ * data through PoolHandle, and pool manager responds through PoolAgent.
*/
static void
agent_create(void)
@@ -493,21 +731,22 @@ agent_create(void)
/*
* session_options
- * Returns the pgoptions string generated using a particular
- * list of parameters that are required to be propagated to Datanodes.
- * These parameters then become default values for the pooler sessions.
+ *		Generates a pgoptions string to propagate to the other nodes.
+ *
+ * These parameters then become default values for the pooled sessions.
* For e.g., a psql user sets PGDATESTYLE. This value should be set
* as the default connection parameter in the pooler session that is
- * connected to the Datanodes. There are various parameters which need to
- * be analysed individually to determine whether these should be set on
- * Datanodes.
+ * connected to the other nodes.
*
- * Note: These parameters values are the default values of the particular
- * Coordinator backend session, and not the new values set by SET command.
+ * There are various parameters which need to be analysed individually
+ * to determine whether these should be tracked and propagated.
*
+ * Note: These parameter values are the default values of each backend
+ * session, and not the new values set by SET command. We simply get
+ * the default value using GetConfigOptionResetString().
*/
-
-char *session_options(void)
+char *
+session_options(void)
{
int i;
char *pgoptions[] = {"DateStyle", "timezone", "geqo", "intervalstyle", "lc_monetary"};
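A sketch of how such a string can be assembled from the reset values; the actual function may differ in formatting and quoting details:

    StringInfoData buf;
    int            i;

    initStringInfo(&buf);
    for (i = 0; i < lengthof(pgoptions); i++)
        appendStringInfo(&buf, " -c %s=%s",
                         pgoptions[i],
                         GetConfigOptionResetString(pgoptions[i]));
    /* buf.data now holds e.g. " -c DateStyle=... -c timezone=..." */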
@@ -547,8 +786,20 @@ char *session_options(void)
/*
- * Associate session with specified database and respective connection pool
- * Invoked from Session process
+ * PoolManagerConnect
+ * Connect session to a pool manager.
+ *
+ * Used from a backend to open a connection to the pool manager. The
+ * backends do not call this directly, though - it's called automatically
+ * from functions that need to communicate with the pool manager.
+ *
+ * Opens a communication channel by acquiring a "pool manager handle"
+ * (which opens a two-way connection through a UNIX socket), and then
+ * sends enough information (particularly dbname and username) to lookup
+ * the right connection pool.
+ *
+ * This only sends the message to the pool manager, but does not wait
+ * for response.
*/
static void
PoolManagerConnect(const char *database, const char *user_name,
@@ -561,7 +812,7 @@ PoolManagerConnect(const char *database, const char *user_name,
int pgoptionslen = strlen(pgoptions);
char atchar = ' ';
- /* Connect to the pooler process if not yet connected */
+	/* Make sure we're connected to the pool manager process. */
GetPoolManagerHandle();
if (poolHandle == NULL)
ereport(ERROR,
@@ -573,9 +824,10 @@ PoolManagerConnect(const char *database, const char *user_name,
/*
* Special handling for db_user_namespace=on
+ *
* We need to handle per-db users and global users. The per-db users will
* arrive with @dbname and global users just as username. Handle both of
- * them appropriately
+ * them appropriately.
*/
if (strcmp(GetConfigOption("db_user_namespace", false, false), "on") == 0)
{
@@ -623,6 +875,7 @@ PoolManagerConnect(const char *database, const char *user_name,
}
else
pool_putbytes(&poolHandle->port, user_name, unamelen);
+
pool_putbytes(&poolHandle->port, "\0", 1);
/* Length of pgoptions string */
@@ -636,54 +889,11 @@ PoolManagerConnect(const char *database, const char *user_name,
}
/*
- * Reconnect to pool manager
- * It simply does a disconnection and a reconnection.
- */
-void
-PoolManagerReconnect(void)
-{
- elog(DEBUG1, "Reconnecting to PoolManager");
-
- /* Connected, disconnect */
- if (poolHandle)
- PoolManagerDisconnect();
-
- PoolManagerConnect(get_database_name(MyDatabaseId), GetClusterUserName(),
- session_options());
-}
-
-/*
- * Lock/unlock pool manager
- * During locking, the only operations not permitted are abort, connection and
- * connection obtention.
- */
-void
-PoolManagerLock(bool is_lock)
-{
- char msgtype = 'o';
- int n32;
- int msglen = 8;
- if (poolHandle == NULL)
- PoolManagerConnect(get_database_name(MyDatabaseId),
- GetClusterUserName(), "");
-
- elog(DEBUG1, "Locking PoolManager");
-
- /* Message type */
- pool_putbytes(&poolHandle->port, &msgtype, 1);
-
- /* Message length */
- n32 = htonl(msglen);
- pool_putbytes(&poolHandle->port, (char *) &n32, 4);
-
- /* Lock information */
- n32 = htonl((int) is_lock);
- pool_putbytes(&poolHandle->port, (char *) &n32, 4);
- pool_flush(&poolHandle->port);
-}
-
-/*
- * Init PoolAgent
+ * agent_init
+ * Initialize a PoolAgent instance (allocate memory, etc.).
+ *
+ * Allocates memory for coordinator and datanode connections (in the
+ * per-agent memory context), and links it to the correct database pool.
*/
static void
agent_init(PoolAgent *agent, const char *database, const char *user_name,
@@ -695,6 +905,9 @@ agent_init(PoolAgent *agent, const char *database, const char *user_name,
Assert(database);
Assert(user_name);
+ elog(DEBUG1, "Initializing PoolAgent (user_name %s, database %s, "
+ "pgoptions %s", user_name, database, pgoptions);
+
/* disconnect if we are still connected */
if (agent->pool)
agent_release_connections(agent, false);
@@ -709,20 +922,34 @@ agent_init(PoolAgent *agent, const char *database, const char *user_name,
palloc0(agent->num_coord_connections * sizeof(PGXCNodePoolSlot *));
agent->dn_connections = (PGXCNodePoolSlot **)
palloc0(agent->num_dn_connections * sizeof(PGXCNodePoolSlot *));
- /* find database */
+
+ /* find the right database pool */
agent->pool = find_database_pool(database, user_name, pgoptions);
/* create if not found */
if (agent->pool == NULL)
agent->pool = create_database_pool(database, user_name, pgoptions);
+ Assert(agent->pool);
+
MemoryContextSwitchTo(oldcontext);
return;
}
/*
- * Destroy PoolAgent
+ * agent_destroy
+ * Close remaining connections, release agent's memory.
+ *
+ * Under normal conditions, all connections managed by the agent should
+ * have been closed by this point. If there are some connections still
+ * associated with the agent, something must have gone wrong (error),
+ * in which case we have no idea in what state the connections are and
+ * we have no reliable / cheap way to find out. So just close them.
+ *
+ * XXX This is one of the places where we have to loop through the array
+ * of agents to find the "current" one. Seems expensive, especially when
+ * there are many short-lived sessions (as typical in OLTP).
*/
static void
agent_destroy(PoolAgent *agent)
@@ -733,17 +960,17 @@ agent_destroy(PoolAgent *agent)
close(Socket(agent->port));
- /* Discard connections if any remaining */
+ /*
+ * Release all connections the session might be still holding.
+ *
+ * If the session is disconnecting while still holding some open
+ * connections, we have no idea if those connections are clean
+ * or not. So force destroying them.
+ */
if (agent->pool)
- {
- /*
- * If session is disconnecting while there are active connections
- * we can not know if they clean or not, so force destroy them
- */
agent_release_connections(agent, true);
- }
- /* find agent in the list */
+ /* Remove the agent from the poolAgents array. */
for (i = 0; i < agentCount; i++)
{
if (poolAgents[i] == agent)
@@ -762,8 +989,13 @@ agent_destroy(PoolAgent *agent)
}
/*
- * Ping an UNHEALTHY node and if it succeeds, update SHARED node
- * information
+ * TryPingUnhealthyNode
+ * Try pinging a node marked as unhealthy, and update shared info.
+ *
+ * Try pinging a node previously marked as UNHEALTHY, and if it succeeds
+ * then update the SHARED node information (marking it as healthy).
+ *
+ * XXX Perhaps this should track the timestamp of the last attempted ping?
*/
static void
TryPingUnhealthyNode(Oid nodeoid)
@@ -773,6 +1005,7 @@ TryPingUnhealthyNode(Oid nodeoid)
char connstr[MAXPGPATH * 2 + 256];
nodeDef = PgxcNodeGetDefinition(nodeoid);
+
if (nodeDef == NULL)
{
/* No such definition, node dropped? */
@@ -780,6 +1013,8 @@ TryPingUnhealthyNode(Oid nodeoid)
" skipping health check", nodeoid);
return;
}
+
+ /* XXX This fails to release the nodeDef, which is a memory leak. */
if (nodeDef->nodeishealthy)
{
/* hmm, can this happen? */
@@ -790,9 +1025,11 @@ TryPingUnhealthyNode(Oid nodeoid)
elog(LOG, "node (%s:%u) down! Trying ping",
NameStr(nodeDef->nodename), nodeoid);
+
sprintf(connstr,
"host=%s port=%d", NameStr(nodeDef->nodehost),
nodeDef->nodeport);
+
status = PGXCNodePing(connstr);
if (status != 0)
{
@@ -813,8 +1050,10 @@ TryPingUnhealthyNode(Oid nodeoid)
}
/*
- * Check if a node is indeed down and if it is update its UNHEALTHY
- * status
+ * PoolPingNodeRecheck
+ * Check if a node is down, and if it is then mark it as UNHEALTHY.
+ *
+ * XXX Move to pgxcnode.c (as static), it's not used anywhere else.
*/
void
PoolPingNodeRecheck(Oid nodeoid)
@@ -858,7 +1097,11 @@ PoolPingNodeRecheck(Oid nodeoid)
}
/*
- * Ping UNHEALTHY nodes as part of the maintenance window
+ * PoolPingNodes
+ * Ping nodes currently marked as UNHEALTHY.
+ *
+ * XXX Perhaps we should fetch only the unhealthy nodes, instead of
+ * fetching everything and then looping over them.
*/
void
PoolPingNodes()
@@ -875,7 +1118,7 @@ PoolPingNodes()
coHealthMap, dnHealthMap);
/*
- * Find unhealthy datanodes and try to re-ping them
+ * Find unhealthy datanodes and try to re-ping them.
*/
for (i = 0; i < numDn; i++)
{
@@ -885,8 +1128,9 @@ PoolPingNodes()
TryPingUnhealthyNode(nodeoid);
}
}
+
/*
- * Find unhealthy coordinators and try to re-ping them
+ * Find unhealthy coordinators and try to re-ping them.
*/
for (i = 0; i < numCo; i++)
{
@@ -898,8 +1142,18 @@ PoolPingNodes()
}
}
+/***********************************************************************
+ * Communication with a pool manager (sending messages through socket).
+ **********************************************************************/
+
+
/*
- * Release handle to pool manager
+ * PoolManagerDisconnect
+ * Close connection to the pool manager and reset it to NULL.
+ *
+ * When everything goes well, the session notifies the pool manager by
+ * sending an exit message ('d'), closes the port and releases all
+ * memory associated with it.
*/
void
PoolManagerDisconnect(void)
@@ -917,7 +1171,12 @@ PoolManagerDisconnect(void)
/*
- * Get pooled connections
+ * PoolManagerGetConnections
+ * Acquire connections for requested nodes, along with their PIDs.
+ *
+ * Acquires pooled connections for the specified nodes, and returns an
+ * array of file descriptors, representing connections to the nodes.
+ * It also provides an array of PIDs of the backends (on the remote nodes).
*/
int *
PoolManagerGetConnections(List *datanodelist, List *coordlist, int **pids)
@@ -926,20 +1185,27 @@ PoolManagerGetConnections(List *datanodelist, List *coordlist, int **pids)
ListCell *nodelist_item;
int *fds;
int totlen = list_length(datanodelist) + list_length(coordlist);
- int nodes[totlen + 2];
+ int nodes[totlen + 2]; /* node OIDs + two node counts */
+ /* Make sure we're connected to the pool manager. */
if (poolHandle == NULL)
PoolManagerConnect(get_database_name(MyDatabaseId),
GetClusterUserName(), session_options());
/*
- * Prepare end send message to pool manager.
- * First with Datanode list.
- * This list can be NULL for a query that does not need
- * Datanode Connections (Sequence DDLs)
+ * Prepare a message we send to the pool manager. We build it in the
+ * nodes array, as all the fields are int-sized.
+ *
+ * - number of datanodes
+ * - datanode OIDs
+ * - number of coordinators
+ * - coordinator OIDs
+ *
+	 * The datanode list may be empty when the query does not need to talk
+ * to datanodes (e.g. sequence DDL).
*/
- nodes[0] = htonl(list_length(datanodelist));
- i = 1;
+ i = 0;
+ nodes[i++] = htonl(list_length(datanodelist));
if (list_length(datanodelist) != 0)
{
foreach(nodelist_item, datanodelist)
@@ -947,7 +1213,11 @@ PoolManagerGetConnections(List *datanodelist, List *coordlist, int **pids)
nodes[i++] = htonl(lfirst_int(nodelist_item));
}
}
- /* Then with Coordinator list (can be nul) */
+
+ /*
+ * Similarly for coordinators, some queries don't need them and in
+ * that case the list may be NULL.
+ */
nodes[i++] = htonl(list_length(coordlist));
if (list_length(coordlist) != 0)
{
@@ -957,10 +1227,14 @@ PoolManagerGetConnections(List *datanodelist, List *coordlist, int **pids)
}
}
+ /*
+ * Send the encoded datanode/coordinator OIDs to the pool manager,
+	 * flush the message and wait for the response.
+ */
pool_putmessage(&poolHandle->port, 'g', (char *) nodes, sizeof(int) * (totlen + 2));
pool_flush(&poolHandle->port);
- /* Receive response */
+ /* Allocate memory for file descriptors (node connections). */
fds = (int *) palloc(sizeof(int) * totlen);
if (fds == NULL)
{
@@ -968,14 +1242,19 @@ PoolManagerGetConnections(List *datanodelist, List *coordlist, int **pids)
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
}
+
+ /* receive file descriptors */
if (pool_recvfds(&poolHandle->port, fds, totlen))
{
+ elog(WARNING, "failed to receive file descriptors for connections");
pfree(fds);
fds = NULL;
}
+ /* receive PIDs for remote backends */
if (pool_recvpids(&poolHandle->port, pids) != totlen)
{
+ elog(WARNING, "failed to receive PIDs of remote backends");
pfree(*pids);
*pids = NULL;
return NULL;
@@ -984,16 +1263,26 @@ PoolManagerGetConnections(List *datanodelist, List *coordlist, int **pids)
return fds;
}
+
/*
- * Abort active transactions using pooler.
- * Take a lock forbidding access to Pooler for new transactions.
+ * PoolManagerAbortTransactions
+ * Abort active transactions on connections in a particular pool.
+ *
+ * Simply send an 'abort' message to the pool manager, which then aborts
+ * in-progress transactions on all connections in a matching DatabasePool
+ * (identified by dbname/username).
+ *
+ * Currently this only happens during CLEAN CONNECTION.
+ *
+ * An array of PIDs on which transactions were aborted is returned
+ * through the proc_pids argument; the number of PIDs is the return value.
*/
int
PoolManagerAbortTransactions(char *dbname, char *username, int **proc_pids)
{
int num_proc_ids = 0;
int n32, msglen;
- char msgtype = 'a';
+ char msgtype = 'a';
int dblen = dbname ? strlen(dbname) + 1 : 0;
int userlen = username ? strlen(username) + 1 : 0;
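A hypothetical caller (e.g. the CLEAN CONNECTION code path) might use it like this (dbname/username values made up):

    char *dbname = "mydb";      /* hypothetical */
    char *username = "myuser";  /* hypothetical */
    int  *pids;
    int   npids;

    npids = PoolManagerAbortTransactions(dbname, username, &pids);
    /* pids[0 .. npids-1] now holds the PIDs to watch */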
@@ -1039,10 +1328,12 @@ PoolManagerAbortTransactions(char *dbname, char *username, int **proc_pids)
/*
- * Clean up Pooled connections
+ * PoolManagerCleanConnection
+ * Performs a cleanup of pooled connections.
*/
void
-PoolManagerCleanConnection(List *datanodelist, List *coordlist, char *dbname, char *username)
+PoolManagerCleanConnection(List *datanodelist, List *coordlist,
+ char *dbname, char *username)
{
int totlen = list_length(datanodelist) + list_length(coordlist);
int nodes[totlen + 2];
@@ -1052,16 +1343,25 @@ PoolManagerCleanConnection(List *datanodelist, List *coordlist, char *dbname, ch
int userlen = username ? strlen(username) + 1 : 0;
int dblen = dbname ? strlen(dbname) + 1 : 0;
- /*
- * New connection may be established to clean connections to
- * specified nodes and databases.
- */
+ /* Make sure we're connected to the pool manager. */
if (poolHandle == NULL)
PoolManagerConnect(get_database_name(MyDatabaseId),
GetClusterUserName(), session_options());
- nodes[0] = htonl(list_length(datanodelist));
- i = 1;
+ /*
+ * Prepare a message we send to the pool manager. We build it in the
+ * nodes array, as all the fields are int-sized.
+ *
+ * - number of datanodes
+ * - datanode OIDs
+ * - number of coordinators
+ * - coordinator OIDs
+ *
+	 * The datanode list may be empty when the query does not need to talk
+ * to datanodes (e.g. sequence DDL).
+ */
+ i = 0;
+ nodes[i++] = htonl(list_length(datanodelist));
if (list_length(datanodelist) != 0)
{
foreach(nodelist_item, datanodelist)
@@ -1069,7 +1369,11 @@ PoolManagerCleanConnection(List *datanodelist, List *coordlist, char *dbname, ch
nodes[i++] = htonl(lfirst_int(nodelist_item));
}
}
- /* Then with Coordinator list (can be nul) */
+
+ /*
+ * Similarly for coordinators, some queries don't need them and in
+ * that case the list may be NULL.
+ */
nodes[i++] = htonl(list_length(coordlist));
if (list_length(coordlist) != 0)
{
@@ -1117,21 +1421,32 @@ PoolManagerCleanConnection(List *datanodelist, List *coordlist, char *dbname, ch
/*
- * Check connection information consistency cached in pooler with catalog information
+ * PoolManagerCheckConnectionInfo
+ * Check that pool manager info is consistent with the node catalog.
+ *
+ * Check that information used by the pool manager (for open connections)
+ * is consistent with the system catalog.
+ *
+ * Returns 'true' when everything seems consistent, and 'false' in case
+ * of some inconsistency.
*/
bool
PoolManagerCheckConnectionInfo(void)
{
int res;
- /*
- * New connection may be established to clean connections to
- * specified nodes and databases.
- */
+ /* Make sure we're connected to the pool manager. */
if (poolHandle == NULL)
PoolManagerConnect(get_database_name(MyDatabaseId),
GetClusterUserName(), session_options());
+
+ /*
+ * The name is a bit misleading, but PgxcNodeListAndCount updates
+	 * information about nodes in shared memory from the system catalog.
+ */
PgxcNodeListAndCount();
+
+ /* Send message to the pool manager and wait for a response. */
pool_putmessage(&poolHandle->port, 'q', NULL, 0);
pool_flush(&poolHandle->port);
@@ -1145,7 +1460,8 @@ PoolManagerCheckConnectionInfo(void)
/*
- * Reload connection data in pooler and drop all the existing connections of pooler
+ * PoolManagerReloadConnectionInfo
+ * Reload connection metadata and close all open connections.
*/
void
PoolManagerReloadConnectionInfo(void)
@@ -1156,11 +1472,14 @@ PoolManagerReloadConnectionInfo(void)
pool_flush(&poolHandle->port);
}
+
/*
- * Refresh connection data in pooler and drop connections for those nodes
- * that have changed. Thus, this operation is less destructive as compared
- * to PoolManagerReloadConnectionInfo and should typically be called when
- * NODE ALTER has been performed
+ * PoolManagerRefreshConnectionInfo
+ * Refresh connection metadata and close stale connections.
+ *
+ * Unlike PoolManagerReloadConnectionInfo, this only closes connections
+ * to nodes where the metadata changed. Thus, this operation is less
+ * destructive, and should typically be called after NODE ALTER.
*/
int
PoolManagerRefreshConnectionInfo(void)
@@ -1180,6 +1499,17 @@ PoolManagerRefreshConnectionInfo(void)
return false;
}
+
+/***********************************************************************
+ * Handling of messages sent to the pool manager (through the socket).
+ **********************************************************************/
+
+/*
+ * handle_abort
+ * Handles 'abort transaction' action.
+ *
+ * The message is built and sent by PoolManagerAbortTransactions.
+ */
static void
handle_abort(PoolAgent * agent, StringInfo s)
{
@@ -1206,6 +1536,15 @@ handle_abort(PoolAgent * agent, StringInfo s)
pfree(pids);
}
+/*
+ * handle_connect
+ *		Initializes a PoolAgent object and associates it with a pool.
+ *
+ * Once the connect is complete, the agent is associated with a database
+ * pool and can provide pooled connections.
+ *
+ * The message is built and sent by PoolManagerConnect.
+ */
static void
handle_connect(PoolAgent * agent, StringInfo s)
{
@@ -1226,14 +1565,19 @@ handle_connect(PoolAgent * agent, StringInfo s)
len = pq_getmsgint(s, 4);
pgoptions = pq_getmsgbytes(s, len);
- /*
- * Coordinator pool is not initialized.
- * With that it would be impossible to create a Database by default.
- */
+ /* Initialize the agent - find the proper DatabasePool, etc. */
agent_init(agent, database, user_name, pgoptions);
+
+ /* XXX Shouldn't this be before the agent_init? */
pq_getmsgend(s);
}
+/*
+ * handle_clean_connection
+ * Handles CLEAN CONNECTION command.
+ *
+ * The message is built and sent by PoolManagerCleanConnection.
+ */
static void
handle_clean_connection(PoolAgent * agent, StringInfo s)
{
@@ -1275,15 +1619,21 @@ handle_clean_connection(PoolAgent * agent, StringInfo s)
pq_getmsgend(s);
- /* Clean up connections here */
+ /* perform the actual connection cleanup */
res = clean_connection(nodelist, database, user_name);
list_free(nodelist);
- /* Send success result */
+ /* send result (success/failure) back */
pool_sendres(&agent->port, res);
}
+/*
+ * handle_get_connections
+ * Acquire pooled connections to the specified nodes.
+ *
+ * The message is built and sent by PoolManagerGetConnections.
+ */
static void
handle_get_connections(PoolAgent * agent, StringInfo s)
{
@@ -1294,22 +1644,26 @@ handle_get_connections(PoolAgent * agent, StringInfo s)
List *coordlist = NIL;
/*
- * Length of message is caused by:
- * - Message header = 4bytes
- * - List of Datanodes = NumPoolDataNodes * 4bytes (max)
- * - List of Coordinators = NumPoolCoords * 4bytes (max)
- * - Number of Datanodes sent = 4bytes
- * - Number of Coordinators sent = 4bytes
- * It is better to send in a same message the list of Co and Dn at the same
- * time, this permits to reduce interactions between postmaster and pooler
+ * The message consists of:
+ *
+ * - Message header = 4B
+ * - Number of Datanodes sent = 4B
+ * - List of Datanodes = NumPoolDataNodes * 4B (max)
+ * - Number of Coordinators sent = 4B
+ * - List of Coordinators = NumPoolCoords * 4B (max)
*/
+
pool_getmessage(&agent->port, s, 4 * agent->num_dn_connections + 4 * agent->num_coord_connections + 12);
+ /* decode the datanode OIDs */
datanodecount = pq_getmsgint(s, 4);
for (i = 0; i < datanodecount; i++)
datanodelist = lappend_int(datanodelist, pq_getmsgint(s, 4));
- /* It is possible that no Coordinators are involved in the transaction */
+ /*
+ * decode the coordinator OIDs (there may be none, if no coordinators
+ * are involved in the transaction)
+ */
coordcount = pq_getmsgint(s, 4);
for (i = 0; i < coordcount; i++)
coordlist = lappend_int(coordlist, pq_getmsgint(s, 4));
@@ -1327,19 +1681,23 @@ handle_get_connections(PoolAgent * agent, StringInfo s)
list_free(datanodelist);
list_free(coordlist);
+ /* Send the file descriptors back, along with the correct count. */
pool_sendfds(&agent->port, fds, fds ? datanodecount + coordcount : 0);
if (fds)
pfree(fds);
- /*
- * Also send the PIDs of the remote backend processes serving
- * these connections
- */
+ /* Also send PIDs of the remote backends serving the connections. */
pool_sendpids(&agent->port, pids, pids ? datanodecount + coordcount : 0);
if (pids)
pfree(pids);
}
+/*
+ * handle_query_cancel
+ * Cancel query executed on connections associated with the agent.
+ *
+ * The message is built and sent by PoolManagerCancelQuery.
+ */
static void
handle_query_cancel(PoolAgent * agent, StringInfo s)
{
@@ -1378,7 +1736,8 @@ handle_query_cancel(PoolAgent * agent, StringInfo s)
}
/*
- * Handle messages to agent
+ * agent_handle_input
+ * Handle messages passed to the pool agent from PoolerLoop().
*/
static void
agent_handle_input(PoolAgent * agent, StringInfo s)
@@ -1500,7 +1859,12 @@ agent_handle_input(PoolAgent * agent, StringInfo s)
}
/*
- * acquire connection
+ * agent_acquire_connections
+ * Acquire connections to specified nodes, associate them with agent.
+ *
+ * Returns an array of file descriptors representing the connections, with
+ * order matching the datanode/coordinator list. Also returns an array
+ * of PIDs of the backends handling those connections (on the remote nodes).
*/
static int *
agent_acquire_connections(PoolAgent *agent, List *datanodelist,
@@ -1526,12 +1890,17 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist,
}
/*
- * Allocate memory
- * File descriptors of Datanodes and Coordinators are saved in the same array,
- * This array will be sent back to the postmaster.
- * It has a length equal to the length of the Datanode list
- * plus the length of the Coordinator list.
- * Datanode fds are saved first, then Coordinator fds are saved.
+ * Allocate memory for the file descriptors and backend PIDs.
+ *
+ * File descriptors of datanodes and coordinators are both saved in
+ * a single array, which is then sent back to the backend. Datanodes
+ * are stored first, coordinators second, and the order matches the
+ * order of input lists.
+ *
+ * And similarly for the PIDs - single array, datanodes first.
+ *
+ * XXX How expensive is it to do the list_length over and over? Maybe
+ * do the count once and then use the value elsewhere?
*/
result = (int *) palloc((list_length(datanodelist) + list_length(coordlist)) * sizeof(int));
if (result == NULL)
@@ -1550,15 +1919,13 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist,
}
/*
- * There are possible memory allocations in the core pooler, we want
- * these allocations in the contect of the database pool
+ * Make sure the results (connections) are allocated in the memory
+ * context for the DatabasePool.
*/
oldcontext = MemoryContextSwitchTo(agent->pool->mcxt);
-
- /* Initialize result */
+ /* first open connections to the datanodes */
i = 0;
- /* Save in array fds of Datanodes first */
foreach(nodelist_item, datanodelist)
{
int node = lfirst_int(nodelist_item);
@@ -1586,6 +1953,8 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist,
* Update newly-acquired slot with session parameters.
* Local parameters are fired only once BEGIN has been launched on
* remote nodes.
+ *
+ * FIXME Perhaps we should be doing something here?
*/
}
@@ -1593,7 +1962,10 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist,
(*pids)[i++] = ((PGconn *) agent->dn_connections[node]->conn)->be_pid;
}
- /* Save then in the array fds for Coordinators */
+ /* make sure we got the expected number of datanode connections */
+ Assert(i == list_length(datanodelist));
+
+ /* and then the coordinators */
foreach(nodelist_item, coordlist)
{
int node = lfirst_int(nodelist_item);
@@ -1620,6 +1992,8 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist,
* Update newly-acquired slot with session parameters.
* Local parameters are fired only once BEGIN has been launched on
* remote nodes.
+ *
+ * FIXME Perhaps we should be doing something here?
*/
}
@@ -1629,11 +2003,15 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist,
MemoryContextSwitchTo(oldcontext);
+ /* make sure we got the expected total number of connections */
+ Assert(i == list_length(datanodelist) + list_length(coordlist));
+
return result;
}
/*
- * Cancel query
+ * cancel_query_on_connections
+ * Cancel query running on connections managed by a PoolAgent.
*/
static int
cancel_query_on_connections(PoolAgent *agent, List *datanodelist, List *coordlist)
@@ -1706,7 +2084,8 @@ cancel_query_on_connections(PoolAgent *agent, List *datanodelist, List *coordlis
}
/*
- * Return connections back to the pool
+ * PoolManagerReleaseConnections
+ * Return all connections back to the pool.
*/
void
PoolManagerReleaseConnections(bool force)
@@ -1715,7 +2094,10 @@ PoolManagerReleaseConnections(bool force)
int n32;
int msglen = 8;
- /* If disconnected from pooler all the connections already released */
+ /*
+ * If disconnected from the pool manager, all the connections were
+ * already released.
+ */
if (!poolHandle)
return;
@@ -1735,7 +2117,8 @@ PoolManagerReleaseConnections(bool force)
}
/*
- * Cancel Query
+ * PoolManagerCancelQuery
+ * Cancel query on all nodes where it's running.
*/
void
PoolManagerCancelQuery(int dn_count, int* dn_list, int co_count, int* co_list)
@@ -1794,7 +2177,10 @@ PoolManagerCancelQuery(int dn_count, int* dn_list, int co_count, int* co_list)
}
/*
- * Release connections for Datanodes and Coordinators
+ * agent_release_connections
+ * Release connections associated with a PoolAgent instance.
*/
static void
agent_release_connections(PoolAgent *agent, bool force_destroy)
@@ -1802,8 +2188,15 @@ agent_release_connections(PoolAgent *agent, bool force_destroy)
MemoryContext oldcontext;
int i;
+ /* If there are no open connections in the agent, we're done. */
if (!agent->dn_connections && !agent->coord_connections)
return;
+
+ /*
+ * In PAUSED cluster (see src/backend/pgxc/cluster/pause.c) we can't
+ * return any connections to the connection pools, we can only close
+ * them, so we require 'force'.
+ */
if (!force_destroy && cluster_ex_lock_held)
{
elog(LOG, "Not releasing connection with cluster lock");
@@ -1811,29 +2204,33 @@ agent_release_connections(PoolAgent *agent, bool force_destroy)
}
/*
- * There are possible memory allocations in the core pooler, we want
- * these allocations in the contect of the database pool
+ * Make sure all allocations happen in the DatabasePool memory context
+ * (and not for example in the main pooler context, which would cause
+	 * memory leaks, or in the caller's context, likely causing crashes).
*/
oldcontext = MemoryContextSwitchTo(agent->pool->mcxt);
/*
- * Remaining connections are assumed to be clean.
- * First clean up for Datanodes
+ * All currently open connections are assumed to be 'clean' so just
+ * return them back to the pool (or close them, with force_destroy).
+ * First the datanodes, then coordinators.
*/
for (i = 0; i < agent->num_dn_connections; i++)
{
PGXCNodePoolSlot *slot = agent->dn_connections[i];
/*
- * Release connection.
+ * Release the connection.
+ *
* If connection has temporary objects on it, destroy connection slot.
*/
if (slot)
release_connection(agent->pool, slot, agent->dn_conn_oids[i], force_destroy);
+
agent->dn_connections[i] = NULL;
elog(DEBUG1, "Released connection to node %d", agent->dn_conn_oids[i]);
}
- /* Then clean up for Coordinator connections */
+
for (i = 0; i < agent->num_coord_connections; i++)
{
PGXCNodePoolSlot *slot = agent->coord_connections[i];
@@ -1844,6 +2241,7 @@ agent_release_connections(PoolAgent *agent, bool force_destroy)
*/
if (slot)
release_connection(agent->pool, slot, agent->coord_conn_oids[i], force_destroy);
+
agent->coord_connections[i] = NULL;
elog(DEBUG1, "Released connection to node %d", agent->coord_conn_oids[i]);
}
@@ -1851,7 +2249,7 @@ agent_release_connections(PoolAgent *agent, bool force_destroy)
/*
* Released connections are now in the pool and we may want to close
* them eventually. Update the oldest_idle value to reflect the latest
- * last access time if not already updated..
+ * last access time if not already updated.
*/
if (!force_destroy && agent->pool->oldest_idle == (time_t) 0)
agent->pool->oldest_idle = time(NULL);
@@ -1859,13 +2257,24 @@ agent_release_connections(PoolAgent *agent, bool force_destroy)
MemoryContextSwitchTo(oldcontext);
}
+
+/***********************************************************************
+ * Pool Management
+ **********************************************************************/
+
/*
- * Create new empty pool for a database.
- * By default Database Pools have a size null so as to avoid interactions
- * between PGXC nodes in the cluster (Co/Co, Dn/Dn and Co/Dn).
- * Pool is increased at the first GET_CONNECTION message received.
- * Returns POOL_OK if operation succeed POOL_FAIL in case of OutOfMemory
- * error and POOL_WEXIST if poll for this database already exist.
+ * create_database_pool
+ * Create new empty pool for a database/user combination.
+ *
+ * We only initialize the database pool and add it to the global list,
+ * but do not try to preallocate any connections. That only happens when
+ * the first request for a connection arrives.
+ *
+ * Returns a pointer to the new DatabasePool in case of success, NULL
+ * when something fails (out of memory, etc.)
+ *
+ * XXX Should we add some protection against duplicate pools? Probably
+ * not really necessary.
*/
static DatabasePool *
create_database_pool(const char *database, const char *user_name, const char *pgoptions)
@@ -1878,14 +2287,18 @@ create_database_pool(const char *database, const char *user_name, const char *pg
elog(DEBUG1, "Creating a connection pool for database %s, user %s,"
" with pgoptions %s", database, user_name, pgoptions);
+ /* create a memory context for the database pool */
dbcontext = AllocSetContextCreate(PoolerCoreContext,
- "DB Context",
+ "Database Pool Context",
ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
+
oldcontext = MemoryContextSwitchTo(dbcontext);
- /* Allocate memory */
+
+ /* Allocate memory (already in the dbpool memory context) */
databasePool = (DatabasePool *) palloc(sizeof(DatabasePool));
+
if (!databasePool)
{
/* out of memory */
@@ -1896,15 +2309,16 @@ create_database_pool(const char *database, const char *user_name, const char *pg
}
databasePool->mcxt = dbcontext;
- /* Copy the database name */
+
+ /* copy the basic details about the pool */
databasePool->database = pstrdup(database);
- /* Copy the user name */
databasePool->user_name = pstrdup(user_name);
- /* Reset the oldest_idle value */
- databasePool->oldest_idle = (time_t) 0;
- /* Copy the pgoptions */
databasePool->pgoptions = pstrdup(pgoptions);
+ /* reset the oldest_idle value */
+ databasePool->oldest_idle = (time_t) 0;
+
+ /* FIXME We should check all the parameters we just copied. */
if (!databasePool->database)
{
/* out of memory */
@@ -1931,7 +2345,7 @@ create_database_pool(const char *database, const char *user_name, const char *pg
MemoryContextSwitchTo(oldcontext);
- /* Insert into the list */
+ /* insert the new database pool into the global list */
insert_database_pool(databasePool);
return databasePool;
@@ -1939,7 +2353,16 @@ create_database_pool(const char *database, const char *user_name, const char *pg
/*
- * Destroy the pool and free memory
+ * destroy_database_pool
+ * Destroy a database pool for a user/dbname combination.
+ *
+ * When a matching database pool exists, we destroy all the node pools
+ * (which closes all the connections), and release the memory context.
+ *
+ * Returns 1 in case of success (when pool exists), 0 when a matching
+ * pool was not found.
+ *
+ * XXX Maybe return true/false instead?
*/
static int
destroy_database_pool(const char *database, const char *user_name)
@@ -1965,19 +2388,28 @@ destroy_database_pool(const char *database, const char *user_name)
MemoryContextDelete(databasePool->mcxt);
return 1;
}
+
+ elog(DEBUG1, "Connection pool for database %s, user %s not found",
+ database, user_name);
+
return 0;
}
/*
- * Insert new database pool to the list
+ * insert_database_pool
+ *		Insert the newly created pool at the head of the global pool list.
*/
static void
insert_database_pool(DatabasePool *databasePool)
{
Assert(databasePool);
- /* Reference existing list or null the tail */
+ /*
+ * Reference existing list or null the tail
+ *
+ * XXX The 'if' seems somewhat unnecessary I guess ...
+ */
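Since the old list head (possibly NULL) simply becomes the new entry's tail, the branch could indeed collapse to something like this sketch:

    databasePool->next = databasePools;    /* NULL when the list is empty */
    databasePools = databasePool;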
if (databasePools)
databasePool->next = databasePools;
else
@@ -1989,7 +2421,10 @@ insert_database_pool(DatabasePool *databasePool)
/*
* reload_database_pools
- * rebuild connection information for all database pools
+ * Rebuild connection information for all database pools.
+ *
+ * Connection information reload applies to all database pools (not
+ * just the one associated with the current pool agent).
*
* A database pool is reloaded as follows for each remote node:
*
@@ -1999,21 +2434,22 @@ insert_database_pool(DatabasePool *databasePool)
* - node pool is deleted if its port or host information is changed.
* Subsequently all its connections are dropped.
*
- * - node pool is kept unchanged with existing connection information
- * is not changed. However its index position in node pool is changed
- * according to the alphabetical order of the node name in new
- * cluster configuration.
+ * - node pool is kept unchanged if the connection information has not
+ *   changed. However, its index position in the node pool changes
+ *   according to the alphabetical order of the node name in the new
+ *   configuration.
*
* Backend sessions are responsible to reconnect to the pooler to update
* their agent with newest connection information.
*
- * The session invocating connection information reload is reconnected
- * and uploaded automatically after database pool reload. Other server
- * sessions are signaled to reconnect to pooler and update their
- * connection information separately.
+ * The session that triggered the connection metadata reload reconnects
+ * automatically after the reload. Other server sessions are signaled
+ * to reconnect to the pooler and update their connection info separately.
*
* During reload process done internally on pooler, pooler is locked
* to forbid new connection requests.
+ *
+ * XXX Where does the locking happen?
+ * XXX Where do we signal the other sessions?
*/
static void
reload_database_pools(PoolAgent *agent)
@@ -2023,26 +2459,29 @@ reload_database_pools(PoolAgent *agent)
elog(DEBUG1, "Reloading database pools");
/*
- * Release node connections if any held. It is not guaranteed client session
- * does the same so don't ever try to return them to pool and reuse
+	 * Release node connections if any are held. It is not guaranteed the
+	 * client session does the same, so we don't ever try to return them
+	 * to the pool for reuse and instead just close them.
*/
agent_release_connections(agent, true);
/* Forget previously allocated node info */
MemoryContextReset(agent->mcxt);
- /* and allocate new */
+ /* And allocate a blank copy. */
PgxcNodeGetOids(&agent->coord_conn_oids, &agent->dn_conn_oids,
- &agent->num_coord_connections, &agent->num_dn_connections, false);
+ &agent->num_coord_connections, &agent->num_dn_connections,
+ false);
agent->coord_connections = (PGXCNodePoolSlot **)
palloc0(agent->num_coord_connections * sizeof(PGXCNodePoolSlot *));
+
agent->dn_connections = (PGXCNodePoolSlot **)
palloc0(agent->num_dn_connections * sizeof(PGXCNodePoolSlot *));
/*
- * Scan the list and destroy any altered pool. They will be recreated
- * upon subsequent connection acquisition.
+ * Scan the list of database pools and destroy any altered pool. The
+ * pools will be recreated upon subsequent connection acquisition.
*/
databasePool = databasePools;
while (databasePool)
@@ -2074,18 +2513,19 @@ reload_database_pools(PoolAgent *agent)
/*
* refresh_database_pools
- * refresh information for all database pools
+ * Refresh information for all database pools.
+ *
+ * Connection information refresh applies to all database pools (not
+ * just the one associated with the current pool agent).
*
- * Connection information refresh concerns all the database pools.
* A database pool is refreshed as follows for each remote node:
*
* - node pool is deleted if its port or host information is changed.
* Subsequently all its connections are dropped.
*
- * If any other type of activity is found, we error out.
- *
- * XXX I don't see any cases that would error out. Isn't the comment
- * simply obsolete?
+ * If any other type of activity is found (e.g. an added or deleted node)
+ * we error out (and return POOL_REFRESH_FAILED). In case of success we
+ * return POOL_REFRESH_SUCCESS.
*/
static int
refresh_database_pools(PoolAgent *agent)
@@ -2117,7 +2557,7 @@ refresh_database_pools(PoolAgent *agent)
/*
* Scan the list and destroy any altered pool. They will be recreated
- * upon subsequent connection acquisition.
+ * automatically upon subsequent connection acquisition.
*/
databasePool = databasePools;
while (res == POOL_REFRESH_SUCCESS && databasePool)
@@ -2132,7 +2572,10 @@ refresh_database_pools(PoolAgent *agent)
/*
* Since we re-checked the numbers above, we should not get
- * the case of an ADDED or a DELETED node here..
+ * the case of an ADDED or a DELETED node here.
+ *
+ * Newly added nodes are detected indirectly (same node count
+ * and no deleted nodes means no added nodes either).
*/
if (connstr_chk == NULL)
{
@@ -2145,10 +2588,10 @@ refresh_database_pools(PoolAgent *agent)
if (strcmp(connstr_chk, nodePool->connstr))
{
elog(LOG, "Found an altered node (%u)", nodePool->nodeoid);
+
/*
- * Node has been altered. First remove
- * all references to this node from ALL the
- * agents before destroying it..
+ * Node has been altered. First remove all references to
+ * this node from ALL the agents before destroying it.
*/
if (!remove_all_agent_references(nodePool->nodeoid))
{
@@ -2156,6 +2599,7 @@ refresh_database_pools(PoolAgent *agent)
break;
}
+ /* And now destroy the node pool. */
destroy_node_pool(nodePool);
hash_search(databasePool->nodePools, &nodePool->nodeoid,
HASH_REMOVE, NULL);
@@ -2167,9 +2611,17 @@ refresh_database_pools(PoolAgent *agent)
databasePool = databasePool->next;
}
+
return res;
}
+/*
+ * remove_all_agent_references
+ * Remove all references to a specified node from all PoolAgents.
+ *
+ * XXX This is yet another place unnecessarily complicated by keeping
+ * datanodes and coordinators separate.
+ */
static bool
remove_all_agent_references(Oid nodeoid)
{
@@ -2177,8 +2629,7 @@ remove_all_agent_references(Oid nodeoid)
bool res = true;
/*
- * Identify if it's a coordinator or datanode first
- * and get its index
+ * Identify if it's a coordinator or datanode first and get its index.
*/
for (i = 1; i <= agentCount; i++)
{
@@ -2228,14 +2679,20 @@ remove_all_agent_references(Oid nodeoid)
}
/*
- * Find pool for specified database and username in the list
+ * find_database_pool
+ * Find a DatabasePool for specified database/username combination.
+ *
+ * Returns a pointer to the database pool if it exists, NULL otherwise.
*/
static DatabasePool *
-find_database_pool(const char *database, const char *user_name, const char *pgoptions)
+find_database_pool(const char *database, const char *user_name,
+ const char *pgoptions)
{
DatabasePool *databasePool;
- /* Scan the list */
+ Assert(database && user_name && pgoptions);
+
+ /* scan the list */
databasePool = databasePools;
while (databasePool)
{
@@ -2243,14 +2700,21 @@ find_database_pool(const char *database, const char *user_name, const char *pgop
strcmp(user_name, databasePool->user_name) == 0 &&
strcmp(pgoptions, databasePool->pgoptions) == 0)
break;
+
databasePool = databasePool->next;
}
+
return databasePool;
}
/*
- * Remove pool for specified database from the list
+ * remove_database_pool
+ * Remove database pool for database/username combination from the list.
+ *
+ * Only removes the pool from the global list, but does not destroy it.
+ * This allows doing additional maintenance on the database pool (e.g.
+ * destroying all the node pools).
*/
static DatabasePool *
remove_database_pool(const char *database, const char *user_name)
@@ -2258,21 +2722,24 @@ remove_database_pool(const char *database, const char *user_name)
DatabasePool *databasePool,
*prev;
+ Assert(database && user_name);
+
/* Scan the list */
databasePool = databasePools;
prev = NULL;
while (databasePool)
{
- /* if match break the loop and return */
+ /* if the pool matches, break the loop */
if (strcmp(database, databasePool->database) == 0 &&
strcmp(user_name, databasePool->user_name) == 0)
break;
+
prev = databasePool;
databasePool = databasePool->next;
}
- /* if found */
+ /* if found a matching pool, remove it from the list */
if (databasePool)
{
@@ -2285,11 +2752,29 @@ remove_database_pool(const char *database, const char *user_name)
databasePool->next = NULL;
}
+ else
+ elog(LOG, "database pool for %s/%s not found",
+ database, user_name);
+
return databasePool;
}
/*
- * Acquire connection
+ * acquire_connection
+ * Acquire connection to a given node from a specified pool.
+ *
+ * The node connection is acquired in one of two ways:
+ *
+ * (a) By reusing a connection already available in the connection pool.
+ *
+ * (b) By opening a fresh connection (when freeSize==0).
+ *
+ * Returns a PGXCNodePoolSlot pointer in case of success, NULL when the
+ * connection can't be obtained.
+ *
+ * Also updates node health information in the shared memory, both in
+ * case of success (healthy) or failure (unhealthy).
*/
static PGXCNodePoolSlot *
acquire_connection(DatabasePool *dbPool, Oid node)
@@ -2298,21 +2783,25 @@ acquire_connection(DatabasePool *dbPool, Oid node)
PGXCNodePoolSlot *slot;
Assert(dbPool);
+ Assert(OidIsValid(node));
- nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node, HASH_FIND,
- NULL);
+ /* see if we have pool for the node */
+ nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node,
+ HASH_FIND, NULL);
/*
- * When a Coordinator pool is initialized by a Coordinator Postmaster,
- * it has a NULL size and is below minimum size that is 1
- * This is to avoid problems of connections between Coordinators
- * when creating or dropping Databases.
+ * If there are no free connections in the node pool, grow it.
+ *
+ * Coordinator pools initialized by a coordinator postmaster are
+ * initially empty. This is to avoid problems of connections between
+ * coordinators when creating or dropping databases.
*/
if (nodePool == NULL || nodePool->freeSize == 0)
nodePool = grow_pool(dbPool, node);
slot = NULL;
- /* Check available connections */
+
+ /* check available connections */
while (nodePool && nodePool->freeSize > 0)
{
int poll_result;
@@ -2323,14 +2812,26 @@ acquire_connection(DatabasePool *dbPool, Oid node)
if (PQsocket((PGconn *) slot->conn) > 0)
{
/*
- * Make sure connection is ok, destroy connection slot if there is a
- * problem.
+ * Check if the connection is ok, destroy the connection
+ * slot if there is a problem.
+ *
+ * XXX Not sure how expensive this is, but perhaps we should
+ * check the connections differently (not in the hot path
+			 * when requesting the connection, where every instruction
+ * makes a difference). This seems particularly pointless
+ * when the connection was just opened by grow_pool().
+ *
+ * XXX Perhaps we can do this only when the connection is
+ * old enough (e.g. using slot->released)?
*/
poll_result = pqReadReady((PGconn *) slot->conn);
+ /* ok, no data - we have a working connection */
if (poll_result == 0)
- break; /* ok, no data */
- else if (poll_result < 0)
+ break;
+
+ /* something went wrong - retry, if possible */
+ if (poll_result < 0)
{
if (errno == EAGAIN || errno == EINTR)
goto retry;
@@ -2346,6 +2847,7 @@ acquire_connection(DatabasePool *dbPool, Oid node)
/* Decrement current max pool size */
(nodePool->size)--;
+
/* Ensure we are not below minimum size */
nodePool = grow_pool(dbPool, node);
}
@@ -2355,8 +2857,8 @@ acquire_connection(DatabasePool *dbPool, Oid node)
elog(WARNING, "can not connect to node %u", node);
/*
- * before returning, also update the shared health
- * status field to indicate that this node is down
+ * Before returning, update the node health status in shared
+ * memory to indicate this node is down.
*/
if (!PgxcNodeUpdateHealth(node, false))
elog(WARNING, "Could not update health status of node %u", node);
@@ -2364,6 +2866,10 @@ acquire_connection(DatabasePool *dbPool, Oid node)
elog(WARNING, "Health map updated to reflect DOWN node (%u)", node);
}
else
+ /*
+ * XXX Is this necessary? Isn't this just another source of latency
+ * in the connection-acquisition path?
+ */
PgxcNodeUpdateHealth(node, true);
return slot;
@@ -2371,7 +2877,13 @@ acquire_connection(DatabasePool *dbPool, Oid node)
/*
- * release connection from specified pool and slot
+ * release_connection
+ * Return a connection to a pool, or close it entirely.
+ *
+ * Release a connection - either return it back to the database pool
+ * (or more precisely to the node pool in that database pool), or force
+ * closing it (necessary for example when the session fails and we are
+ * not sure whether the connection is in consistent state).
*/
static void
release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot,
@@ -2381,40 +2893,61 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot,
Assert(dbPool);
Assert(slot);
+ Assert(OidIsValid(node));
+
+ nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node,
+ HASH_FIND, NULL);
- nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node, HASH_FIND,
- NULL);
+ /*
+ * When the node pool does not exist, the node was probably either
+ * dropped or altered. In both cases the connection is no longer
+ * valid, so just close it.
+ */
if (nodePool == NULL)
{
- /*
- * The node may be altered or dropped.
- * In any case the slot is no longer valid.
- */
+ elog(WARNING, "Node pool (%d) does not exist anymore, closing connection",
+ node);
+
destroy_slot(slot);
return;
}
- /* return or discard */
+ /*
+ * The node pool exists, but we've been asked to forcefully close
+ * the connection, so do as asked.
+ */
-	if (!force_destroy)
+	if (force_destroy)
{
- /* Insert the slot into the array and increase pool size */
- nodePool->slot[(nodePool->freeSize)++] = slot;
- slot->released = time(NULL);
- }
- else
- {
- elog(DEBUG1, "Cleaning up connection from pool %s, closing", nodePool->connstr);
+ elog(DEBUG1, "Cleaning up connection from pool %s (node %d), closing",
+ nodePool->connstr, node);
+
destroy_slot(slot);
+
/* Decrement pool size */
(nodePool->size)--;
+
/* Ensure we are not below minimum size */
grow_pool(dbPool, node);
+
+ return;
}
+
+ /*
+ * Everything peachy, so just insert the connection (slot) into the
+ * array and increase the number of free connections in the pool.
+ * Also note the timestamp when the connection was released.
+ */
+ nodePool->slot[(nodePool->freeSize)++] = slot;
+ slot->released = time(NULL);
}
/*
- * Increase database pool size, create new if does not exist
+ * grow_pool
+ * Increase size of a pool for a particular node if needed.
+ *
+ * If the node pool (for the specified node) does not exist, it will be
+ * created automatically.
*/
static PGXCNodePool *
grow_pool(DatabasePool *dbPool, Oid node)
@@ -2425,10 +2958,18 @@ grow_pool(DatabasePool *dbPool, Oid node)
bool found;
Assert(dbPool);
+ Assert(OidIsValid(node));
+ /* lookup node pool, create it if needed */
nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node,
HASH_ENTER, &found);
+
+ /*
+ * XXX Aren't we calling this even when the connstr already exists?
+ * Seems a bit wasteful, I guess.
+ */
nodePool->connstr = build_node_conn_str(node, dbPool);
+
if (!nodePool->connstr)
{
ereport(ERROR,
@@ -2436,6 +2977,10 @@ grow_pool(DatabasePool *dbPool, Oid node)
errmsg("could not build connection string for node %u", node)));
}
+ /*
+ * XXX Shouldn't this really be called right after the hash_search
+ * (and before we do the build_node_conn_str)?
+ */
if (!found)
{
nodePool->slot = (PGXCNodePoolSlot **) palloc0(MaxPoolSize * sizeof(PGXCNodePoolSlot *));
@@ -2449,6 +2994,11 @@ grow_pool(DatabasePool *dbPool, Oid node)
nodePool->size = 0;
}
+ /*
+ * If there are no free connections, try to create one. But do not
+ * exceed MaxPoolSize, i.e. the maximum number of connections in
+ * a node pool.
+ */
while (nodePool->freeSize == 0 && nodePool->size < MaxPoolSize)
{
PGXCNodePoolSlot *slot;
@@ -2475,16 +3025,22 @@ grow_pool(DatabasePool *dbPool, Oid node)
" connection error (%s)",
nodePool->connstr,
PQerrorMessage((PGconn*) slot->conn))));
+
destroy_slot(slot);
+
/*
- * If we failed to connect probably number of connections on the
- * target node reached max_connections. Try and release idle
- * connections and try again.
- * We do not want to enter endless loop here and run maintenance
- * procedure only once.
- * It is not safe to run the maintenance procedure if no connections
- * from that pool currently in use - the node pool may be destroyed
- * in that case.
+		 * If we failed to connect, probably the number of connections
+		 * on the target node reached max_connections. Release idle
+		 * connections from this node, and retry.
+ *
+ * We do not want to enter endless loop here, so we only try
+ * releasing idle connections once.
+ *
+ * It is not safe to run the maintenance from a pool with no
+ * active connections, as the maintenance might kill the pool.
+ *
+ * XXX Maybe temporarily marking the pool, so that it does not
+ * get removed (pinned=true) would do the trick?
*/
if (tryagain && nodePool->size > nodePool->freeSize)
{
@@ -2497,24 +3053,34 @@ grow_pool(DatabasePool *dbPool, Oid node)
slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn);
slot->released = time(NULL);
+
+ /*
+ * No need to compare the oldest_idle here, as every existing
+ * idle connection is automatically older than the new one. Only
+	 * if there are no other idle connections is this one the oldest.
+ */
if (dbPool->oldest_idle == (time_t) 0)
dbPool->oldest_idle = slot->released;
- /* Insert at the end of the pool */
+	/* Insert the new slot at the end of the node pool. */
nodePool->slot[(nodePool->freeSize)++] = slot;
- /* Increase count of pool size */
+ /* Increase the size of the node pool. */
(nodePool->size)++;
- elog(DEBUG1, "Pooler: increased pool size to %d for pool %s",
+
+ elog(DEBUG1, "Pooler: increased pool size to %d for pool %s (%u)",
nodePool->size,
- nodePool->connstr);
+ nodePool->connstr,
+ node);
}
+
return nodePool;
}
/*
- * Destroy pool slot
+ * destroy_slot
+ * Destroy a connection slot (free cancel info and the slot itself).
*/
static void
destroy_slot(PGXCNodePoolSlot *slot)
@@ -2529,7 +3095,10 @@ destroy_slot(PGXCNodePoolSlot *slot)
/*
- * Destroy node pool
+ * destroy_node_pool
+ * Close any remaining connections to the node and destroy the slots.
+ *
+ * XXX This does not release the node_pool itself. Not sure if correct.
*/
static void
destroy_node_pool(PGXCNodePool *node_pool)
@@ -2546,6 +3115,7 @@ destroy_node_pool(PGXCNodePool *node_pool)
*/
elog(DEBUG1, "About to destroy node pool %s, current size is %d, %d connections are in use",
node_pool->connstr, node_pool->freeSize, node_pool->size - node_pool->freeSize);
+
if (node_pool->connstr)
pfree(node_pool->connstr);
@@ -2553,13 +3123,21 @@ destroy_node_pool(PGXCNodePool *node_pool)
{
for (i = 0; i < node_pool->freeSize; i++)
destroy_slot(node_pool->slot[i]);
+
pfree(node_pool->slot);
}
}
/*
- * Main handling loop
+ * PoolerLoop
+ * Main handling loop of the pool manager.
+ *
+ * Has three main responsibilities:
+ *
+ * - triggering regular pool maintenance
+ * - responding to postmaster events (e.g. shutdown)
+ *  - forwarding messages to pool agents (which handle them)
*/
static void
PoolerLoop(void)
@@ -2725,7 +3303,7 @@ PoolerLoop(void)
/*
* Agent may be removed from the array while processing
* and trailing items are shifted, so scroll downward
- * to avoid problem
+ * to avoid problems.
*/
for (i = agentCount - 1; agentCount > 0 && i >= 0; i--)
{
@@ -2737,6 +3315,7 @@ PoolerLoop(void)
agent_handle_input(agent, &input_message);
}
+ /* New session without an existing agent. */
if (pool_fd[0].revents & POLLIN)
agent_create();
}
@@ -2751,9 +3330,18 @@ PoolerLoop(void)
}
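
To illustrate the three responsibilities listed in the PoolerLoop comment, a stripped-down event loop; the helper names and the shutdown flag are assumptions standing in for the real agent machinery:

    #include <poll.h>

    extern volatile int shutdown_requested; /* set by the signal handlers */
    extern void run_maintenance(void);      /* stands in for pools_maintenance() */
    extern void handle_agent(int idx);      /* stands in for agent_handle_input() */
    extern void accept_agent(void);         /* stands in for agent_create() */

    static void
    pooler_loop_sketch(struct pollfd *fds, int nfds, int maintenance_timeout_ms)
    {
        for (;;)
        {
            int rc = poll(fds, nfds, maintenance_timeout_ms);

            if (shutdown_requested)
                break;              /* respond to postmaster events */

            if (rc == 0)            /* timeout: trigger regular maintenance */
                run_maintenance();

            /* Scan agents backwards: handling a message may remove an
             * agent and shift the trailing entries, as noted above. */
            for (int i = nfds - 1; i >= 1; i--)
                if (fds[i].revents & POLLIN)
                    handle_agent(i);

            /* fds[0] is the listening socket: new session, no agent yet. */
            if (fds[0].revents & POLLIN)
                accept_agent();
        }
    }
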
/*
- * Clean Connection in all Database Pools for given Datanode and Coordinator list
+ * clean_connection
+ *		Clean connections for specified nodes in matching database pools.
+ *
+ * The function closes all unused connections to nodes specified in the
+ * node_discard list, in all database pools for the dbname/username
+ * combination. There may be multiple matching pools, with different
+ * pgoptions values.
+ *
+ * XXX The code handles NULL values in database/username, but it is
+ * not clear whether that's really needed.
*/
-int
+static int
clean_connection(List *node_discard, const char *database, const char *user_name)
{
DatabasePool *databasePool;
@@ -2766,7 +3354,7 @@ clean_connection(List *node_discard, const char *database, const char *user_name
ListCell *lc;
if ((database && strcmp(database, databasePool->database)) ||
- (user_name && strcmp(user_name, databasePool->user_name)))
+ (user_name && strcmp(user_name, databasePool->user_name)))
{
/* The pool does not match to request, skip */
databasePool = databasePool->next;
@@ -2789,12 +3377,12 @@ clean_connection(List *node_discard, const char *database, const char *user_name
/* Check if connections are in use */
if (nodePool->freeSize < nodePool->size)
{
- elog(WARNING, "Pool of Database %s is using Datanode %u connections",
+ elog(WARNING, "Pool of database %s is using node %u connections",
databasePool->database, node);
res = CLEAN_CONNECTION_NOT_COMPLETED;
}
- /* Destroy connections currently in Node Pool */
+ /* Destroy unused connections in this Node Pool */
if (nodePool->slot)
{
int i;
@@ -2806,6 +3394,7 @@ clean_connection(List *node_discard, const char *database, const char *user_name
}
}
+ /* XXX Can there be multiple database pools? */
databasePool = databasePool->next;
}
@@ -2815,11 +3404,14 @@ clean_connection(List *node_discard, const char *database, const char *user_name
}
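
The matching rule behind the XXX above treats a NULL database or username as a wildcard; a minimal sketch of that test:

    #include <string.h>

    /* Return 1 when the pool matches the request; NULL matches anything. */
    static int
    pool_matches(const char *database, const char *user_name,
                 const char *pool_db, const char *pool_user)
    {
        if (database && strcmp(database, pool_db) != 0)
            return 0;
        if (user_name && strcmp(user_name, pool_user) != 0)
            return 0;
        return 1;
    }
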
/*
- * Take a Lock on Pooler.
- * Abort PIDs registered with the agents for the given database.
- * Send back to client list of PIDs signaled to watch them.
+ * abort_pids
+ * Aborts backends associated with agents for a database/user.
+ *
+ * Ignores the current backend (otherwise it might cancel itself), and
+ * returns an array of PIDs that were actually signalled, so that the
+ * client can watch them. The number of PIDs is returned in 'len'.
*/
-int *
+static int *
abort_pids(int *len, int pid, const char *database, const char *user_name)
{
int *pids = NULL;
@@ -2858,7 +3450,7 @@ abort_pids(int *len, int pid, const char *database, const char *user_name)
}
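
A sketch of the contract described in the abort_pids comment. The agent array shape and the use of SIGTERM are assumptions, not a quote of the actual signalling code:

    #include <sys/types.h>
    #include <signal.h>
    #include <stdlib.h>

    /* Signal every backend PID except 'self'; return a malloc'd array of
     * the PIDs actually signalled, with their count stored into *len. */
    static int *
    abort_pids_sketch(int *len, int self, const int *agent_pids, int nagents)
    {
        int *pids = malloc(sizeof(int) * (nagents > 0 ? nagents : 1));

        *len = 0;
        for (int i = 0; i < nagents; i++)
        {
            if (agent_pids[i] == self)
                continue;           /* never signal the requesting backend */
            if (kill(agent_pids[i], SIGTERM) == 0)
                pids[(*len)++] = agent_pids[i];
        }
        return pids;
    }
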
/*
- *
+ * Request shutdown of the pooler.
*/
static void
pooler_die(SIGNAL_ARGS)
@@ -2868,7 +3460,7 @@ pooler_die(SIGNAL_ARGS)
/*
- *
+ * Request quick shutdown of the pooler.
*/
static void
pooler_quickdie(SIGNAL_ARGS)
@@ -2877,7 +3469,9 @@ pooler_quickdie(SIGNAL_ARGS)
exit(2);
}
-
+/*
+ * Note that the pooler received SIGHUP signal.
+ */
static void
pooler_sighup(SIGNAL_ARGS)
{
@@ -2885,8 +3479,13 @@ pooler_sighup(SIGNAL_ARGS)
}
/*
- * Given node identifier, dbname and user name build connection string.
- * Get node connection details from the shared memory node table
+ * build_node_conn_str
+ * Construct a connection string for the specified node.
+ *
+ * Given node OID and pool (which includes dbname and username strings),
+ * build the node connection string.
+ *
+ * May return NULL if the node got deleted, for example.
*/
static char *
build_node_conn_str(Oid node, DatabasePool *dbPool)
@@ -2914,10 +3513,13 @@ build_node_conn_str(Oid node, DatabasePool *dbPool)
}
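
For reference, the conninfo string produced here is of the usual libpq keyword=value form; a hypothetical sketch (the exact keywords the pooler emits are not shown in this hunk):

    #include <stdio.h>

    /* Compose a libpq-style connection string into buf; returns snprintf's
     * result, so the caller can detect truncation. */
    static int
    conn_str_sketch(char *buf, size_t buflen, const char *host, int port,
                    const char *dbname, const char *user)
    {
        return snprintf(buf, buflen, "host=%s port=%d dbname=%s user=%s",
                        host, port, dbname, user);
    }
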
/*
- * Check all pooled connections, and close which have been released more then
- * PooledConnKeepAlive seconds ago.
- * Return true if shrink operation closed all the connections and pool can be
- * ddestroyed, false if there are still connections or pool is in use.
+ * shrink_pool
+ *		Close connections unused for more than PoolConnKeepAlive seconds.
+ *
+ * Returns true if the shrink operation closed all the connections and the
+ * whole database pool can be destroyed, false if there are still open
+ * connections (in at least one node pool) or if the pool is in use
+ * (that is, if there are pool agents still referencing this pool).
*/
static bool
shrink_pool(DatabasePool *pool)
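
A sketch of the expiry-and-compaction step implied by the comment above; the array compaction details are a simplifying assumption:

    #include <time.h>

    typedef struct
    {
        time_t  released;       /* when the connection went idle */
    } idle_slot;

    /* Close idle slots older than keepalive seconds, compacting the
     * array of idle connections; returns the new number of idle slots. */
    static int
    shrink_sketch(idle_slot **slots, int freeSize, int keepalive, time_t now)
    {
        int kept = 0;

        for (int i = 0; i < freeSize; i++)
        {
            if (now - slots[i]->released >= keepalive)
                continue;       /* destroy_slot() would be called here */
            slots[kept++] = slots[i];
        }
        return kept;
    }
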
@@ -2991,8 +3593,13 @@ shrink_pool(DatabasePool *pool)
/*
- * Scan connection pools and release connections which are idle for long.
- * If pool gets empty after releasing connections it is destroyed.
+ * pools_maintenance
+ * Perform regular maintenance of the connection pools.
+ *
+ * Scan connection pools and release connections which have been idle
+ * for too long (longer than PoolConnKeepAlive). If a node pool becomes
+ * empty after releasing idle connections, it is destroyed (but only
+ * if it is not referenced by any pool agent).
*/
static void
pools_maintenance(void)
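
And a sketch of the maintenance pass itself: walk the list of database pools, shrink each, and unlink those reported as fully empty. The unlink-while-iterating idiom is an assumption, not a quote of the code:

    typedef struct dbpool_sketch
    {
        struct dbpool_sketch *next;
    } dbpool_sketch;

    /* Stand-in for shrink_pool(): returns 1 when the pool became empty. */
    extern int shrink_pool_sketch(dbpool_sketch *pool);

    static dbpool_sketch *
    maintenance_sketch(dbpool_sketch *head)
    {
        dbpool_sketch **link = &head;

        while (*link)
        {
            if (shrink_pool_sketch(*link))
                *link = (*link)->next;  /* drop the now-empty pool */
            else
                link = &(*link)->next;
        }
        return head;
    }
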
diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h
index 7ad15c7c6a..13b52e802c 100644
--- a/src/include/pgxc/pgxcnode.h
+++ b/src/include/pgxc/pgxcnode.h
@@ -175,11 +175,9 @@ extern int pgxc_node_send_timestamp(PGXCNodeHandle * handle, TimestampTz timesta
extern bool pgxc_node_receive(const int conn_count,
PGXCNodeHandle ** connections, struct timeval * timeout);
extern int pgxc_node_read_data(PGXCNodeHandle * conn, bool close_if_error);
-extern int pgxc_node_is_data_enqueued(PGXCNodeHandle *conn);
extern int send_some(PGXCNodeHandle * handle, int len);
extern int pgxc_node_flush(PGXCNodeHandle *handle);
-extern void pgxc_node_flush_read(PGXCNodeHandle *handle);
extern char get_message(PGXCNodeHandle *conn, int *len, char **msg);
@@ -202,4 +200,8 @@ extern bool PgxcNodeDiffBackendHandles(List **nodes_alter,
List **nodes_delete, List **nodes_add);
extern void PgxcNodeRefreshBackendHandlesShmem(List *nodes_alter);
extern void HandlePoolerMessages(void);
+
+/* Check health of nodes in the connection pool. */
+extern void PoolPingNodeRecheck(Oid nodeoid);
+
#endif /* PGXCNODE_H */
diff --git a/src/include/pgxc/poolmgr.h b/src/include/pgxc/poolmgr.h
index 47a54c67b2..3c2d1f4eb2 100644
--- a/src/include/pgxc/poolmgr.h
+++ b/src/include/pgxc/poolmgr.h
@@ -1,15 +1,24 @@
/*-------------------------------------------------------------------------
*
* poolmgr.h
- *
- * Definitions for the Datanode connection pool.
+ * Definitions for the built-in Postgres-XL connection pool.
*
*
* Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
- * src/include/pgxc/poolmgr.h
+ *
+ * XXX Some functions take a list of nodes, others accept an array +
+ * nitems. We should make this more consistent.
+ *
+ * XXX PoolPingNodes is declared in a number of places, including some
+ * .c files. We should declare it in one place (pgxcnode.h?) and then
+ * include that header wherever needed.
+ *
+ *
+ * IDENTIFICATION
+ * src/include/pgxc/poolmgr.h
*
*-------------------------------------------------------------------------
*/
@@ -26,7 +35,14 @@
#define MAX_IDLE_TIME 60
-/* Connection pool entry */
+/*
+ * One connection in the pool (to datanode or coordinator).
+ *
+ * Essentially a PGconn+PGcancel, so that we can talk to the remote node
+ * and also forward a cancel request if needed.
+ *
+ * XXX rename to PooledConnection.
+ */
typedef struct
{
time_t released;
@@ -34,33 +50,55 @@ typedef struct
NODE_CANCEL *xc_cancelConn;
} PGXCNodePoolSlot;
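
A slot is essentially a PGconn paired with its PGcancel handle; a minimal libpq sketch of creating such a pair (error reporting elided, simplified struct is an assumption):

    #include <libpq-fe.h>

    typedef struct
    {
        PGconn     *conn;
        PGcancel   *cancel;
    } slot_sketch;

    static int
    slot_init_sketch(slot_sketch *slot, const char *connstr)
    {
        slot->conn = PQconnectdb(connstr);
        if (PQstatus(slot->conn) != CONNECTION_OK)
        {
            PQfinish(slot->conn);
            return 0;
        }
        /* Keep a cancel handle so a query running on this connection can
         * be interrupted later without opening a second connection. */
        slot->cancel = PQgetCancel(slot->conn);
        return 1;
    }
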
-/* Pool of connections to specified pgxc node */
+/*
+ * Pool of open connections to a single node (datanode or coordinator).
+ *
+ * All the connections share the same connection string, and are tracked
+ * in a simple array of connections.
+ *
+ * XXX rename to NodePool.
+ * XXX not sure if "size" means "valid entries" or "maximum entries".
+ * XXX use FLEXIBLE_ARRAY_MEMBER
+ * XXX or maybe use simple lists of available/free connections instead?
+ */
typedef struct
{
- Oid nodeoid; /* Node Oid related to this pool */
- char *connstr;
+ Oid nodeoid; /* node Oid related to this pool */
+ char *connstr; /* connection string for all the connections */
int freeSize; /* available connections */
- int size; /* total pool size */
+ int size; /* total pool size (available slots) */
+
+ /* array of open connections (with freeSize available connections) */
PGXCNodePoolSlot **slot;
} PGXCNodePool;
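
The comments suggest the slot array doubles as a free-list, with the first freeSize entries idle. A sketch of acquire/release under that invariant — an assumption consistent with the code shown earlier, which appends released slots at slot[freeSize]:

    typedef struct
    {
        int     freeSize;   /* idle connections in slot[0..freeSize-1] */
        int     size;       /* all open connections tracked by the pool */
        void  **slot;
    } node_pool_sketch;

    /* Take the newest idle connection, or NULL if the pool must grow. */
    static void *
    acquire_sketch(node_pool_sketch *np)
    {
        if (np->freeSize == 0)
            return NULL;
        return np->slot[--np->freeSize];
    }

    /* Return a connection to the pool; it becomes the newest idle slot. */
    static void
    release_sketch(node_pool_sketch *np, void *conn)
    {
        np->slot[np->freeSize++] = conn;
    }
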
-/* All pools for specified database */
+/*
+ * A group of per-node connection pools (PGXCNodePool), for a particular
+ * database/user combination. We have one PGXCNodePool for each remote
+ * node (datanode or coordinator).
+ *
+ * If there are multiple such combinations (e.g. when there are multiple
+ * users accessing the same database), there will be multiple DatabasePool
+ * entries, organized in a linked list.
+ */
typedef struct databasepool
{
char *database;
char *user_name;
char *pgoptions; /* Connection options */
- HTAB *nodePools; /* Hashtable of PGXCNodePool, one entry for each
- * Coordinator or DataNode */
+ HTAB *nodePools; /* hashtable, one entry per remote node */
MemoryContext mcxt;
struct databasepool *next; /* Reference to next to organize linked list */
time_t oldest_idle;
} DatabasePool;
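
Since pools for different database/user/pgoptions combinations form a linked list, lookup is a linear scan; a minimal sketch (exact string matching on all three fields is an assumption):

    #include <string.h>

    typedef struct dbpool
    {
        char   *database;
        char   *user_name;
        char   *pgoptions;
        struct dbpool *next;
    } dbpool;

    static dbpool *
    find_pool_sketch(dbpool *head, const char *db, const char *user,
                     const char *opts)
    {
        for (dbpool *p = head; p != NULL; p = p->next)
            if (strcmp(p->database, db) == 0 &&
                strcmp(p->user_name, user) == 0 &&
                strcmp(p->pgoptions, opts) == 0)
                return p;
        return NULL;
    }
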
/*
- * Agent of client session (Pool Manager side)
- * Acts as a session manager, grouping connections together
- * and managing session parameters
+ * Agent, managing a single client session on the PoolManager side.
+ *
+ * It is responsible for:
+ *
+ * - tracking which connections are assigned to the session
+ * - managing parameters (GUCs) set in the session
*/
typedef struct
{
@@ -74,20 +112,17 @@ typedef struct
int num_coord_connections;
Oid *dn_conn_oids; /* one for each Datanode */
Oid *coord_conn_oids; /* one for each Coordinator */
- PGXCNodePoolSlot **dn_connections; /* one for each Datanode */
+ PGXCNodePoolSlot **dn_connections; /* one for each Datanode */
PGXCNodePoolSlot **coord_connections; /* one for each Coordinator */
} PoolAgent;
+
/*
- * Helper to poll for all pooler sockets
+ * Configuration parameters (GUCs).
*/
-typedef struct pollfd Pollfd;
-
-
extern int PoolConnKeepAlive;
extern int PoolMaintenanceTimeout;
extern int MaxPoolSize;
extern int PoolerPort;
-
extern bool PersistentConnections;
/* Status inquiry functions */
@@ -97,53 +132,48 @@ extern bool IsPGXCPoolerProcess(void);
/* Initialize internal structures */
extern int PoolManagerInit(void);
-/* Destroy internal structures */
-extern int PoolManagerDestroy(void);
-
/*
- * Gracefully close connection to the PoolManager
+ * Gracefully close the PoolManager connection.
*/
extern void PoolManagerDisconnect(void);
-extern char *session_options(void);
/*
- * Reconnect to pool manager
- * This simply does a disconnection followed by a reconnection.
+ * Returns list of options to be propagated to the remote node(s).
*/
-extern void PoolManagerReconnect(void);
+extern char *session_options(void);
-/* Get pooled connections */
+/* Get pooled connections to specified nodes */
extern int *PoolManagerGetConnections(List *datanodelist, List *coordlist,
int **pids);
-/* Clean pool connections */
-extern void PoolManagerCleanConnection(List *datanodelist, List *coordlist, char *dbname, char *username);
+/* Clean connections for the specified nodes (for dbname/user). */
+extern void PoolManagerCleanConnection(List *datanodelist, List *coordlist,
+ char *dbname, char *username);
-/* Check consistency of connection information cached in pooler with catalogs */
+/* Check that connections cached in the connection pool match the catalogs. */
extern bool PoolManagerCheckConnectionInfo(void);
-/* Reload connection data in pooler and drop all the existing connections of pooler */
+/* Reload connection data in pooler (and close all existing connections). */
extern void PoolManagerReloadConnectionInfo(void);
-/* Refresh connection data in pooler and drop connections of altered nodes in pooler */
+/* Reload connection data in pooler and close connections to modified nodes. */
extern int PoolManagerRefreshConnectionInfo(void);
-/* Send Abort signal to transactions being run */
-extern int PoolManagerAbortTransactions(char *dbname, char *username, int **proc_pids);
-
-/* Return connections back to the pool, for both Coordinator and Datanode connections */
+/* Return all connections (for the session) back to the pool. */
extern void PoolManagerReleaseConnections(bool destroy);
-/* Cancel a running query on Datanodes as well as on other Coordinators */
-extern void PoolManagerCancelQuery(int dn_count, int* dn_list, int co_count, int* co_list);
+/* Send "abort transaction" signal to transactions being run */
+extern int PoolManagerAbortTransactions(char *dbname, char *username,
+ int **proc_pids);
-/* Lock/unlock pool manager */
-extern void PoolManagerLock(bool is_lock);
+/* Cancel a running query on all participating nodes (pg_cancel_backend). */
+extern void PoolManagerCancelQuery(int dn_count, int* dn_list,
+ int co_count, int* co_list);
-/* Do pool health check activity */
+/* Check health of nodes in the connection pool. */
extern void PoolPingNodes(void);
-extern void PoolPingNodeRecheck(Oid nodeoid);
extern bool check_persistent_connections(bool *newval, void **extra,
GucSource source);
+
#endif