You can subscribe to this list here.
2010 |
Jan
|
Feb
|
Mar
|
Apr
(4) |
May
(28) |
Jun
(12) |
Jul
(11) |
Aug
(12) |
Sep
(5) |
Oct
(19) |
Nov
(14) |
Dec
(12) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2011 |
Jan
(18) |
Feb
(30) |
Mar
(115) |
Apr
(89) |
May
(50) |
Jun
(44) |
Jul
(22) |
Aug
(13) |
Sep
(11) |
Oct
(30) |
Nov
(28) |
Dec
(39) |
2012 |
Jan
(38) |
Feb
(18) |
Mar
(43) |
Apr
(91) |
May
(108) |
Jun
(46) |
Jul
(37) |
Aug
(44) |
Sep
(33) |
Oct
(29) |
Nov
(36) |
Dec
(15) |
2013 |
Jan
(35) |
Feb
(611) |
Mar
(5) |
Apr
(55) |
May
(30) |
Jun
(28) |
Jul
(458) |
Aug
(34) |
Sep
(9) |
Oct
(39) |
Nov
(22) |
Dec
(32) |
2014 |
Jan
(16) |
Feb
(16) |
Mar
(42) |
Apr
(179) |
May
(7) |
Jun
(6) |
Jul
(9) |
Aug
|
Sep
(4) |
Oct
|
Nov
(3) |
Dec
|
2015 |
Jan
|
Feb
|
Mar
|
Apr
(2) |
May
(4) |
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
S | M | T | W | T | F | S |
---|---|---|---|---|---|---|
|
|
|
1
(2) |
2
(3) |
3
|
4
|
5
|
6
(2) |
7
(1) |
8
|
9
(4) |
10
|
11
|
12
|
13
|
14
(1) |
15
(1) |
16
(1) |
17
|
18
|
19
|
20
|
21
(1) |
22
|
23
|
24
(2) |
25
|
26
|
27
(2) |
28
(1) |
29
(1) |
30
(22) |
|
|
From: Abbas B. <ga...@us...> - 2011-06-24 18:01:56
|
Project "Postgres-XC". The branch, master has been updated via ff7be6e332b36fc7aad99876bf107e258264a7f1 (commit) from d56caa5e2ac517b83595586987794337c9dea357 (commit) - Log ----------------------------------------------------------------- commit ff7be6e332b36fc7aad99876bf107e258264a7f1 Author: Abbas <abb...@en...> Date: Fri Jun 24 22:59:57 2011 +0500 This patch adds a mechanism in XC to cancel a running query and to flush from the network buffers any results the data nodes might have sent before the query was cancelled. This was required to fix certain issues where the coordinator encounters an error while processing rows from data nodes and quits row processing. It then issues a new query and finds an old row description in the network buffer. This could, and did, crash the server. To cancel a query, a new pooler command 'h' is added. This command is sent to the pooler by the coordinator, and the pooler issues PQcancel to the respective data nodes. A cancel request is sent every time the coordinator raises an error of level ERROR or higher.
This commit fixes bug 3306801 diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c index c0103b8..d34f002 100644 --- a/src/backend/access/common/heaptuple.c +++ b/src/backend/access/common/heaptuple.c @@ -1130,7 +1130,7 @@ slot_deform_tuple(TupleTableSlot *slot, int natts) static void slot_deform_datarow(TupleTableSlot *slot) { - int attnum = slot->tts_tupleDescriptor->natts; + int attnum; int i; int col_count; char *cur = slot->tts_dataRow; @@ -1138,6 +1138,11 @@ slot_deform_datarow(TupleTableSlot *slot) uint16 n16; uint32 n32; + if (slot->tts_tupleDescriptor == NULL || slot->tts_dataRow == NULL) + return; + + attnum = slot->tts_tupleDescriptor->natts; + /* fastpath: exit if values already extracted */ if (slot->tts_nvalid == attnum) return; diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index ea30453..2f77f5e 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -1195,8 +1195,8 @@ pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections, int handle_response(PGXCNodeHandle * conn, RemoteQueryState *combiner) { - char *msg; - int msg_len; + char *msg; + int msg_len; char msg_type; bool suspended = false; @@ -1327,6 +1327,64 @@ handle_response(PGXCNodeHandle * conn, RemoteQueryState *combiner) /* + * Has the data node sent Ready For Query + */ + +bool +is_data_node_ready(PGXCNodeHandle * conn) +{ + char *msg; + int msg_len; + char msg_type; + bool suspended = false; + + for (;;) + { + /* + * If we are in the process of shutting down, we + * may be rolling back, and the buffer may contain other messages. + * We want to avoid a procarray exception + * as well as an error stack overflow. 
+ */ + if (proc_exit_inprogress) + conn->state = DN_CONNECTION_STATE_ERROR_FATAL; + + /* don't read from from the connection if there is a fatal error */ + if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) + return true; + + /* No data available, exit */ + if (!HAS_MESSAGE_BUFFERED(conn)) + return false; + + msg_type = get_message(conn, &msg_len, &msg); + switch (msg_type) + { + case 's': /* PortalSuspended */ + suspended = true; + break; + + case 'Z': /* ReadyForQuery */ + { + /* + * Return result depends on previous connection state. + * If it was PORTAL_SUSPENDED coordinator want to send down + * another EXECUTE to fetch more rows, otherwise it is done + * with the connection + */ + int result = suspended ? RESPONSE_SUSPENDED : RESPONSE_COMPLETE; + conn->transaction_status = msg[0]; + conn->state = DN_CONNECTION_STATE_IDLE; + conn->combiner = NULL; + return true; + } + } + } + /* never happen, but keep compiler quiet */ + return false; +} + +/* * Send BEGIN command to the Datanodes or Coordinators and receive responses */ static int @@ -2453,7 +2511,7 @@ DataNodeCopyIn(char *data_row, int len, ExecNodes *exec_nodes, PGXCNodeHandle** if (bytes_needed > COPY_BUFFER_SIZE) { /* First look if data node has sent a error message */ - int read_status = pgxc_node_read_data(primary_handle); + int read_status = pgxc_node_read_data(primary_handle, true); if (read_status == EOF || read_status < 0) { add_error_message(primary_handle, "failed to read data from data node"); @@ -2514,7 +2572,7 @@ DataNodeCopyIn(char *data_row, int len, ExecNodes *exec_nodes, PGXCNodeHandle** int to_send = handle->outEnd; /* First look if data node has sent a error message */ - int read_status = pgxc_node_read_data(handle); + int read_status = pgxc_node_read_data(handle, true); if (read_status == EOF || read_status < 0) { add_error_message(handle, "failed to read data from data node"); @@ -2615,7 +2673,7 @@ DataNodeCopyOut(ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections, FILE* if 
(handle_response(handle,combiner) == RESPONSE_EOF) { /* read some extra-data */ - read_status = pgxc_node_read_data(handle); + read_status = pgxc_node_read_data(handle, true); if (read_status < 0) ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), @@ -2679,30 +2737,9 @@ DataNodeCopyFinish(PGXCNodeHandle** copy_connections, int primary_data_node, if (primary_handle) { + error = true; if (primary_handle->state == DN_CONNECTION_STATE_COPY_IN || primary_handle->state == DN_CONNECTION_STATE_COPY_OUT) - { - /* msgType + msgLen */ - if (ensure_out_buffer_capacity(primary_handle->outEnd + 1 + 4, primary_handle) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - } - - primary_handle->outBuffer[primary_handle->outEnd++] = 'c'; - memcpy(primary_handle->outBuffer + primary_handle->outEnd, &nLen, 4); - primary_handle->outEnd += 4; - - /* We need response right away, so send immediately */ - if (pgxc_node_flush(primary_handle) < 0) - { - error = true; - } - } - else - { - error = true; - } + error = DataNodeCopyEnd(primary_handle, false); combiner = CreateResponseCombiner(conn_count + 1, combine_type); error = (pgxc_node_receive_responses(1, &primary_handle, timeout, combiner) != 0) || error; @@ -2712,30 +2749,9 @@ DataNodeCopyFinish(PGXCNodeHandle** copy_connections, int primary_data_node, { PGXCNodeHandle *handle = connections[i]; + error = true; if (handle->state == DN_CONNECTION_STATE_COPY_IN || handle->state == DN_CONNECTION_STATE_COPY_OUT) - { - /* msgType + msgLen */ - if (ensure_out_buffer_capacity(handle->outEnd + 1 + 4, handle) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - } - - handle->outBuffer[handle->outEnd++] = 'c'; - memcpy(handle->outBuffer + handle->outEnd, &nLen, 4); - handle->outEnd += 4; - - /* We need response right away, so send immediately */ - if (pgxc_node_flush(handle) < 0) - { - error = true; - } - } - else - { - error = true; - } + error = 
DataNodeCopyEnd(handle, false); } need_tran = !autocommit || primary_handle || conn_count > 1; @@ -2750,6 +2766,36 @@ DataNodeCopyFinish(PGXCNodeHandle** copy_connections, int primary_data_node, errmsg("Error while running COPY"))); } +/* + * End copy process on a connection + */ +bool +DataNodeCopyEnd(PGXCNodeHandle *handle, bool is_error) +{ + int nLen = htonl(4); + + if (handle == NULL) + return true; + + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + 4, handle) != 0) + return true; + + if (is_error) + handle->outBuffer[handle->outEnd++] = 'f'; + else + handle->outBuffer[handle->outEnd++] = 'c'; + + memcpy(handle->outBuffer + handle->outEnd, &nLen, 4); + handle->outEnd += 4; + + /* We need response right away, so send immediately */ + if (pgxc_node_flush(handle) < 0) + return true; + + return false; +} + RemoteQueryState * ExecInitRemoteQuery(RemoteQuery *node, EState *estate, int eflags) { @@ -3296,7 +3342,9 @@ do_query(RemoteQueryState *node) while (true) { int res; - pgxc_node_receive(1, &primaryconnection, NULL); + if (pgxc_node_receive(1, &primaryconnection, NULL)) + break; + res = handle_response(primaryconnection, node); if (res == RESPONSE_COMPLETE) break; @@ -4248,7 +4296,8 @@ ExecRemoteUtility(RemoteQuery *node) { int i = 0; - pgxc_node_receive(dn_conn_count, pgxc_connections->datanode_handles, NULL); + if (pgxc_node_receive(dn_conn_count, pgxc_connections->datanode_handles, NULL)) + break; /* * Handle input from the data nodes. 
* We do not expect data nodes returning tuples when running utility @@ -4296,7 +4345,9 @@ ExecRemoteUtility(RemoteQuery *node) { int i = 0; - pgxc_node_receive(co_conn_count, pgxc_connections->coord_handles, NULL); + if (pgxc_node_receive(co_conn_count, pgxc_connections->coord_handles, NULL)) + break; + while (i < co_conn_count) { int res = handle_response(pgxc_connections->coord_handles[i], remotestate); diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index a0b8da4..a2e90ce 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -20,6 +20,7 @@ #include <sys/select.h> #include <sys/time.h> #include <sys/types.h> +#include <sys/ioctl.h> #include <stdlib.h> #include <string.h> #include <unistd.h> @@ -279,21 +280,35 @@ pgxc_node_init(PGXCNodeHandle *handle, int sock, int nodenum) * Wait while at least one of specified connections has data available and read * the data into the buffer */ -int +bool pgxc_node_receive(const int conn_count, PGXCNodeHandle ** connections, struct timeval * timeout) { +#define ERROR_OCCURED true +#define NO_ERROR_OCCURED false int i, res_select, nfds = 0; - fd_set readfds; + fd_set readfds; + bool is_msg_buffered; FD_ZERO(&readfds); + + is_msg_buffered = false; + for (i = 0; i < conn_count; i++) + { + /* If connection has a buffered message */ + if (HAS_MESSAGE_BUFFERED(connections[i])) + { + is_msg_buffered = true; + break; + } + } + for (i = 0; i < conn_count; i++) { /* If connection finished sending do not wait input from it */ - if (connections[i]->state == DN_CONNECTION_STATE_IDLE - || HAS_MESSAGE_BUFFERED(connections[i])) + if (connections[i]->state == DN_CONNECTION_STATE_IDLE || HAS_MESSAGE_BUFFERED(connections[i])) continue; /* prepare select params */ @@ -313,7 +328,11 @@ pgxc_node_receive(const int conn_count, * Return if we do not have connections to receive input */ if (nfds == 0) - return 0; + { + if (is_msg_buffered) + return NO_ERROR_OCCURED; + return 
ERROR_OCCURED; + } retry: res_select = select(nfds + 1, &readfds, NULL, NULL, timeout); @@ -328,14 +347,16 @@ retry: elog(WARNING, "select() bad file descriptor set"); } elog(WARNING, "select() error: %d", errno); - return errno; + if (errno) + return ERROR_OCCURED; + return NO_ERROR_OCCURED; } if (res_select == 0) { /* Handle timeout */ elog(WARNING, "timeout while waiting for response"); - return EOF; + return ERROR_OCCURED; } /* read data */ @@ -345,7 +366,7 @@ retry: if (FD_ISSET(conn->sock, &readfds)) { - int read_status = pgxc_node_read_data(conn); + int read_status = pgxc_node_read_data(conn, true); if (read_status == EOF || read_status < 0) { @@ -354,26 +375,46 @@ retry: add_error_message(conn, "unexpected EOF on datanode connection"); elog(WARNING, "unexpected EOF on datanode connection"); /* Should we read from the other connections before returning? */ - return EOF; + return ERROR_OCCURED; } } } - return 0; + return NO_ERROR_OCCURED; } +/* + * Is there any data enqueued in the TCP input buffer waiting + * to be read sent by the PGXC node connection + */ + +int +pgxc_node_is_data_enqueued(PGXCNodeHandle *conn) +{ + int ret; + int enqueued; + + if (conn->sock < 0) + return 0; + ret = ioctl(conn->sock, FIONREAD, &enqueued); + if (ret != 0) + return 0; + + return enqueued; +} /* * Read up incoming messages from the PGXC node connection */ int -pgxc_node_read_data(PGXCNodeHandle *conn) +pgxc_node_read_data(PGXCNodeHandle *conn, bool close_if_error) { int someread = 0; int nread; if (conn->sock < 0) { - add_error_message(conn, "bad socket"); + if (close_if_error) + add_error_message(conn, "bad socket"); return EOF; } @@ -412,7 +453,8 @@ pgxc_node_read_data(PGXCNodeHandle *conn) */ if (conn->inSize - conn->inEnd < 100) { - add_error_message(conn, "can not allocate buffer"); + if (close_if_error) + add_error_message(conn, "can not allocate buffer"); return -1; } } @@ -424,7 +466,8 @@ retry: if (nread < 0) { - elog(DEBUG1, "dnrd errno = %d", errno); + if 
(close_if_error) + elog(DEBUG1, "dnrd errno = %d", errno); if (errno == EINTR) goto retry; /* Some systems return EAGAIN/EWOULDBLOCK for no data */ @@ -444,19 +487,22 @@ retry: * OK, we are getting a zero read even though select() says ready. This * means the connection has been closed. Cope. */ - add_error_message(conn, - "data node closed the connection unexpectedly\n" - "\tThis probably means the data node terminated abnormally\n" - "\tbefore or while processing the request.\n"); - conn->state = DN_CONNECTION_STATE_ERROR_FATAL; /* No more connection to - * backend */ - closesocket(conn->sock); - conn->sock = NO_SOCKET; - + if (close_if_error) + { + add_error_message(conn, + "data node closed the connection unexpectedly\n" + "\tThis probably means the data node terminated abnormally\n" + "\tbefore or while processing the request.\n"); + conn->state = DN_CONNECTION_STATE_ERROR_FATAL; /* No more connection to + * backend */ + closesocket(conn->sock); + conn->sock = NO_SOCKET; + } return -1; } #endif - add_error_message(conn, "could not receive data from server"); + if (close_if_error) + add_error_message(conn, "could not receive data from server"); return -1; } @@ -488,7 +534,8 @@ retry: if (nread == 0) { - elog(DEBUG1, "nread returned 0"); + if (close_if_error) + elog(DEBUG1, "nread returned 0"); return EOF; } @@ -661,6 +708,102 @@ release_handles(void) coord_count = 0; } +/* + * cancel a running query due to error while processing rows + */ +void +cancel_query(void) +{ + int i; + int dn_cancel[NumDataNodes]; + int co_cancel[NumCoords]; + int dn_count = 0; + int co_count = 0; + + if (datanode_count == 0 && coord_count == 0) + return; + + /* Collect Data Nodes handles */ + for (i = 0; i < NumDataNodes; i++) + { + PGXCNodeHandle *handle = &dn_handles[i]; + + if (handle->sock != NO_SOCKET) + { + if (handle->state == DN_CONNECTION_STATE_COPY_IN || handle->state == DN_CONNECTION_STATE_COPY_OUT) + { + DataNodeCopyEnd(handle, true); + } + else + { + if (handle->state != 
DN_CONNECTION_STATE_IDLE) + { + dn_cancel[dn_count++] = handle->nodenum; + } + } + } + } + + /* Collect Coordinator handles */ + for (i = 0; i < NumCoords; i++) + { + PGXCNodeHandle *handle = &co_handles[i]; + + if (handle->sock != NO_SOCKET) + { + if (handle->state == DN_CONNECTION_STATE_COPY_IN || handle->state == DN_CONNECTION_STATE_COPY_OUT) + { + DataNodeCopyEnd(handle, true); + } + else + { + if (handle->state != DN_CONNECTION_STATE_IDLE) + { + co_cancel[dn_count++] = handle->nodenum; + } + } + } + } + + PoolManagerCancelQuery(dn_count, dn_cancel, co_count, co_cancel); +} + +/* + * This method won't return until all network buffers are empty + * To ensure all data in all network buffers is read and wasted + */ +void +clear_all_data(void) +{ + int i; + + if (datanode_count == 0 && coord_count == 0) + return; + + /* Collect Data Nodes handles */ + for (i = 0; i < NumDataNodes; i++) + { + PGXCNodeHandle *handle = &dn_handles[i]; + + if (handle->sock != NO_SOCKET && handle->state != DN_CONNECTION_STATE_IDLE) + { + pgxc_node_flush_read(handle); + handle->state = DN_CONNECTION_STATE_IDLE; + } + } + + /* Collect Coordinator handles */ + for (i = 0; i < NumCoords; i++) + { + PGXCNodeHandle *handle = &co_handles[i]; + + if (handle->sock != NO_SOCKET && handle->state != DN_CONNECTION_STATE_IDLE) + { + pgxc_node_flush_read(handle); + handle->state = DN_CONNECTION_STATE_IDLE; + } + } +} /* * Ensure specified amount of data can fit to the incoming buffer and @@ -1224,6 +1367,31 @@ pgxc_node_flush(PGXCNodeHandle *handle) } /* + * This method won't return until network buffer is empty or error occurs + * To ensure all data in network buffers is read and wasted + */ +void +pgxc_node_flush_read(PGXCNodeHandle *handle) +{ + bool is_ready; + int read_result; + + if (handle == NULL) + return; + + while(true) + { + is_ready = is_data_node_ready(handle); + if (is_ready == true) + break; + + read_result = pgxc_node_read_data(handle, false); + if (read_result < 0) + break; + } +} + 
+/* * Send specified statement down to the PGXC node */ int diff --git a/src/backend/pgxc/pool/poolcomm.c b/src/backend/pgxc/pool/poolcomm.c index 79a3776..22dc813 100644 --- a/src/backend/pgxc/pool/poolcomm.c +++ b/src/backend/pgxc/pool/poolcomm.c @@ -435,9 +435,7 @@ pool_flush(PoolPort *port) * If shutting down already, do not call. */ if (!proc_exit_inprogress) - ereport(ERROR, - (errcode_for_socket_access(), - errmsg("could not send data to client: %m"))); + return 0; } /* diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 1b2c4bf..463bd5a 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -104,6 +104,7 @@ static DatabasePool *find_database_pool_to_clean(const char *database, List *co_list); static DatabasePool *remove_database_pool(const char *database, const char *user_name); static int *agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist); +static int cancel_query_on_connections(PoolAgent *agent, List *datanodelist, List *coordlist); static PGXCNodePoolSlot *acquire_connection(DatabasePool *dbPool, int node, char client_conn_type); static void agent_release_connections(PoolAgent *agent, List *dn_discard, List *co_discard); static void agent_reset_params(PoolAgent *agent, List *dn_list, List *co_list); @@ -878,17 +879,17 @@ agent_handle_input(PoolAgent * agent, StringInfo s) */ for (;;) { - const char *database = NULL; - const char *user_name = NULL; - const char *set_command; + const char *database = NULL; + const char *user_name = NULL; + const char *set_command; bool is_local; - int datanodecount; - int coordcount; - List *datanodelist = NIL; - List *coordlist = NIL; - int *fds; - int *pids; - int i, len, res; + int datanodecount; + int coordcount; + List *datanodelist = NIL; + List *coordlist = NIL; + int *fds; + int *pids; + int i, len, res; /* * During a pool cleaning, Abort, Connect and Get Connections messages @@ -1001,6 +1002,32 @@ 
agent_handle_input(PoolAgent * agent, StringInfo s) if (fds) pfree(fds); break; + + case 'h': /* Cancel SQL Command in progress on specified connections */ + /* + * Length of message is caused by: + * - Message header = 4bytes + * - List of datanodes = NumDataNodes * 4bytes (max) + * - List of coordinators = NumCoords * 4bytes (max) + * - Number of Datanodes sent = 4bytes + * - Number of Coordinators sent = 4bytes + */ + pool_getmessage(&agent->port, s, 4 * NumDataNodes + 4 * NumCoords + 12); + datanodecount = pq_getmsgint(s, 4); + for (i = 0; i < datanodecount; i++) + datanodelist = lappend_int(datanodelist, pq_getmsgint(s, 4)); + coordcount = pq_getmsgint(s, 4); + /* It is possible that no Coordinators are involved in the transaction */ + for (i = 0; i < coordcount; i++) + coordlist = lappend_int(coordlist, pq_getmsgint(s, 4)); + pq_getmsgend(s); + + cancel_query_on_connections(agent, datanodelist, coordlist); + list_free(datanodelist); + list_free(coordlist); + + break; + case 'r': /* RELEASE CONNECTIONS */ pool_getmessage(&agent->port, s, 4 * NumDataNodes + 4 * NumCoords + 12); datanodecount = pq_getmsgint(s, 4); @@ -1245,6 +1272,61 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist) return result; } +/* + * Cancel query + */ +static int +cancel_query_on_connections(PoolAgent *agent, List *datanodelist, List *coordlist) +{ + int i; + ListCell *nodelist_item; + char errbuf[256]; + int nCount; + bool bRet; + + nCount = 0; + + if (agent == NULL) + return nCount; + + /* Send cancel on Data nodes first */ + foreach(nodelist_item, datanodelist) + { + int node = lfirst_int(nodelist_item); + + if(node <= 0 || node > NumDataNodes) + continue; + + if (agent->dn_connections == NULL) + break; + + bRet = PQcancel((PGcancel *) agent->dn_connections[node - 1]->xc_cancelConn, errbuf, sizeof(errbuf)); + if (bRet != false) + { + nCount++; + } + } + + /* Send cancel to Coordinators too, e.g. 
if DDL was in progress */ + foreach(nodelist_item, coordlist) + { + int node = lfirst_int(nodelist_item); + + if(node <= 0 || node > NumDataNodes) + continue; + + if (agent->coord_connections == NULL) + break; + + bRet = PQcancel((PGcancel *) agent->coord_connections[node - 1]->xc_cancelConn, errbuf, sizeof(errbuf)); + if (bRet != false) + { + nCount++; + } + } + + return nCount; +} /* * Return connections back to the pool @@ -1262,6 +1344,9 @@ PoolManagerReleaseConnections(int dn_ndisc, int* dn_discard, int co_ndisc, int* Assert(Handle); + if (dn_ndisc == 0 && co_ndisc == 0) + return; + /* Insert the list of Datanodes in buffer */ n32 = htonl((uint32) dn_ndisc); buf[0] = n32; @@ -1290,6 +1375,52 @@ PoolManagerReleaseConnections(int dn_ndisc, int* dn_discard, int co_ndisc, int* pool_flush(&Handle->port); } +/* + * Cancel Query + */ +void +PoolManagerCancelQuery(int dn_count, int* dn_list, int co_count, int* co_list) +{ + uint32 n32; + /* + * Buffer contains the list of both Coordinator and Datanodes, as well + * as the number of connections + */ + uint32 buf[2 + dn_count + co_count]; + int i; + + if (Handle == NULL || dn_list == NULL || co_list == NULL) + return; + + if (dn_count == 0 && co_count == 0) + return; + + /* Insert the list of Datanodes in buffer */ + n32 = htonl((uint32) dn_count); + buf[0] = n32; + + for (i = 0; i < dn_count;) + { + n32 = htonl((uint32) dn_list[i++]); + buf[i] = n32; + } + + /* Insert the list of Coordinators in buffer */ + n32 = htonl((uint32) co_count); + buf[dn_count + 1] = n32; + + /* Not necessary to send to pooler a request if there is no Coordinator */ + if (co_count != 0) + { + for (i = dn_count + 1; i < (dn_count + co_count + 1);) + { + n32 = htonl((uint32) co_list[i - (dn_count + 1)]); + buf[++i] = n32; + } + } + pool_putmessage(&Handle->port, 'h', (char *) buf, (2 + dn_count + co_count) * sizeof(uint32)); + pool_flush(&Handle->port); +} /* * Release connections for Datanodes and Coordinators @@ -1950,6 +2081,8 @@ 
grow_pool(DatabasePool * dbPool, int index, char client_conn_type) break; } + slot->xc_cancelConn = PQgetCancel(slot->conn); + /* Insert at the end of the pool */ nodePool->slot[(nodePool->freeSize)++] = slot; @@ -1968,6 +2101,7 @@ grow_pool(DatabasePool * dbPool, int index, char client_conn_type) static void destroy_slot(PGXCNodePoolSlot *slot) { + PQfreeCancel(slot->xc_cancelConn); PGXCNodeClose(slot->conn); pfree(slot); } diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c index b2fab35..60e9cac 100644 --- a/src/backend/utils/error/elog.c +++ b/src/backend/utils/error/elog.c @@ -71,6 +71,9 @@ #include "utils/guc.h" #include "utils/memutils.h" #include "utils/ps_status.h" +#ifdef PGXC +#include "pgxc/pgxc.h" +#endif #undef _ @@ -221,6 +224,13 @@ errstart(int elevel, const char *filename, int lineno, */ if (elevel >= ERROR) { +#ifdef PGXC + if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) + { + cancel_query(); + clear_all_data(); + } +#endif /* * If we are inside a critical section, all errors become PANIC * errors. See miscadmin.h. @@ -1121,6 +1131,14 @@ elog_finish(int elevel, const char *fmt,...) CHECK_STACK_DEPTH(); +#ifdef PGXC + if (elevel >= ERROR && IS_PGXC_COORDINATOR && !IsConnFromCoord()) + { + cancel_query(); + clear_all_data(); + } +#endif + /* * Do errstart() to see if we actually want to report the message. 
*/ diff --git a/src/include/access/xact.h b/src/include/access/xact.h index 7cdb0f6..d864470 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -234,5 +234,4 @@ extern int xactGetCommittedChildren(TransactionId **ptr); extern void xact_redo(XLogRecPtr lsn, XLogRecord *record); extern void xact_desc(StringInfo buf, uint8 xl_info, char *rec); - #endif /* XACT_H */ diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index c2fe884..48d23ca 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -141,6 +141,7 @@ extern PGXCNodeHandle** DataNodeCopyBegin(const char *query, List *nodelist, Sna extern int DataNodeCopyIn(char *data_row, int len, ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections); extern uint64 DataNodeCopyOut(ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections, FILE* copy_file); extern void DataNodeCopyFinish(PGXCNodeHandle** copy_connections, int primary_data_node, CombineType combine_type); +extern bool DataNodeCopyEnd(PGXCNodeHandle *handle, bool is_error); extern int DataNodeCopyInBinaryForAll(char *msg_buf, int len, PGXCNodeHandle** copy_connections); extern int ExecCountSlotsRemoteQuery(RemoteQuery *node); @@ -150,10 +151,8 @@ extern void ExecEndRemoteQuery(RemoteQueryState *step); extern void ExecRemoteUtility(RemoteQuery *node); extern int handle_response(PGXCNodeHandle * conn, RemoteQueryState *combiner); -#ifdef PGXC -extern void HandleCmdComplete(CmdType commandType, CombineTag *combine, const char *msg_body, - size_t len); -#endif +extern bool is_data_node_ready(PGXCNodeHandle * conn); +extern void HandleCmdComplete(CmdType commandType, CombineTag *combine, const char *msg_body, size_t len); extern bool FetchTuple(RemoteQueryState *combiner, TupleTableSlot *slot); extern void BufferConnection(PGXCNodeHandle *conn); diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 8f1eb54..4b66a75 100644 --- a/src/include/pgxc/pgxcnode.h +++ 
b/src/include/pgxc/pgxcnode.h @@ -28,6 +28,7 @@ /* Connection to data node maintained by Pool Manager */ typedef struct PGconn NODE_CONNECTION; +typedef struct PGcancel NODE_CANCEL; /* Helper structure to access data node from Session */ typedef enum @@ -105,6 +106,9 @@ extern void PGXCNodeCleanAndRelease(int code, Datum arg); extern PGXCNodeAllHandles *get_handles(List *datanodelist, List *coordlist, bool is_query_coord_only); extern void release_handles(void); +extern void cancel_query(void); +extern void clear_all_data(void); + extern int get_transaction_nodes(PGXCNodeHandle ** connections, char client_conn_type, @@ -130,11 +134,14 @@ extern int pgxc_node_send_gxid(PGXCNodeHandle * handle, GlobalTransactionId gxid extern int pgxc_node_send_snapshot(PGXCNodeHandle * handle, Snapshot snapshot); extern int pgxc_node_send_timestamp(PGXCNodeHandle * handle, TimestampTz timestamp); -extern int pgxc_node_receive(const int conn_count, +extern bool pgxc_node_receive(const int conn_count, PGXCNodeHandle ** connections, struct timeval * timeout); -extern int pgxc_node_read_data(PGXCNodeHandle * conn); +extern int pgxc_node_read_data(PGXCNodeHandle * conn, bool close_if_error); +extern int pgxc_node_is_data_enqueued(PGXCNodeHandle *conn); + extern int send_some(PGXCNodeHandle * handle, int len); extern int pgxc_node_flush(PGXCNodeHandle *handle); +extern void pgxc_node_flush_read(PGXCNodeHandle *handle); extern int pgxc_all_handles_send_gxid(PGXCNodeAllHandles *pgxc_handles, GlobalTransactionId gxid, bool stop_at_error); extern int pgxc_all_handles_send_query(PGXCNodeAllHandles *pgxc_handles, const char *buffer, bool stop_at_error); diff --git a/src/include/pgxc/poolmgr.h b/src/include/pgxc/poolmgr.h index 4de9e4a..7939768 100644 --- a/src/include/pgxc/poolmgr.h +++ b/src/include/pgxc/poolmgr.h @@ -35,6 +35,7 @@ typedef struct { struct timeval released; NODE_CONNECTION *conn; + NODE_CANCEL *xc_cancelConn; } PGXCNodePoolSlot; /* Pool of connections to specified pgxc node */ 
@@ -149,4 +150,7 @@ extern int PoolManagerAbortTransactions(char *dbname, char *username, int **proc /* Return connections back to the pool, for both Coordinator and Datanode connections */ extern void PoolManagerReleaseConnections(int dn_ndisc, int* dn_discard, int co_ndisc, int* co_discard); +/* Cancel a running query on data nodes as well as on other coordinators */ +extern void PoolManagerCancelQuery(int dn_count, int* dn_list, int co_count, int* co_list); + #endif diff --git a/src/test/regress/expected/domain_1.out b/src/test/regress/expected/domain_1.out index 07808af..02f1556 100644 --- a/src/test/regress/expected/domain_1.out +++ b/src/test/regress/expected/domain_1.out @@ -48,8 +48,7 @@ ERROR: value too long for type character varying(5) INSERT INTO basictest values ('88', 'haha', 'short', '123.1212'); -- Truncate numeric -- Test copy COPY basictest (testvarchar) FROM stdin; -- fail -ERROR: value too long for type character varying(5) -CONTEXT: COPY basictest, line 1, column testvarchar: "notsoshorttext" +ERROR: Error while running COPY COPY basictest (testvarchar) FROM stdin; select * from basictest order by 1, 2, 3, 4; testint4 | testtext | testvarchar | testnumeric @@ -129,8 +128,7 @@ select testint4arr[1], testchar4arr[2:2] from domarrtest order by 1, 2; COPY domarrtest FROM stdin; COPY domarrtest FROM stdin; -- fail -ERROR: value too long for type character varying(4) -CONTEXT: COPY domarrtest, line 1, column testchar4arr: "{qwerty,w,e}" +ERROR: Error while running COPY select * from domarrtest order by 1, 2; testint4arr | testchar4arr ---------------+--------------------- @@ -174,8 +172,7 @@ INSERT INTO nulltest values ('a', 'b', 'c', NULL, 'd'); -- Good COPY nulltest FROM stdin; --fail ERROR: Error while running COPY COPY nulltest FROM stdin; --fail -ERROR: domain dcheck does not allow null values -CONTEXT: COPY nulltest, line 1, column col5: null input +ERROR: Error while running COPY -- Last row is bad COPY nulltest FROM stdin; ERROR: Error while 
running COPY diff --git a/src/test/regress/expected/xc_distkey.out b/src/test/regress/expected/xc_distkey.out index d050b27..819952a 100644 --- a/src/test/regress/expected/xc_distkey.out +++ b/src/test/regress/expected/xc_distkey.out @@ -451,15 +451,15 @@ select * from ts_tab order by a; (2 rows) select * from ts_tab where a = 'May 10, 2011 00:01:02.03'; - a ------------------------- - 2011-05-10 00:01:02.03 + a +----------------------------- + Tue May 10 00:01:02.03 2011 (1 row) select * from ts_tab where a = 'August 14, 2001 23:59:59.99'; - a ------------------------- - 2001-08-14 23:59:59.99 + a +----------------------------- + Tue Aug 14 23:59:59.99 2001 (1 row) create table in_tab(a interval) distribute by modulo(a); @@ -517,15 +517,15 @@ select * from atim_tab order by a; (2 rows) select * from atim_tab where a = abstime('May 10, 2011 00:01:02.03'); - a ------------------------- - 2011-05-10 12:01:02+05 + a +------------------------------ + Tue May 10 00:01:02 2011 PDT (1 row) select * from atim_tab where a = abstime('Jun 23, 2001 23:59:59.99'); - a ------------------------- - 2001-06-24 11:59:59+05 + a +------------------------------ + Sat Jun 23 23:59:59 2001 PDT (1 row) create table rtim_tab(a reltime) distribute by modulo(a); @@ -563,13 +563,13 @@ select * from date_tab order by a; select * from date_tab where a = 'May 10, 2011'; a ------------ - 2011-05-10 + 05-10-2011 (1 row) select * from date_tab where a = 'August 23, 2001'; a ------------ - 2001-08-23 + 08-23-2001 (1 row) create table tstz_tab(a timestamp with time zone) distribute by modulo(a); @@ -583,15 +583,15 @@ select * from tstz_tab order by a; (2 rows) select * from tstz_tab where a = 'May 10, 2011 00:01:02.03 PST'; - a ---------------------------- - 2011-05-10 13:01:02.03+05 + a +--------------------------------- + Tue May 10 01:01:02.03 2011 PDT (1 row) select * from tstz_tab where a = 'Jun 23, 2001 23:59:59.99 PST'; - a ---------------------------- - 2001-06-24 12:59:59.99+05 + a 
+--------------------------------- + Sun Jun 24 00:59:59.99 2001 PDT (1 row) create table tstz_tab_h(a timestamp with time zone) distribute by hash(a); @@ -605,14 +605,14 @@ select * from tstz_tab_h order by a; (2 rows) select * from tstz_tab_h where a = 'May 10, 2011 00:01:02.03 PST'; - a ---------------------------- - 2011-05-10 13:01:02.03+05 + a +--------------------------------- + Tue May 10 01:01:02.03 2011 PDT (1 row) select * from tstz_tab_h where a = 'Jun 23, 2001 23:59:59.99 PST'; - a ---------------------------- - 2001-06-24 12:59:59.99+05 + a +--------------------------------- + Sun Jun 24 00:59:59.99 2001 PDT (1 row) diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index 658f930..6b58aa7 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -42,7 +42,7 @@ test: comments test: geometry #After supporting other data types as distribution key, this test case crashes the server #Bug ID 3306801 tracks this crash -#test: horology +test: horology test: oidjoins test: type_sanity test: opr_sanity ----------------------------------------------------------------------- Summary of changes: src/backend/access/common/heaptuple.c | 7 +- src/backend/pgxc/pool/execRemote.c | 159 ++++++++++++++-------- src/backend/pgxc/pool/pgxcnode.c | 218 ++++++++++++++++++++++++++---- src/backend/pgxc/pool/poolcomm.c | 4 +- src/backend/pgxc/pool/poolmgr.c | 154 ++++++++++++++++++++-- src/backend/utils/error/elog.c | 18 +++ src/include/access/xact.h | 1 - src/include/pgxc/execRemote.h | 7 +- src/include/pgxc/pgxcnode.h | 11 ++- src/include/pgxc/poolmgr.h | 4 + src/test/regress/expected/domain_1.out | 9 +- src/test/regress/expected/xc_distkey.out | 52 ++++---- src/test/regress/serial_schedule | 2 +- 13 files changed, 513 insertions(+), 133 deletions(-) hooks/post-receive -- Postgres-XC |
From: Pavan D. <pa...@us...> - 2011-06-24 08:30:26
|
Project "Postgres-XC". The branch, master has been updated via d56caa5e2ac517b83595586987794337c9dea357 (commit) via 097c3c3816c410c6b570c6ef9aa656d4e1f9da2e (commit) via bc8d2c2e0127a90a7f5b01eb3a9be2673c2b4c04 (commit) via a6b077003d974ba5ab612d557eab811d9efc934b (commit) via ad889ead370a7061b9fa57d3b8ce8816b8c251f4 (commit) via b72426e3b1c0c13cd710781a1ff6fd65a96e82d8 (commit) via 9db03183fe491e60dda1a6a5b36b44c55149e077 (commit) via 3cbf503a660e19f5c48c57d3ecd4a746a468cd68 (commit) via 6bbdc5b5befa3ef1f6fbb7a5548b8aa7891873d6 (commit) via 246072c6301bf3e38331ee49e4ff9bd4bd42b9a4 (commit) from 2a828017d88ff64453b37771337646454316269c (commit) - Log ----------------------------------------------------------------- commit d56caa5e2ac517b83595586987794337c9dea357 Merge: 2a82801 097c3c3 Author: Pavan Deolasee <pav...@gm...> Date: Fri Jun 24 13:47:34 2011 +0530 Merge branch 'pgxc-barrier-rebase' into PGXC-master commit 097c3c3816c410c6b570c6ef9aa656d4e1f9da2e Author: Pavan Deolasee <pav...@gm...> Date: Fri Jun 24 13:44:20 2011 +0530 Change the recovery_barrier_id parameter in recovery.conf to recovery_target_barrier to be consistent with other names. Add a sample "recovery_target_barrier" to recovery.conf.sample diff --git a/src/backend/access/transam/recovery.conf.sample b/src/backend/access/transam/recovery.conf.sample index 722c7d6..855c113 100644 --- a/src/backend/access/transam/recovery.conf.sample +++ b/src/backend/access/transam/recovery.conf.sample @@ -68,15 +68,18 @@ # If you want to stop rollforward at a specific point, you # must set a recovery target. # -# You may set a recovery target either by transactionId, or -# by timestamp. Recovery may either include or exclude the -# transaction(s) with the recovery target value (ie, stop either -# just after or just before the given target, respectively). +# You may set a recovery target either by transactionId, +# by timestamp or by barrier id. 
Recovery may either include or exclude the +# transaction(s) with the recovery target value in case of timestamp or +# transactionId (ie, stop either just after or just before the given target, +# respectively). In case of barrier, the recovery stops exactly at that point # #recovery_target_time = '' # e.g. '2004-07-14 22:39:00 EST' # #recovery_target_xid = '' # +#recovery_target_barrier = '' +# #recovery_target_inclusive = 'true' # # diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index ebbe6f0..71ee729 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -5237,6 +5237,13 @@ readRecoveryCommandFile(void) (errmsg("recovery_target_time = '%s'", timestamptz_to_str(recoveryTargetTime)))); } +#ifdef PGXC + else if (strcmp(tok1, "recovery_target_barrier") == 0) + { + recoveryTarget = RECOVERY_TARGET_BARRIER; + recoveryTargetBarrierId = pstrdup(tok2); + } +#endif else if (strcmp(tok1, "recovery_target_inclusive") == 0) { /* @@ -5249,13 +5256,6 @@ readRecoveryCommandFile(void) ereport(DEBUG2, (errmsg("recovery_target_inclusive = %s", tok2))); } -#ifdef PGXC - else if (strcmp(tok1, "recovery_barrier_id") == 0) - { - recoveryTarget = RECOVERY_TARGET_BARRIER; - recoveryTargetBarrierId = pstrdup(tok2); - } -#endif else if (strcmp(tok1, "standby_mode") == 0) { if (!parse_bool(tok2, &StandbyMode)) @@ -5279,13 +5279,6 @@ readRecoveryCommandFile(void) (errmsg("trigger_file = '%s'", TriggerFile))); } -#ifdef PGXC - else if (strcmp(tok1, "recovery_barrier_id") == 0) - { - recoveryTarget = true; - recoveryTargetBarrierId = pstrdup(tok2); - } -#endif else ereport(FATAL, (errmsg("unrecognized recovery parameter \"%s\"", commit bc8d2c2e0127a90a7f5b01eb3a9be2673c2b4c04 Author: Pavan Deolasee <pav...@gm...> Date: Thu Jun 23 13:28:10 2011 +0530 If the barrier id not specified in the CREATE BARRIER command, auto-generate a barrier id based on node_id and the current timestamp of the coordinator diff --git 
a/src/backend/pgxc/barrier/barrier.c b/src/backend/pgxc/barrier/barrier.c index 32ff484..9512cbc 100644 --- a/src/backend/pgxc/barrier/barrier.c +++ b/src/backend/pgxc/barrier/barrier.c @@ -15,6 +15,7 @@ */ #include "postgres.h" +#include "access/gtm.h" #include "libpq/libpq.h" #include "libpq/pqformat.h" #include "pgxc/barrier.h" @@ -139,18 +140,25 @@ ProcessCreateBarrierExecute(const char *id) static const char * generate_barrier_id(const char *id) { + char genid[1024]; + TimestampTz ts; + /* - * TODO If the caller can passed a NULL value, generate an id which is - * guaranteed to be unique across the cluster. We can use a combination of - * the coordinator node id and a timestamp. This may not be complete if we - * support changing coordinator ids without initdb or the system clocks are - * modified. - * - * Another option would be to let the GTM issue globally unique barrier - * IDs (GTM-timestamp based). For the time being, we leave it to the user - * to come up with an unique identifier. + * If the caller can passed a NULL value, generate an id which is + * guaranteed to be unique across the cluster. We use a combination of + * the coordinator node id and current timestamp. */ - return id ? id : pstrdup("dummy_barrier_id"); + + if (id) + return id; + + ts = GetCurrentTimestamp(); +#ifdef HAVE_INT64_TIMESTAMP + sprintf(genid, "%d_"INT64_FORMAT, PGXCNodeId, ts); +#else + sprintf(genid, "%d_%.0f", PGXCNodeId, ts); +#endif + return pstrdup(genid); } static PGXCNodeAllHandles * commit a6b077003d974ba5ab612d557eab811d9efc934b Author: Michael P <mic...@us...> Date: Mon Jun 6 16:35:26 2011 +0900 Correction of spelling mistakes Addition of a couple of compilation flags forgotten. 
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 6c93c21..ebbe6f0 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -39,7 +39,9 @@ #include "funcapi.h" #include "libpq/pqsignal.h" #include "miscadmin.h" +#ifdef PGXC #include "pgxc/barrier.h" +#endif #include "pgstat.h" #include "postmaster/bgwriter.h" #include "replication/walreceiver.h" @@ -4371,6 +4373,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, xlogfname, recoveryStopAfter ? "after" : "before", timestamptz_to_str(recoveryStopTime)); +#ifdef PGXC else if (recoveryTarget == RECOVERY_TARGET_BARRIER) snprintf(buffer, sizeof(buffer), "%s%u\t%s\t%s %s\n", @@ -4379,6 +4382,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, xlogfname, recoveryStopAfter ? "after" : "before", recoveryTargetBarrierId); +#endif else snprintf(buffer, sizeof(buffer), "%s%u\t%s\tno recovery target specified\n", @@ -5492,24 +5496,26 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) return false; record_info = record->xl_info & ~XLR_INFO_MASK; +#ifdef PGXC if (record->xl_rmid == RM_XACT_ID) { - if (record_info == XLOG_XACT_COMMIT) - { - xl_xact_commit *recordXactCommitData; +#endif + if (record_info == XLOG_XACT_COMMIT) + { + xl_xact_commit *recordXactCommitData; - recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record); - recordXtime = recordXactCommitData->xact_time; - } - else if (record_info == XLOG_XACT_ABORT) - { - xl_xact_abort *recordXactAbortData; + recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record); + recordXtime = recordXactCommitData->xact_time; + } + else if (record_info == XLOG_XACT_ABORT) + { + xl_xact_abort *recordXactAbortData; - recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record); - recordXtime = recordXactAbortData->xact_time; - } + recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record); + recordXtime = recordXactAbortData->xact_time; } #ifdef PGXC + } /* end 
if (record->xl_rmid == RM_XACT_ID) */ else if (record->xl_rmid == RM_BARRIER_ID) { if (record_info == XLOG_BARRIER_CREATE) @@ -5883,10 +5889,12 @@ StartupXLOG(void) ereport(LOG, (errmsg("starting point-in-time recovery to %s", timestamptz_to_str(recoveryTargetTime)))); +#ifdef PGXC else if (recoveryTarget == RECOVERY_TARGET_BARRIER) ereport(LOG, (errmsg("starting point-in-time recovery to barrier %s", (recoveryTargetBarrierId)))); +#endif else ereport(LOG, (errmsg("starting archive recovery"))); diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index d1dad1c..01b0f51 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -2362,6 +2362,9 @@ _equalValue(Value *a, Value *b) } #ifdef PGXC +/* + * stuff from barrier.h + */ static bool _equalBarrierStmt(BarrierStmt *a, BarrierStmt *b) diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 57a5c2b..94b2cd8 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -7008,7 +7008,6 @@ opt_barrier_id: $$ = NULL; } ; - /* PGXC_END */ /***************************************************************************** diff --git a/src/backend/pgxc/barrier/Makefile b/src/backend/pgxc/barrier/Makefile index d80bbec..9505889 100644 --- a/src/backend/pgxc/barrier/Makefile +++ b/src/backend/pgxc/barrier/Makefile @@ -1,7 +1,7 @@ #------------------------------------------------------------------------- # # Makefile-- -# Makefile for pool +# Makefile for barrier # # Portions Copyright (c) 2010-2011 Nippon Telegraph and Telephone Corporation # diff --git a/src/backend/pgxc/barrier/barrier.c b/src/backend/pgxc/barrier/barrier.c index 1b44f36..32ff484 100644 --- a/src/backend/pgxc/barrier/barrier.c +++ b/src/backend/pgxc/barrier/barrier.c @@ -44,7 +44,7 @@ extern void ProcessCreateBarrierExecute(const char *id); * while all other backend starting a 2PC will grab the lock in shared * mode. 
So as long as we hold the exclusive lock, no other backend start a * new 2PC and there can not be any 2PC in-progress. This technique would - * rely on assumption that an exclsuive lock requester is not starved by + * rely on assumption that an exclusive lock requester is not starved by * share lock requesters. * * Note: To ensure that the 2PC are not blocked for a long time, we should @@ -76,7 +76,7 @@ ProcessCreateBarrierPrepare(const char *id) } /* - * Mark the completetion of an on-going barrier. We must have remembered the + * Mark the completion of an on-going barrier. We must have remembered the * barrier ID when we received the CREATE BARRIER PREPARE command */ void @@ -103,7 +103,7 @@ ProcessCreateBarrierEnd(const char *id) } /* - * Execute the CREATE BARRIER comamnd. Write a BARRIER WAL record and flush the + * Execute the CREATE BARRIER command. Write a BARRIER WAL record and flush the * WAL buffers to disk before returning to the caller. Writing the WAL record * does not guarantee successful completion of the barrier command. */ @@ -140,15 +140,15 @@ static const char * generate_barrier_id(const char *id) { /* - * TODO If the caller can passeed a NULL value, generate an id which is + * TODO If the caller can passed a NULL value, generate an id which is * guaranteed to be unique across the cluster. We can use a combination of * the coordinator node id and a timestamp. This may not be complete if we * support changing coordinator ids without initdb or the system clocks are * modified. * * Another option would be to let the GTM issue globally unique barrier - * IDs. For the time being, we leave it to the user to come up with an - * unique identifier + * IDs (GTM-timestamp based). For the time being, we leave it to the user + * to come up with an unique identifier. */ return id ? 
id : pstrdup("dummy_barrier_id"); } @@ -326,7 +326,7 @@ PrepareBarrier(const char *id) */ LWLockAcquire(BarrierLock, LW_EXCLUSIVE); - elog(DEBUG2, "Disabled 2PC commits origniating at the diriving coordinator"); + elog(DEBUG2, "Disabled 2PC commits originating at the driving coordinator"); /* * TODO Start a timer to cancel the barrier request in case of a timeout @@ -375,7 +375,7 @@ ExecuteBarrier(const char *id) if (handle->state != DN_CONNECTION_STATE_IDLE) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send CREATE BARRIER PREPARE request " + errmsg("Failed to send CREATE BARRIER EXECUTE request " "to the node"))); barrier_idlen = strlen(id) + 1; diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 441a625..f8a5c17 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -1931,7 +1931,7 @@ PGXCNodeImplicitCommitPrepared(GlobalTransactionId prepare_xid, * We should acquire the BarrierLock in SHARE mode here to ensure that * there are no in-progress barrier at this point. This mechanism would * work as long as LWLock mechanism does not starve a EXCLUSIVE lock - * requesster + * requester */ LWLockAcquire(BarrierLock, LW_SHARED); res = pgxc_node_implicit_commit_prepared(prepare_xid, commit_xid, diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index d12e81b..ad3eb2e 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -489,7 +489,7 @@ standard_ProcessUtility(Node *parsetree, * * XXX We call FinishPreparedTransaction inside * PGXCNodeCommitPrepared if we are doing a local - * operation. This is convinient because we want to + * operation. This is convenient because we want to * hold on to the BarrierLock until local transaction * is committed too. 
* diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index f276a1f..591fb53 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -184,8 +184,11 @@ typedef enum { RECOVERY_TARGET_UNSET, RECOVERY_TARGET_XID, - RECOVERY_TARGET_TIME, + RECOVERY_TARGET_TIME +#ifdef PGXC + , RECOVERY_TARGET_BARRIER +#endif } RecoveryTargetType; extern XLogRecPtr XactLastRecEnd; diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 7ba9208..22b3b75 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -2302,7 +2302,6 @@ typedef struct BarrierStmt NodeTag type; const char *id; /* User supplied barrier id, if any */ } BarrierStmt; - #endif /* ---------------------- commit ad889ead370a7061b9fa57d3b8ce8816b8c251f4 Author: Pavan Deolasee <pav...@gm...> Date: Thu May 5 17:00:04 2011 +0530 Stop at the appropriate barrier record and set the last time correctly diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 6800e68..6c93c21 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -5548,14 +5548,16 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) #ifdef PGXC else if (recoveryTarget == RECOVERY_TARGET_BARRIER) { - if ((record->xl_rmid != RM_BARRIER_ID) || - (record_info != XLOG_BARRIER_CREATE)) - return false; - - ereport(DEBUG2, - (errmsg("checking if barrier record matches the target barrier"))); - if (strcmp(recoveryTargetBarrierId, recordBarrierId) == 0) - stopsAtThisBarrier = true; + stopsHere = false; + if ((record->xl_rmid == RM_BARRIER_ID) && + (record_info == XLOG_BARRIER_CREATE)) + { + ereport(DEBUG2, + (errmsg("checking if barrier record matches the target " + "barrier"))); + if (strcmp(recoveryTargetBarrierId, recordBarrierId) == 0) + stopsAtThisBarrier = true; + } } #endif else commit b72426e3b1c0c13cd710781a1ff6fd65a96e82d8 Author: Pavan Deolasee <pav...@gm...> Date: Wed May 4 16:43:02 2011 +0530 WAL log 
the barrier creation activity on the local coordinator. Also fix some of the bugs in the recovery code. This code was not tested previously and there were some changes after the 9.0 merge diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 2a465e3..6800e68 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -4371,6 +4371,14 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, xlogfname, recoveryStopAfter ? "after" : "before", timestamptz_to_str(recoveryStopTime)); + else if (recoveryTarget == RECOVERY_TARGET_BARRIER) + snprintf(buffer, sizeof(buffer), + "%s%u\t%s\t%s %s\n", + (srcfd < 0) ? "" : "\n", + parentTLI, + xlogfname, + recoveryStopAfter ? "after" : "before", + recoveryTargetBarrierId); else snprintf(buffer, sizeof(buffer), "%s%u\t%s\tno recovery target specified\n", @@ -5240,7 +5248,7 @@ readRecoveryCommandFile(void) #ifdef PGXC else if (strcmp(tok1, "recovery_barrier_id") == 0) { - recoveryTarget = true; + recoveryTarget = RECOVERY_TARGET_BARRIER; recoveryTargetBarrierId = pstrdup(tok2); } #endif @@ -5468,7 +5476,7 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) { bool stopsHere; #ifdef PGXC - bool stopsAtThisBarrier; + bool stopsAtThisBarrier = false; char *recordBarrierId; #endif uint8 record_info; @@ -5482,25 +5490,34 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) if (record->xl_rmid != RM_XACT_ID) #endif return false; + record_info = record->xl_info & ~XLR_INFO_MASK; - if (record_info == XLOG_XACT_COMMIT) + if (record->xl_rmid == RM_XACT_ID) { - xl_xact_commit *recordXactCommitData; + if (record_info == XLOG_XACT_COMMIT) + { + xl_xact_commit *recordXactCommitData; - recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record); - recordXtime = recordXactCommitData->xact_time; - } - else if (record_info == XLOG_XACT_ABORT) - { - xl_xact_abort *recordXactAbortData; + recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record); + recordXtime = recordXactCommitData->xact_time; + } + else if (record_info == XLOG_XACT_ABORT) + { + xl_xact_abort *recordXactAbortData; - recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record); - recordXtime = recordXactAbortData->xact_time; + recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record); + 
recordXtime = recordXactCommitData->xact_time; + } + else if (record_info == XLOG_XACT_ABORT) + { + xl_xact_abort *recordXactAbortData; - recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record); - recordXtime = recordXactAbortData->xact_time; + recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record); + recordXtime = recordXactAbortData->xact_time; + } } #ifdef PGXC - else if (record_info == XLOG_BARRIER_CREATE) + else if (record->xl_rmid == RM_BARRIER_ID) { - recordBarrierId = (char *) XLogRecGetData(record); + if (record_info == XLOG_BARRIER_CREATE) + { + recordBarrierId = (char *) XLogRecGetData(record); + ereport(DEBUG2, + (errmsg("processing barrier xlog record for %s", recordBarrierId))); + } } #endif else @@ -5529,8 +5546,14 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) *includeThis = recoveryTargetInclusive; } #ifdef PGXC - else if (recoveryTargetBarrierId) + else if (recoveryTarget == RECOVERY_TARGET_BARRIER) { + if ((record->xl_rmid != RM_BARRIER_ID) || + (record_info != XLOG_BARRIER_CREATE)) + return false; + + ereport(DEBUG2, + (errmsg("checking if barrier record matches the target barrier"))); if (strcmp(recoveryTargetBarrierId, recordBarrierId) == 0) stopsAtThisBarrier = true; } @@ -5858,6 +5881,10 @@ StartupXLOG(void) ereport(LOG, (errmsg("starting point-in-time recovery to %s", timestamptz_to_str(recoveryTargetTime)))); + else if (recoveryTarget == RECOVERY_TARGET_BARRIER) + ereport(LOG, + (errmsg("starting point-in-time recovery to barrier %s", + (recoveryTargetBarrierId)))); else ereport(LOG, (errmsg("starting archive recovery"))); diff --git a/src/backend/pgxc/barrier/barrier.c b/src/backend/pgxc/barrier/barrier.c index 3e1d7cc..1b44f36 100644 --- a/src/backend/pgxc/barrier/barrier.c +++ b/src/backend/pgxc/barrier/barrier.c @@ -414,6 +414,18 @@ ExecuteBarrier(const char *id) /* * Also WAL log the BARRIER locally and flush the WAL buffers to disk */ + { + XLogRecData rdata[1]; + XLogRecPtr recptr; + + rdata[0].data = 
(char *) id; + rdata[0].len = strlen(id) + 1; + rdata[0].buffer = InvalidBuffer; + rdata[0].next = NULL; + + recptr = XLogInsert(RM_BARRIER_ID, XLOG_BARRIER_CREATE, rdata); + XLogFlush(recptr); + } } /* diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 27e7f40..f276a1f 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -184,7 +184,8 @@ typedef enum { RECOVERY_TARGET_UNSET, RECOVERY_TARGET_XID, - RECOVERY_TARGET_TIME + RECOVERY_TARGET_TIME, + RECOVERY_TARGET_BARRIER } RecoveryTargetType; extern XLogRecPtr XactLastRecEnd; commit 9db03183fe491e60dda1a6a5b36b44c55149e077 Author: Pavan Deolasee <pav...@gm...> Date: Tue Apr 26 19:54:09 2011 +0530 Rearrange the 2PC commit code so that we can commit the local transaction after releasing the barrier lock. diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 5ee876d..441a625 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -2035,7 +2035,7 @@ finish: * This avoid to have any additional interaction with GTM when making a 2PC transaction. */ void -PGXCNodeCommitPrepared(char *gid, bool isTopLevel) +PGXCNodeCommitPrepared(char *gid) { int res = 0; int res_gtm = 0; @@ -2136,17 +2136,11 @@ finish: * If remote connection is a Coordinator type, the commit prepared has to be done locally * if and only if the Coordinator number was in the node list received from GTM. */ - if (operation_local || IsConnFromCoord()) - { - PreventTransactionChain(isTopLevel, "COMMIT PREPARED"); + if (operation_local) FinishPreparedTransaction(gid, true); - } - /* - * Release the barrier lock now so that pending barriers can get moving - */ LWLockRelease(BarrierLock); - return; + return; } /* @@ -2191,9 +2185,11 @@ finish: /* * Rollback prepared transaction on Datanodes involved in the current transaction + * + * Return whether or not a local operation required. 
*/ -void -PGXCNodeRollbackPrepared(char *gid, bool isTopLevel) +bool +PGXCNodeRollbackPrepared(char *gid) { int res = 0; int res_gtm = 0; @@ -2273,17 +2269,7 @@ finish: (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Could not rollback prepared transaction on Datanodes"))); - /* - * Local coordinator rollbacks if involved in PREPARE - * If remote connection is a Coordinator type, the commit prepared has to be done locally also. - * This works for both Datanodes and Coordinators. - */ - if (operation_local || IsConnFromCoord()) - { - PreventTransactionChain(isTopLevel, "ROLLBACK PREPARED"); - FinishPreparedTransaction(gid, false); - } - return; + return operation_local; } diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index bc6c630..d12e81b 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -59,6 +59,7 @@ #ifdef PGXC #include "pgxc/barrier.h" +#include "pgxc/execRemote.h" #include "pgxc/locator.h" #include "pgxc/pgxc.h" #include "pgxc/planner.h" @@ -479,32 +480,58 @@ standard_ProcessUtility(Node *parsetree, break; case TRANS_STMT_COMMIT_PREPARED: + PreventTransactionChain(isTopLevel, "COMMIT PREPARED"); + PreventCommandDuringRecovery("COMMIT PREPARED"); #ifdef PGXC /* * If a COMMIT PREPARED message is received from another Coordinator, * Don't send it down to Datanodes. + * + * XXX We call FinishPreparedTransaction inside + * PGXCNodeCommitPrepared if we are doing a local + * operation. This is convinient because we want to + * hold on to the BarrierLock until local transaction + * is committed too. + * */ if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) - PGXCNodeCommitPrepared(stmt->gid, isTopLevel); -#else - PreventTransactionChain(isTopLevel, "COMMIT PREPARED"); - PreventCommandDuringRecovery("COMMIT PREPARED"); + PGXCNodeCommitPrepared(stmt->gid); + else if (IsConnFromCoord()) + { + /* + * A local Coordinator always commits if involved in Prepare. 
+ * 2PC file is created and flushed if a DDL has been involved in the transaction. + * If remote connection is a Coordinator type, the commit prepared has to be done locally + * if and only if the Coordinator number was in the node list received from GTM. + */ +#endif FinishPreparedTransaction(stmt->gid, true); +#ifdef PGXC + } #endif break; case TRANS_STMT_ROLLBACK_PREPARED: + PreventTransactionChain(isTopLevel, "ROLLBACK PREPARED"); + PreventCommandDuringRecovery("ROLLBACK PREPARED"); #ifdef PGXC /* * If a ROLLBACK PREPARED message is received from another Coordinator, * Don't send it down to Datanodes. */ if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) - PGXCNodeRollbackPrepared(stmt->gid, isTopLevel); -#else - PreventTransactionChain(isTopLevel, "ROLLBACK PREPARED"); - PreventCommandDuringRecovery("ROLLBACK PREPARED"); - FinishPreparedTransaction(gid, false); + operation_local = PGXCNodeRollbackPrepared(stmt->gid); + /* + * Local coordinator rollbacks if involved in PREPARE + * If remote connection is a Coordinator type, the commit prepared has to be done locally also. + * This works for both Datanodes and Coordinators. 
+ */ + if (operation_local || IsConnFromCoord()) + { +#endif + FinishPreparedTransaction(stmt->gid, false); +#ifdef PGXC + } #endif break; diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 9765632..983a126 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -128,8 +128,8 @@ extern void PGXCNodeSetBeginQuery(char *query_string); extern void PGXCNodeCommit(bool bReleaseHandles); extern int PGXCNodeRollback(void); extern bool PGXCNodePrepare(char *gid); -extern void PGXCNodeRollbackPrepared(char *gid, bool isTopLevel); -extern void PGXCNodeCommitPrepared(char *gid, bool isTopLevel); +extern bool PGXCNodeRollbackPrepared(char *gid); +extern void PGXCNodeCommitPrepared(char *gid); extern bool PGXCNodeIsImplicit2PC(bool *prepare_local_coord); extern int PGXCNodeImplicitPrepare(GlobalTransactionId prepare_xid, char *gid); extern void PGXCNodeImplicitCommitPrepared(GlobalTransactionId prepare_xid, commit 3cbf503a660e19f5c48c57d3ecd4a746a468cd68 Author: Pavan Deolasee <pav...@gm...> Date: Mon Apr 25 17:22:01 2011 +0530 Merge branch 'PGXC-master' into pgxc-barrier Conflicts: src/backend/access/transam/xlog.c src/backend/parser/gram.y src/backend/pgxc/pool/execRemote.c src/backend/tcop/utility.c diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 368cd69..2a465e3 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -5237,6 +5237,13 @@ readRecoveryCommandFile(void) ereport(DEBUG2, (errmsg("recovery_target_inclusive = %s", tok2))); } +#ifdef PGXC + else if (strcmp(tok1, "recovery_barrier_id") == 0) + { + recoveryTarget = true; + recoveryTargetBarrierId = pstrdup(tok2); + } +#endif else if (strcmp(tok1, "standby_mode") == 0) { if (!parse_bool(tok2, &StandbyMode)) diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 7ec9491..57a5c2b 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -472,7 +472,8 @@ 
static TypeName *TableFuncTypeName(List *columns); */ /* ordinary key words in alphabetical order */ -/* PGXC - added REPLICATION, DISTRIBUTE, MODULO, BARRIER and HASH */ +/* PGXC - added DISTRIBUTE, DIRECT, HASH, REPLICATION, ROUND ROBIN, + * COORDINATOR, CLEAN, MODULO, NODE, BARRIER */ %token <keyword> ABORT_P ABSOLUTE_P ACCESS ACTION ADD_P ADMIN AFTER AGGREGATE ALL ALSO ALTER ALWAYS ANALYSE ANALYZE AND ANY ARRAY AS ASC ASSERTION ASSIGNMENT ASYMMETRIC AT AUTHORIZATION @@ -11022,7 +11023,8 @@ ColLabel: IDENT { $$ = $1; } /* "Unreserved" keywords --- available for use as any kind of name. */ -/* PGXC - added DISTRIBUTE, HASH, REPLICATION, MODULO, BARRIER */ +/* PGXC - added DISTRIBUTE, DIRECT, HASH, REPLICATION, ROUND ROBIN, + * COORDINATOR, CLEAN, MODULO, NODE, BARRIER */ unreserved_keyword: ABORT_P | ABSOLUTE_P commit 6bbdc5b5befa3ef1f6fbb7a5548b8aa7891873d6 Author: Pavan Deolasee <pav...@gm...> Date: Mon Apr 18 13:41:47 2011 +0530 Add synchrnization at the commit time diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index c297003..5ee876d 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -17,6 +17,7 @@ #include <time.h> #include "postgres.h" +#include "access/twophase.h" #include "access/gtm.h" #include "access/xact.h" #include "catalog/pg_type.h" @@ -1924,9 +1925,23 @@ PGXCNodeImplicitCommitPrepared(GlobalTransactionId prepare_xid, goto finish; } + /* + * Barrier: + * + * We should acquire the BarrierLock in SHARE mode here to ensure that + * there are no in-progress barrier at this point. This mechanism would + * work as long as LWLock mechanism does not starve a EXCLUSIVE lock + * requesster + */ + LWLockAcquire(BarrierLock, LW_SHARED); res = pgxc_node_implicit_commit_prepared(prepare_xid, commit_xid, pgxc_connections, gid, is_commit); + /* + * Release the BarrierLock. 
+ */ + LWLockRelease(BarrierLock); + finish: /* Clear nodes, signals are clear */ if (!autocommit) @@ -2019,8 +2034,8 @@ finish: * or not but send the message to all of them. * This avoid to have any additional interaction with GTM when making a 2PC transaction. */ -bool -PGXCNodeCommitPrepared(char *gid) +void +PGXCNodeCommitPrepared(char *gid, bool isTopLevel) { int res = 0; int res_gtm = 0; @@ -2070,7 +2085,15 @@ PGXCNodeCommitPrepared(char *gid) /* * Commit here the prepared transaction to all Datanodes and Coordinators * If necessary, local Coordinator Commit is performed after this DataNodeCommitPrepared. + * + * BARRIER: + * + * Take the BarrierLock in SHARE mode to synchronize on in-progress + * barriers. We should hold on to the lock until the local prepared + * transaction is also committed */ + LWLockAcquire(BarrierLock, LW_SHARED); + res = pgxc_node_commit_prepared(gxid, prepared_gxid, pgxc_handles, gid); finish: @@ -2096,6 +2119,7 @@ finish: free(coordinators); pfree_pgxc_all_handles(pgxc_handles); + if (res_gtm < 0) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), @@ -2106,7 +2130,23 @@ finish: (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Could not commit prepared transaction on data nodes"))); - return operation_local; + /* + * A local Coordinator always commits if involved in Prepare. + * 2PC file is created and flushed if a DDL has been involved in the transaction. + * If remote connection is a Coordinator type, the commit prepared has to be done locally + * if and only if the Coordinator number was in the node list received from GTM. 
+ */ + if (operation_local || IsConnFromCoord()) + { + PreventTransactionChain(isTopLevel, "COMMIT PREPARED"); + FinishPreparedTransaction(gid, true); + } + + /* + * Release the barrier lock now so that pending barriers can get moving + */ + LWLockRelease(BarrierLock); + return; } /* @@ -2151,11 +2191,9 @@ finish: /* * Rollback prepared transaction on Datanodes involved in the current transaction - * - * Return whether or not a local operation required. */ -bool -PGXCNodeRollbackPrepared(char *gid) +void +PGXCNodeRollbackPrepared(char *gid, bool isTopLevel) { int res = 0; int res_gtm = 0; @@ -2235,7 +2273,17 @@ finish: (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Could not rollback prepared transaction on Datanodes"))); - return operation_local; + /* + * Local coordinator rollbacks if involved in PREPARE + * If remote connection is a Coordinator type, the commit prepared has to be done locally also. + * This works for both Datanodes and Coordinators. + */ + if (operation_local || IsConnFromCoord()) + { + PreventTransactionChain(isTopLevel, "ROLLBACK PREPARED"); + FinishPreparedTransaction(gid, false); + } + return; } diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index d8f697e..bc6c630 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -485,23 +485,11 @@ standard_ProcessUtility(Node *parsetree, * Don't send it down to Datanodes. */ if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) - operation_local = PGXCNodeCommitPrepared(stmt->gid); -#endif + PGXCNodeCommitPrepared(stmt->gid, isTopLevel); +#else PreventTransactionChain(isTopLevel, "COMMIT PREPARED"); PreventCommandDuringRecovery("COMMIT PREPARED"); -#ifdef PGXC - /* - * A local Coordinator always commits if involved in Prepare. - * 2PC file is created and flushed if a DDL has been involved in the transaction. 
- * If remote connection is a Coordinator type, the commit prepared has to be done locally - * if and only if the Coordinator number was in the node list received from GTM. - */ - if (operation_local || IsConnFromCoord()) - { -#endif FinishPreparedTransaction(stmt->gid, true); -#ifdef PGXC - } #endif break; @@ -512,22 +500,11 @@ standard_ProcessUtility(Node *parsetree, * Don't send it down to Datanodes. */ if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) - operation_local = PGXCNodeRollbackPrepared(stmt->gid); -#endif + PGXCNodeRollbackPrepared(stmt->gid, isTopLevel); +#else PreventTransactionChain(isTopLevel, "ROLLBACK PREPARED"); PreventCommandDuringRecovery("ROLLBACK PREPARED"); -#ifdef PGXC - /* - * Local coordinator rollbacks if involved in PREPARE - * If remote connection is a Coordinator type, the commit prepared has to be done locally also. - * This works for both Datanodes and Coordinators. - */ - if (operation_local || IsConnFromCoord()) - { -#endif - FinishPreparedTransaction(stmt->gid, false); -#ifdef PGXC - } + FinishPreparedTransaction(gid, false); #endif break; diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 076100b..9765632 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -128,8 +128,8 @@ extern void PGXCNodeSetBeginQuery(char *query_string); extern void PGXCNodeCommit(bool bReleaseHandles); extern int PGXCNodeRollback(void); extern bool PGXCNodePrepare(char *gid); -extern bool PGXCNodeRollbackPrepared(char *gid); -extern bool PGXCNodeCommitPrepared(char *gid); +extern void PGXCNodeRollbackPrepared(char *gid, bool isTopLevel); +extern void PGXCNodeCommitPrepared(char *gid, bool isTopLevel); extern bool PGXCNodeIsImplicit2PC(bool *prepare_local_coord); extern int PGXCNodeImplicitPrepare(GlobalTransactionId prepare_xid, char *gid); extern void PGXCNodeImplicitCommitPrepared(GlobalTransactionId prepare_xid, commit 246072c6301bf3e38331ee49e4ff9bd4bd42b9a4 Author: Pavan Deolasee 
<pav...@gm...> Date: Tue Mar 8 16:45:12 2011 +0530 First cut implementation of BARRIER for PITR and global consistent recovery diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index 8038b25..d989a59 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -20,11 +20,13 @@ #include "commands/dbcommands.h" #include "commands/sequence.h" #include "commands/tablespace.h" +#ifdef PGXC +#include "pgxc/barrier.h" +#endif #include "storage/freespace.h" #include "storage/standby.h" #include "utils/relmapper.h" - const RmgrData RmgrTable[RM_MAX_ID + 1] = { {"XLOG", xlog_redo, xlog_desc, NULL, NULL, NULL}, {"Transaction", xact_redo, xact_desc, NULL, NULL, NULL}, @@ -42,4 +44,8 @@ const RmgrData RmgrTable[RM_MAX_ID + 1] = { {"Gin", gin_redo, gin_desc, gin_xlog_startup, gin_xlog_cleanup, gin_safe_restartpoint}, {"Gist", gist_redo, gist_desc, gist_xlog_startup, gist_xlog_cleanup, gist_safe_restartpoint}, {"Sequence", seq_redo, seq_desc, NULL, NULL, NULL} +#ifdef PGXC + , + {"Barrier", barrier_redo, barrier_desc, NULL, NULL, NULL} +#endif }; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 7fbccc5..368cd69 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -39,6 +39,7 @@ #include "funcapi.h" #include "libpq/pqsignal.h" #include "miscadmin.h" +#include "pgxc/barrier.h" #include "pgstat.h" #include "postmaster/bgwriter.h" #include "replication/walreceiver.h" @@ -184,6 +185,7 @@ static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET; static bool recoveryTargetInclusive = true; static TransactionId recoveryTargetXid; static TimestampTz recoveryTargetTime; +static char *recoveryTargetBarrierId; /* options taken from recovery.conf for XLOG streaming */ static bool StandbyMode = false; @@ -5258,6 +5260,13 @@ readRecoveryCommandFile(void) (errmsg("trigger_file = '%s'", TriggerFile))); } +#ifdef PGXC + else if (strcmp(tok1, 
"recovery_barrier_id") == 0) + { + recoveryTarget = true; + recoveryTargetBarrierId = pstrdup(tok2); + } +#endif else ereport(FATAL, (errmsg("unrecognized recovery parameter \"%s\"", @@ -5451,11 +5460,20 @@ static bool recoveryStopsHere(XLogRecord *record, bool *includeThis) { bool stopsHere; +#ifdef PGXC + bool stopsAtThisBarrier; + char *recordBarrierId; +#endif uint8 record_info; TimestampTz recordXtime; +#ifdef PGXC + /* We only consider stoppping at COMMIT, ABORT or BARRIER records */ + if ((record->xl_rmid != RM_XACT_ID) && (record->xl_rmid != RM_BARRIER_ID)) +#else /* We only consider stopping at COMMIT or ABORT records */ if (record->xl_rmid != RM_XACT_ID) +#endif return false; record_info = record->xl_info & ~XLR_INFO_MASK; if (record_info == XLOG_XACT_COMMIT) @@ -5472,6 +5490,12 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record); recordXtime = recordXactAbortData->xact_time; } +#ifdef PGXC + else if (record_info == XLOG_BARRIER_CREATE) + { + recordBarrierId = (char *) XLogRecGetData(record); + } +#endif else return false; @@ -5497,6 +5521,13 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) if (stopsHere) *includeThis = recoveryTargetInclusive; } +#ifdef PGXC + else if (recoveryTargetBarrierId) + { + if (strcmp(recoveryTargetBarrierId, recordBarrierId) == 0) + stopsAtThisBarrier = true; + } +#endif else { /* @@ -5548,6 +5579,17 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) if (recoveryStopAfter) SetLatestXTime(recordXtime); } +#ifdef PGXC + else if (stopsAtThisBarrier) + { + recoveryStopTime = recordXtime; + ereport(LOG, + (errmsg("recovery stopping at barrier %s, time %s", + recoveryTargetBarrierId, + timestamptz_to_str(recoveryStopTime)))); + return true; + } +#endif else SetLatestXTime(recordXtime); diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 2c7fee4..c9581e1 100644 --- a/src/backend/nodes/copyfuncs.c +++ 
b/src/backend/nodes/copyfuncs.c @@ -3726,6 +3726,18 @@ _copyValue(Value *from) return newnode; } +#ifdef PGXC +static BarrierStmt * +_copyBarrierStmt(BarrierStmt *from) +{ + BarrierStmt *newnode = makeNode(BarrierStmt); + + COPY_STRING_FIELD(id); + + return newnode; +} +#endif + /* * copyObject * @@ -4307,6 +4319,11 @@ copyObject(void *from) case T_CheckPointStmt: retval = (void *) makeNode(CheckPointStmt); break; +#ifdef PGXC + case T_BarrierStmt: + retval = _copyBarrierStmt(from); + break; +#endif case T_CreateSchemaStmt: retval = _copyCreateSchemaStmt(from); break; diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index c5b46bb..d1dad1c 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -2361,6 +2361,16 @@ _equalValue(Value *a, Value *b) return true; } +#ifdef PGXC + +static bool +_equalBarrierStmt(BarrierStmt *a, BarrierStmt *b) +{ + COMPARE_STRING_FIELD(id); + return true; +} +#endif + /* * equal * returns whether two nodes are equal @@ -2811,6 +2821,11 @@ equal(void *a, void *b) case T_CheckPointStmt: retval = true; break; +#ifdef PGXC + case T_BarrierStmt: + retval = _equalBarrierStmt(a, b); + break; +#endif case T_CreateSchemaStmt: retval = _equalCreateSchemaStmt(a, b); break; diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 8ac6002..7ec9491 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -216,6 +216,7 @@ static TypeName *TableFuncTypeName(List *columns); DeallocateStmt PrepareStmt ExecuteStmt DropOwnedStmt ReassignOwnedStmt AlterTSConfigurationStmt AlterTSDictionaryStmt + BarrierStmt %type <node> select_no_parens select_with_parens select_clause simple_select values_clause @@ -445,6 +446,7 @@ static TypeName *TableFuncTypeName(List *columns); opt_frame_clause frame_extent frame_bound %type <str> opt_existing_window_name /* PGXC_BEGIN */ +%type <str> opt_barrier_id %type <distby> OptDistributeBy /* PGXC_END */ @@ -470,12 +472,12 @@ static TypeName 
*TableFuncTypeName(List *columns); */ /* ordinary key words in alphabetical order */ -/* PGXC - added REPLICATION, DISTRIBUTE, MODULO and HASH */ +/* PGXC - added REPLICATION, DISTRIBUTE, MODULO, BARRIER and HASH */ %token <keyword> ABORT_P ABSOLUTE_P ACCESS ACTION ADD_P ADMIN AFTER AGGREGATE ALL ALSO ALTER ALWAYS ANALYSE ANALYZE AND ANY ARRAY AS ASC ASSERTION ASSIGNMENT ASYMMETRIC AT AUTHORIZATION - BACKWARD BEFORE BEGIN_P BETWEEN BIGINT BINARY BIT + BACKWARD BARRIER BEFORE BEGIN_P BETWEEN BIGINT BINARY BIT BOOLEAN_P BOTH BY CACHE CALLED CASCADE CASCADED CASE CAST CATALOG_P CHAIN CHAR_P @@ -683,6 +685,7 @@ stmt : | AlterUserSetStmt | AlterUserStmt | AnalyzeStmt + | BarrierStmt | CheckPointStmt | CleanConnStmt | ClosePortalStmt @@ -6985,6 +6988,28 @@ opt_name_list: ; +/* PGXC_BEGIN */ +BarrierStmt: CREATE BARRIER opt_barrier_id + { + BarrierStmt *n = makeNode(BarrierStmt); + n->id = $3; + $$ = (Node *)n; + } + ; + +opt_barrier_id: + Sconst + { + $$ = pstrdup($1); + } + | /* EMPTY */ + { + $$ = NULL; + } + ; + +/* PGXC_END */ + /***************************************************************************** * * QUERY: @@ -10997,7 +11022,7 @@ ColLabel: IDENT { $$ = $1; } /* "Unreserved" keywords --- available for use as any kind of name. */ -/* PGXC - added DISTRIBUTE, HASH, REPLICATION, MODULO */ +/* PGXC - added DISTRIBUTE, HASH, REPLICATION, MODULO, BARRIER */ unreserved_keyword: ABORT_P | ABSOLUTE_P @@ -11014,6 +11039,9 @@ unreserved_keyword: | ASSIGNMENT | AT | BACKWARD +/* PGXC_BEGIN */ + | BARRIER +/* PGXC_END */ | BEFORE | BEGIN_P | BY diff --git a/src/backend/pgxc/Makefile b/src/backend/pgxc/Makefile index eecac20..ad6bb64 100644 --- a/src/backend/pgxc/Makefile +++ b/src/backend/pgxc/Makefile @@ -11,6 +11,6 @@ subdir = src/backend/pgxc top_builddir = ../../.. 
include $(top_builddir)/src/Makefile.global -SUBDIRS = locator plan pool +SUBDIRS = locator plan pool barrier include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/pgxc/barrier/Makefile b/src/backend/pgxc/barrier/Makefile new file mode 100644 index 0000000..d80bbec --- /dev/null +++ b/src/backend/pgxc/barrier/Makefile @@ -0,0 +1,19 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for pool +# +# Portions Copyright (c) 2010-2011 Nippon Telegraph and Telephone Corporation +# +# IDENTIFICATION +# $PostgreSQL$ +# +#------------------------------------------------------------------------- + +subdir = src/backend/pgxc/barrier +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = barrier.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/pgxc/barrier/barrier.c b/src/backend/pgxc/barrier/barrier.c new file mode 100644 index 0000000..3e1d7cc --- /dev/null +++ b/src/backend/pgxc/barrier/barrier.c @@ -0,0 +1,493 @@ +/*------------------------------------------------------------------------- + * + * barrier.c + * + * Barrier handling for PITR + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 2010-2011 Nippon Telegraph and Telephone Corporation + * + * IDENTIFICATION + * $$ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "libpq/libpq.h" +#include "libpq/pqformat.h" +#include "pgxc/barrier.h" +#include "pgxc/execRemote.h" +#include "pgxc/locator.h" +#include "pgxc/pgxc.h" +#include "pgxc/pgxcnode.h" +#include "storage/lwlock.h" +#include "tcop/dest.h" + +static const char *generate_barrier_id(const char *id); +static PGXCNodeAllHandles *PrepareBarrier(const char *id); +static void ExecuteBarrier(const char *id); +static void EndBarrier(PGXCNodeAllHandles *handles, const char *id); + +extern void 
ProcessCreateBarrierPrepare(const char *id); +extern void ProcessCreateBarrierEnd(const char *id); +extern void ProcessCreateBarrierExecute(const char *id); + +/* + * Prepare ourselves for an incoming BARRIER. We must disable all new 2PC + * commits and let the ongoing commits finish. We then remember the + * barrier id (so that it can be matched with the final END message) and + * tell the driving coordinator to proceed with the next step. + * + * A simple way to implement this is to grab a lock in an exclusive mode + * while all other backends starting a 2PC will grab the lock in shared + * mode. So as long as we hold the exclusive lock, no other backend can start a + * new 2PC and there can not be any 2PC in-progress. This technique would + * rely on the assumption that an exclusive lock requester is not starved by + * share lock requesters. + * + * Note: To ensure that the 2PC are not blocked for a long time, we should + * set a timeout. The lock should be released after the timeout and the + * barrier should be canceled. + */ +void +ProcessCreateBarrierPrepare(const char *id) +{ + StringInfoData buf; + + if (!IS_PGXC_COORDINATOR || !IsConnFromCoord()) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("The CREATE BARRIER PREPARE message is expected to " + "arrive at a coordinator from another coordinator"))); + + LWLockAcquire(BarrierLock, LW_EXCLUSIVE); + + pq_beginmessage(&buf, 'b'); + pq_sendstring(&buf, id); + pq_endmessage(&buf); + pq_flush(); + + /* + * TODO Start a timer to terminate the pending barrier after a specified + * timeout + */ +} + +/* + * Mark the completion of an on-going barrier. 
We must have remembered the + * barrier ID when we received the CREATE BARRIER PREPARE command + */ +void +ProcessCreateBarrierEnd(const char *id) +{ + StringInfoData buf; + + if (!IS_PGXC_COORDINATOR || !IsConnFromCoord()) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("The CREATE BARRIER END message is expected to " + "arrive at a coordinator from another coordinator"))); + + LWLockRelease(BarrierLock); + + pq_beginmessage(&buf, 'b'); + pq_sendstring(&buf, id); + pq_endmessage(&buf); + pq_flush(); + + /* + * TODO Stop the timer + */ +} + +/* + * Execute the CREATE BARRIER command. Write a BARRIER WAL record and flush the + * WAL buffers to disk before returning to the caller. Writing the WAL record + * does not guarantee successful completion of the barrier command. + */ +void +ProcessCreateBarrierExecute(const char *id) +{ + StringInfoData buf; + + if (!IsConnFromCoord()) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("The CREATE BARRIER EXECUTE message is expected to " + "arrive from a coordinator"))); + { + XLogRecData rdata[1]; + XLogRecPtr recptr; + + rdata[0].data = (char *) id; + rdata[0].len = strlen(id) + 1; + rdata[0].buffer = InvalidBuffer; + rdata[0].next = NULL; + + recptr = XLogInsert(RM_BARRIER_ID, XLOG_BARRIER_CREATE, rdata); + XLogFlush(recptr); + } + + pq_beginmessage(&buf, 'b'); + pq_sendstring(&buf, id); + pq_endmessage(&buf); + pq_flush(); +} + +static const char * +generate_barrier_id(const char *id) +{ + /* + * TODO If the caller passed a NULL value, generate an id which is + * guaranteed to be unique across the cluster. We can use a combination of + * the coordinator node id and a timestamp. This may not be complete if we + * support changing coordinator ids without initdb or the system clocks are + * modified. + * + * Another option would be to let the GTM issue globally unique barrier + * IDs. For the time being, we leave it to the user to come up with a + * unique identifier + */ + return id ? 
id : pstrdup("dummy_barrier_id"); +} + +static PGXCNodeAllHandles * +SendBarrierPrepareRequest(List *coords, const char *id) +{ + PGXCNodeAllHandles *coord_handles; + int conn; + int msglen; + int barrier_idlen; + + coord_handles = get_handles(NIL, coords, true); + + for (conn = 0; conn < coord_handles->co_conn_count; conn++) + { + PGXCNodeHandle *handle = coord_handles->coord_handles[conn]; + + /* Invalid connection state, return error */ + if (handle->state != DN_CONNECTION_STATE_IDLE) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send CREATE BARRIER PREPARE request " + "to the node"))); + + barrier_idlen = strlen(id) + 1; + + msglen = 4; /* for the length itself */ + msglen += barrier_idlen; + msglen += 1; /* for barrier command itself */ + + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Out of memory"))); + } + + handle->outBuffer[handle->outEnd++] = 'b'; + msglen = htonl(msglen); + memcpy(handle->outBuffer + handle->outEnd, &msglen, 4); + handle->outEnd += 4; + + handle->outBuffer[handle->outEnd++] = CREATE_BARRIER_PREPARE; + + memcpy(handle->outBuffer + handle->outEnd, id, barrier_idlen); + handle->outEnd += barrier_idlen; + + handle->state = DN_CONNECTION_STATE_QUERY; + + pgxc_node_flush(handle); + + /* FIXME Use the right context */ + handle->barrier_id = strdup(id); + } + + return coord_handles; +} + +static void +CheckBarrierCommandStatus(PGXCNodeAllHandles *conn_handles, const char *id, + const char *command) +{ + int conn; + int count = conn_handles->co_conn_count + conn_handles->dn_conn_count; + + elog(DEBUG2, "Check CREATE BARRIER <%s> %s command status", id, command); + + for (conn = 0; conn < count; conn++) + { + PGXCNodeHandle *handle; + + if (conn < conn_handles->co_conn_count) + handle = conn_handles->coord_handles[conn]; + else + handle = conn_handles->datanode_handles[conn - 
conn_handles->co_conn_count]; + + if (pgxc_node_receive(1, &handle, NULL)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to receive response from the remote side"))); + + if (handle_response(handle, NULL) != RESPONSE_BARRIER_OK) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("CREATE BARRIER PREPARE command failed " + "with error %s", handle->error))); + } + + elog(DEBUG2, "Successfully completed CREATE BARRIER <%s> %s command on " + "all nodes", id, command); +} + +static void +SendBarrierEndRequest(PGXCNodeAllHandles *coord_handles, const char *id) +{ + int conn; + int msglen; + int barrier_idlen; + + elog(DEBUG2, "Sending CREATE BARRIER <%s> END command to all coordinators", id); + + for (conn = 0; conn < coord_handles->co_conn_count; conn++) + { + PGXCNodeHandle *handle = coord_handles->coord_handles[conn]; + + /* Invalid connection state, return error */ + if (handle->state != DN_CONNECTION_STATE_IDLE) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send CREATE BARRIER PREPARE request " + "to the node"))); + + barrier_idlen = strlen(id) + 1; + + msglen = 4; /* for the length itself */ + msglen += barrier_idlen; + msglen += 1; /* for barrier command itself */ + + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Out of memory"))); + } + + handle->outBuffer[handle->outEnd++] = 'b'; + msglen = htonl(msglen); + memcpy(handle->outBuffer + handle->outEnd, &msglen, 4); + handle->outEnd += 4; + + handle->outBuffer[handle->outEnd++] = CREATE_BARRIER_END; + + memcpy(handle->outBuffer + handle->outEnd, id, barrier_idlen); + handle->outEnd += barrier_idlen; + + handle->state = DN_CONNECTION_STATE_QUERY; + pgxc_node_flush(handle); + + /* FIXME Use the right context */ + handle->barrier_id = strdup(id); + } + +} + +/* + * Prepare all coordinators for barrier. 
During this step all the coordinators + * are informed to suspend any new 2PC transactions. The coordinators should + * disable new 2PC transactions and then wait for the existing transactions to + * complete. Once all "in-flight" 2PC transactions are over, the coordinators + * respond back. + * + * That completes the first step in barrier generation + * + * Any errors will be reported via ereport. + */ +static PGXCNodeAllHandles * +PrepareBarrier(const char *id) +{ + PGXCNodeAllHandles *coord_handles; + + elog(DEBUG2, "Preparing coordinators for BARRIER"); + + /* + * Send a CREATE BARRIER PREPARE message to all the coordinators. We should + * send an asynchronous request so that we can disable local commits and + * then wait for the remote coordinators to finish the work + */ + coord_handles = SendBarrierPrepareRequest(GetAllCoordNodes(), id); + + /* + * Disable local commits + */ + LWLockAcquire(BarrierLock, LW_EXCLUSIVE); + + elog(DEBUG2, "Disabled 2PC commits origniating at the diriving coordinator"); + + /* + * TODO Start a timer to cancel the barrier request in case of a timeout + */ + + /* + * Local in-flight commits are now over. Check status of the remote + * coordinators + */ + CheckBarrierCommandStatus(coord_handles, id, "PREPARE"); + + return coord_handles; +} + +/* + * Execute the barrier command on all the components, including data nodes and + * coordinators. 
+ */ +static void +ExecuteBarrier(const char *id) +{ + List *barrierDataNodeList = GetAllDataNodes(); + List *barrierCoordList = GetAllCoordNodes(); + PGXCNodeAllHandles *conn_handles; + int conn; + int msglen; + int barrier_idlen; + + conn_handles = get_handles(barrierDataNodeList, barrierCoordList, false); + + elog(DEBUG2, "Sending CREATE BARRIER <%s> EXECUTE message to " + "data nodes and coordinator", id); + /* + * Send a CREATE BARRIER request to all the data nodes and the coordinators + */ + for (conn = 0; conn < conn_handles->co_conn_count + conn_handles->dn_conn_count; conn++) + { + PGXCNodeHandle *handle; + + if (conn < conn_handles->co_conn_count) + handle = conn_handles->coord_handles[conn]; + else + handle = conn_handles->datanode_handles[conn - conn_handles->co_conn_count]; + + /* Invalid connection state, return error */ + if (handle->state != DN_CONNECTION_STATE_IDLE) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send CREATE BARRIER PREPARE request " + "to the node"))); + + barrier_idlen = strlen(id) + 1; + + msglen = 4; /* for the length itself */ + msglen += barrier_idlen; + msglen += 1; /* for barrier command itself */ + + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Out of memory"))); + } + + handle->outBuffer[handle->outEnd++] = 'b'; + msglen = htonl(msglen); + memcpy(handle->outBuffer + handle->outEnd, &msglen, 4); + handle->outEnd += 4; + + handle->outBuffer[handle->outEnd++] = CREATE_BARRIER_EXECUTE; + + memcpy(handle->outBuffer + handle->outEnd, id, barrier_idlen); + handle->outEnd += barrier_idlen; + + handle->state = DN_CONNECTION_STATE_QUERY; + pgxc_node_flush(handle); + + /* FIXME Use the right context */ + handle->barrier_id = strdup(id); + } + + CheckBarrierCommandStatus(conn_handles, id, "EXECUTE"); + + /* + * Also WAL log the BARRIER locally and flush the WAL buffers to disk + */ +} + +/* 
+ * Resume 2PC commits on the local as well as remote coordinators. + */ +static void +EndBarrier(PGXCNodeAllHandles *prepared_handles, const char *id) +{ + /* Resume 2PC locally */ + LWLockRelease(BarrierLock); + + SendBarrierEndRequest(prepared_handles, id); + + CheckBarrierCommandStatus(prepared_handles, id, "END"); +} + +void +RequestBarrier(const char *id, char *completionTag) +{ + PGXCNodeAllHandles *prepared_handles; + const char *barrier_id; + + elog(DEBUG2, "CREATE BARRIER request received"); + /* + * Ensure that we are a coordinator and the request is not from another + * coordinator + */ + if (!IS_PGXC_COORDINATOR) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("CREATE BARRIER command must be sent to a coordinator"))); + + if (IsConnFromCoord()) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("CREATE BARRIER command is not expected from another coordinator"))); + + /* + * Get a barrier id if the user has not supplied it + */ + barrier_id = generate_barrier_id(id); + + elog(DEBUG2, "CREATE BARRIER <%s>", barrier_id); + + /* + * Step One. Prepare all coordinators for upcoming barrier request + */ + prepared_handles = PrepareBarrier(barrier_id); + + /* + * Step two. Issue BARRIER command to all involved components, including + * coordinators and data nodes + */ + ExecuteBarrier(barrier_id); + + /* + * Step three. 
Inform coordinators about a successfully completed barrier + */ + EndBarrier(prepared_handles, barrier_id); + + if (completionTag) + snprintf(completionTag, COMPLETION_TAG_BUFSIZE, "BARRIER %s", barrier_id); +} + +void +barrier_redo(XLogRecPtr lsn, XLogRecord *record) +{ + /* Nothing to do */ + return; +} + +void +barrier_desc(StringInfo buf, uint8 xl_info, char *rec) +{ + Assert(xl_info == XLOG_BARRIER_CREATE); + appendStringInfo(buf, "BARRIER %s", rec); +} diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 03482a0..c297003 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -1389,6 +1389,7 @@ pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections, * RESPONSE_TUPLEDESC - got tuple description * RESPONSE_DATAROW - got data row * RESPONSE_COPY - got copy response + * RESPONSE_BARRIER_OK - barrier command completed successfully */ int handle_response(PGXCNodeHandle * conn, RemoteQueryState *combiner) @@ -1500,6 +1501,16 @@ handle_response(PGXCNodeHandle * conn, RemoteQueryState *combiner) #endif return result; } + +#ifdef PGXC + case 'b': + { + Assert((strncmp(msg, conn->barrier_id, msg_len) == 0)); + conn->state = DN_CONNECTION_STATE_IDLE; + return RESPONSE_BARRIER_OK; + } +#endif + case 'I': /* EmptyQuery */ default: /* sync lost? 
*/ diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 71b1398..c54bd60 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -80,6 +80,7 @@ #include "access/gtm.h" /* PGXC_COORD */ #include "pgxc/execRemote.h" +#include "pgxc/barrier.h" #include "pgxc/planner.h" #include "pgxc/pgxcnode.h" #include "commands/copy.h" @@ -447,6 +448,7 @@ SocketBackend(StringInfo inBuf) case 'g': /* GXID */ case 's': /* Snapshot */ case 't': /* Timestamp */ + case 'b': /* Barrier */ break; #endif @@ -4290,6 +4292,37 @@ PostgresMain(int argc, char *argv[], const char *username) */ SetCurrentGTMDeltaTimestamp(timestamp); break; + + case 'b': /* barrier */ + { + int command; + char *id; + + command = pq_getmsgbyte(&input_message); + id = pq_getmsgstring(&input_message); + pq_... [truncated message content] |