From: Michael P. <mic...@us...> - 2010-08-23 08:15:17
Project "Postgres-XC". The tag, v0.9.2 has been deleted
       was  d7ca431066efe320107581186ab853b28fa5f7a7

-----------------------------------------------------------------------
d7ca431066efe320107581186ab853b28fa5f7a7 Support for cold synchronization of catalog table of coordinator.
-----------------------------------------------------------------------

hooks/post-receive
--
Postgres-XC
From: Michael P. <mic...@us...> - 2010-08-23 08:08:59
Project "Postgres-XC". The branch, master has been updated via 9894afcd6d20b47c303c49b8ed5141d2b7902237 (commit) from ba6f32f142cf8731ba29e5495e0f97f3b0455da0 (commit) - Log ----------------------------------------------------------------- commit 9894afcd6d20b47c303c49b8ed5141d2b7902237 Author: Michael P <mic...@us...> Date: Mon Aug 23 17:00:01 2010 +0900 Support for Global timestamp in Postgres-XC. When a transaction is begun on Coordinator, a transaction sending a BEGIN message to GTM receives back a timestamp with the usual GXID. This timestamp is calculated from the clock of GTM server. With that, nodes in the cluster can adjust their own timeline with GTM by calculating a delta value based on the GTM timestamp and their local clock. Like GXID and snapshot, a timestamp is also sent down to Datanodes in case so as to keep consistent timestamp values between coordinator and datanodes. This commit supports global timestamp values for now(), statement_timestamp, transaction_timestamp,current_date, current_time, current_timestamp, localtime, local_timestamp and now(). clock_timestamp and timeofday make their calculation based on the local server clock so they get their results from the local node where it is run. Their use could lead to inconsistencies if used in a transaction involving several Datanodes. diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index f9499c9..c7f3547 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -67,14 +67,14 @@ CloseGTM(void) } GlobalTransactionId -BeginTranGTM(void) +BeginTranGTM(GTM_Timestamp *timestamp) { GlobalTransactionId xid = InvalidGlobalTransactionId; CheckConnection(); // TODO Isolation level if (conn) - xid = begin_transaction(conn, GTM_ISOLATION_RC); + xid = begin_transaction(conn, GTM_ISOLATION_RC, timestamp); /* If something went wrong (timeout), try and reset GTM connection * and retry. This is safe at the beginning of a transaction. @@ -84,7 +84,7 @@ BeginTranGTM(void) CloseGTM(); InitGTM(); if (conn) - xid = begin_transaction(conn, GTM_ISOLATION_RC); + xid = begin_transaction(conn, GTM_ISOLATION_RC, timestamp); } return xid; } diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index f2a9d74..5176e85 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -75,11 +75,17 @@ GetForceXidFromGTM(void) * The new XID is also stored into MyProc before returning. */ TransactionId +#ifdef PGXC +GetNewTransactionId(bool isSubXact, bool *timestamp_received, GTM_Timestamp *timestamp) +#else GetNewTransactionId(bool isSubXact) +#endif { TransactionId xid; -#ifdef PGXC +#ifdef PGXC bool increment_xid = true; + + *timestamp_received = false; #endif /* @@ -102,8 +108,10 @@ GetNewTransactionId(bool isSubXact) * This will help with GTM connection issues- we will not * block all other processes. */ - xid = (TransactionId) BeginTranGTM(); + xid = (TransactionId) BeginTranGTM(timestamp); + *timestamp_received = true; } + #endif LWLockAcquire(XidGenLock, LW_EXCLUSIVE); @@ -144,18 +152,20 @@ GetNewTransactionId(bool isSubXact) * exclude it from other snapshots. 
*/ next_xid = (TransactionId) BeginTranAutovacuumGTM(); - } else { + } + else + { elog (DEBUG1, "Getting XID for autovacuum worker (analyze)"); /* try and get gxid directly from GTM */ - next_xid = (TransactionId) BeginTranGTM(); + next_xid = (TransactionId) BeginTranGTM(NULL); } } else if (GetForceXidFromGTM()) { elog (DEBUG1, "Force get XID from GTM"); /* try and get gxid directly from GTM */ - next_xid = (TransactionId) BeginTranGTM(); + next_xid = (TransactionId) BeginTranGTM(NULL); } - + if (TransactionIdIsValid(next_xid)) { xid = next_xid; diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 673aad1..8a946cc 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -208,6 +208,19 @@ static TimestampTz stmtStartTimestamp; static TimestampTz xactStopTimestamp; /* + * PGXC receives from GTM a timestamp value at the same time as a GXID + * This one is set as GTMxactStartTimestamp and is a return value of now(), current_transaction(). + * GTMxactStartTimestamp is also sent to each node with gxid and snapshot and delta is calculated locally. + * GTMdeltaTimestamp is used to calculate current_statement as its value can change + * during a transaction. Delta can have a different value through the nodes of the cluster + * but its uniqueness in the cluster is maintained thanks to the global value GTMxactStartTimestamp. + */ +#ifdef PGXC +static TimestampTz GTMxactStartTimestamp = 0; +static TimestampTz GTMdeltaTimestamp = 0; +#endif + +/* * GID to be used for preparing the current transaction. This is also * global to a whole transaction, so we don't keep it in the state stack. */ @@ -315,12 +328,28 @@ GetCurrentGlobalTransactionId(void) * * This will return the GXID of the specified transaction, * getting one from the GTM if it's not yet set. + * It also returns a timestamp value if a GXID has been taken from GTM */ static GlobalTransactionId GetGlobalTransactionId(TransactionState s) { + GTM_Timestamp gtm_timestamp; + bool received_tp; + + /* + * Here we receive timestamp at the same time as gxid. + */ if (!GlobalTransactionIdIsValid(s->globalTransactionId)) - s->globalTransactionId = (GlobalTransactionId) GetNewTransactionId(s->parent != NULL); + s->globalTransactionId = (GlobalTransactionId) GetNewTransactionId(s->parent != NULL, + &received_tp, + >m_timestamp); + + /* Set a timestamp value if and only if it has been received from GTM */ + if (received_tp) + { + GTMxactStartTimestamp = (TimestampTz) gtm_timestamp; + GTMdeltaTimestamp = GTMxactStartTimestamp - stmtStartTimestamp; + } return s->globalTransactionId; } @@ -473,8 +502,20 @@ AssignTransactionId(TransactionState s) s->transactionId, isSubXact ? "true" : "false"); } else -#endif + { + GTM_Timestamp gtm_timestamp; + bool received_tp; + + s->transactionId = GetNewTransactionId(isSubXact, &received_tp, >m_timestamp); + if (received_tp) + { + GTMxactStartTimestamp = (TimestampTz) gtm_timestamp; + GTMdeltaTimestamp = GTMxactStartTimestamp - stmtStartTimestamp; + } + } +#else s->transactionId = GetNewTransactionId(isSubXact); +#endif if (isSubXact) SubTransSetParent(s->transactionId, s->parent->transactionId); @@ -536,7 +577,15 @@ GetCurrentCommandId(bool used) TimestampTz GetCurrentTransactionStartTimestamp(void) { + /* + * In Postgres-XC, Transaction start timestamp is the value received + * from GTM along with GXID. 
+ */ +#ifdef PGXC + return GTMxactStartTimestamp; +#else return xactStartTimestamp; +#endif } /* @@ -545,7 +594,17 @@ GetCurrentTransactionStartTimestamp(void) TimestampTz GetCurrentStatementStartTimestamp(void) { + /* + * For Postgres-XC, Statement start timestamp is adjusted at each node + * (Coordinator and Datanode) with a difference value that is calculated + * based on the global timestamp value received from GTM and the local + * clock. This permits to follow the GTM timeline in the cluster. + */ +#ifdef PGXC + return stmtStartTimestamp + GTMdeltaTimestamp; +#else return stmtStartTimestamp; +#endif } /* @@ -557,11 +616,36 @@ GetCurrentStatementStartTimestamp(void) TimestampTz GetCurrentTransactionStopTimestamp(void) { + /* + * As for Statement start timestamp, stop timestamp has to + * be adjusted with the delta value calculated with the + * timestamp received from GTM and the local node clock. + */ +#ifdef PGXC + TimestampTz timestamp; + + if (xactStopTimestamp != 0) + return xactStopTimestamp + GTMdeltaTimestamp; + + timestamp = GetCurrentTimestamp() + GTMdeltaTimestamp; + + return timestamp; +#else if (xactStopTimestamp != 0) return xactStopTimestamp; + return GetCurrentTimestamp(); +#endif } +#ifdef PGXC +TimestampTz +GetCurrentGTMStartTimestamp(void) +{ + return GTMxactStartTimestamp; +} +#endif + /* * SetCurrentStatementStartTimestamp */ @@ -580,6 +664,20 @@ SetCurrentTransactionStopTimestamp(void) xactStopTimestamp = GetCurrentTimestamp(); } +#ifdef PGXC +/* + * SetCurrentGTMDeltaTimestamp + * + * Note: Sets local timestamp delta with the value received from GTM + */ +void +SetCurrentGTMDeltaTimestamp(TimestampTz timestamp) +{ + GTMxactStartTimestamp = timestamp; + GTMdeltaTimestamp = GTMxactStartTimestamp - xactStartTimestamp; +} +#endif + /* * GetCurrentTransactionNestLevel * @@ -950,7 +1048,12 @@ RecordTransactionCommit(void) MyProc->inCommit = true; SetCurrentTransactionStopTimestamp(); +#ifdef PGXC + /* In Postgres-XC, stop timestamp has to follow the timeline of GTM */ + xlrec.xact_time = xactStopTimestamp + GTMdeltaTimestamp; +#else xlrec.xact_time = xactStopTimestamp; +#endif xlrec.nrels = nrels; xlrec.nsubxacts = nchildren; rdata[0].data = (char *) (&xlrec); @@ -1275,7 +1378,12 @@ RecordTransactionAbort(bool isSubXact) else { SetCurrentTransactionStopTimestamp(); +#ifdef PGXC + /* In Postgres-XC, stop timestamp has to follow the timeline of GTM */ + xlrec.xact_time = xactStopTimestamp + GTMdeltaTimestamp; +#else xlrec.xact_time = xactStopTimestamp; +#endif } xlrec.nrels = nrels; xlrec.nsubxacts = nchildren; @@ -1576,7 +1684,12 @@ StartTransaction(void) */ xactStartTimestamp = stmtStartTimestamp; xactStopTimestamp = 0; +#ifdef PGXC + /* For Postgres-XC, transaction start timestamp has to follow the GTM timeline */ + pgstat_report_xact_timestamp(GTMxactStartTimestamp); +#else pgstat_report_xact_timestamp(xactStartTimestamp); +#endif /* * initialize current transaction state fields diff --git a/src/backend/pgxc/pool/datanode.c b/src/backend/pgxc/pool/datanode.c index 0f4072d..ba56ca1 100644 --- a/src/backend/pgxc/pool/datanode.c +++ b/src/backend/pgxc/pool/datanode.c @@ -893,6 +893,48 @@ data_node_send_snapshot(DataNodeHandle *handle, Snapshot snapshot) } /* + * Send the timestamp down to the Datanode + */ +int +data_node_send_timestamp(DataNodeHandle *handle, TimestampTz timestamp) +{ + int msglen = 12; /* 4 bytes for msglen and 8 bytes for timestamp (int64) */ + uint32 n32; + int64 i = (int64) timestamp; + + /* msgType + msgLen */ + if 
(ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0) + { + add_error_message(handle, "out of memory"); + return EOF; + } + handle->outBuffer[handle->outEnd++] = 't'; + msglen = htonl(msglen); + memcpy(handle->outBuffer + handle->outEnd, &msglen, 4); + handle->outEnd += 4; + + /* High order half first */ +#ifdef INT64_IS_BUSTED + /* don't try a right shift of 32 on a 32-bit word */ + n32 = (i < 0) ? -1 : 0; +#else + n32 = (uint32) (i >> 32); +#endif + n32 = htonl(n32); + memcpy(handle->outBuffer + handle->outEnd, &n32, 4); + handle->outEnd += 4; + + /* Now the low order half */ + n32 = (uint32) i; + n32 = htonl(n32); + memcpy(handle->outBuffer + handle->outEnd, &n32, 4); + handle->outEnd += 4; + + return 0; +} + + +/* * Add another message to the list of errors to be returned back to the client * at the convenient time */ diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 43569e0..f065289 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -1074,6 +1074,7 @@ data_node_begin(int conn_count, DataNodeHandle ** connections, int i; struct timeval *timeout = NULL; RemoteQueryState *combiner; + TimestampTz timestamp = GetCurrentGTMStartTimestamp(); /* Send BEGIN */ for (i = 0; i < conn_count; i++) @@ -1081,6 +1082,9 @@ data_node_begin(int conn_count, DataNodeHandle ** connections, if (GlobalTransactionIdIsValid(gxid) && data_node_send_gxid(connections[i], gxid)) return EOF; + if (GlobalTimestampIsValid(timestamp) && data_node_send_timestamp(connections[i], timestamp)) + return EOF; + if (data_node_send_query(connections[i], "BEGIN")) return EOF; } @@ -1222,8 +1226,13 @@ data_node_commit(int conn_count, DataNodeHandle ** connections) else sprintf(buffer, "COMMIT PREPARED 'T%d'", gxid); - /* We need to use a new xid, the data nodes have reset */ - two_phase_xid = BeginTranGTM(); + /* + * We need to use a new xid, the data nodes have reset + * Timestamp has already been set with BEGIN on remote Datanodes, + * so don't use it here. + */ + two_phase_xid = BeginTranGTM(NULL); + for (i = 0; i < conn_count; i++) { if (data_node_send_gxid(connections[i], two_phase_xid)) @@ -1338,6 +1347,7 @@ DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot, bool is_ bool need_tran; GlobalTransactionId gxid; RemoteQueryState *combiner; + TimestampTz timestamp = GetCurrentGTMStartTimestamp(); if (conn_count == 0) return NULL; @@ -1432,6 +1442,19 @@ DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot, bool is_ pfree(copy_connections); return NULL; } + if (conn_count == 1 && data_node_send_timestamp(connections[i], timestamp)) + { + /* + * If a transaction involves multiple connections timestamp, is + * always sent down to Datanodes with data_node_begin. + * An autocommit transaction needs the global timestamp also, + * so handle this case here. 
+ */ + add_error_message(connections[i], "Can not send request"); + pfree(connections); + pfree(copy_connections); + return NULL; + } if (snapshot && data_node_send_snapshot(connections[i], snapshot)) { add_error_message(connections[i], "Can not send request"); @@ -2027,7 +2050,8 @@ ExecRemoteQuery(RemoteQueryState *node) bool force_autocommit = step->force_autocommit; bool is_read_only = step->read_only; GlobalTransactionId gxid = InvalidGlobalTransactionId; - Snapshot snapshot = GetActiveSnapshot(); + Snapshot snapshot = GetActiveSnapshot(); + TimestampTz timestamp = GetCurrentGTMStartTimestamp(); DataNodeHandle **connections = NULL; DataNodeHandle **primaryconnection = NULL; int i; @@ -2133,6 +2157,20 @@ ExecRemoteQuery(RemoteQueryState *node) (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to send command to data nodes"))); } + if (total_conn_count == 1 && data_node_send_timestamp(primaryconnection[0], timestamp)) + { + /* + * If a transaction involves multiple connections timestamp is + * always sent down to Datanodes with data_node_begin. + * An autocommit transaction needs the global timestamp also, + * so handle this case here. + */ + pfree(connections); + pfree(primaryconnection); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); + } if (snapshot && data_node_send_snapshot(primaryconnection[0], snapshot)) { pfree(connections); @@ -2184,6 +2222,20 @@ ExecRemoteQuery(RemoteQueryState *node) (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to send command to data nodes"))); } + if (total_conn_count == 1 && data_node_send_timestamp(connections[i], timestamp)) + { + /* + * If a transaction involves multiple connections timestamp is + * always sent down to Datanodes with data_node_begin. + * An autocommit transaction needs the global timestamp also, + * so handle this case here. 
+ */ + pfree(connections); + pfree(primaryconnection); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); + } if (snapshot && data_node_send_snapshot(connections[i], snapshot)) { pfree(connections); diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index a6f4767..84c70c6 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -429,8 +429,9 @@ SocketBackend(StringInfo inBuf) errmsg("invalid frontend message type %d", qtype))); break; #ifdef PGXC /* PGXC_DATANODE */ - case 'g': - case 's': + case 'g': /* GXID */ + case 's': /* Snapshot */ + case 't': /* Timestamp */ break; #endif @@ -2951,6 +2952,8 @@ PostgresMain(int argc, char *argv[], const char *username) int xmax; int xcnt; int *xip; + /* Timestamp info */ + TimestampTz timestamp; #endif #define PendingConfigOption(name,val) \ @@ -4015,6 +4018,17 @@ PostgresMain(int argc, char *argv[], const char *username) pq_getmsgend(&input_message); SetGlobalSnapshotData(xmin, xmax, xcnt, xip); break; + + case 't': /* timestamp */ + timestamp = (TimestampTz) pq_getmsgint64(&input_message); + pq_getmsgend(&input_message); + + /* + * Set in xact.x the static Timestamp difference value with GTM + * and the timestampreceivedvalues for Datanode reference + */ + SetCurrentGTMDeltaTimestamp(timestamp); + break; #endif /* PGXC */ default: diff --git a/src/backend/utils/adt/timestamp.c b/src/backend/utils/adt/timestamp.c index 375a830..4634278 100644 --- a/src/backend/utils/adt/timestamp.c +++ b/src/backend/utils/adt/timestamp.c @@ -32,6 +32,10 @@ #include "utils/builtins.h" #include "utils/datetime.h" +#ifdef PGXC +#include "pgxc/pgxc.h" +#endif + /* * gcc's -ffast-math switch breaks routines that expect exact results from * expressions like timeval / SECS_PER_HOUR, where timeval is double. 
diff --git a/src/gtm/client/fe-protocol.c b/src/gtm/client/fe-protocol.c index 051bb1d..0847b0d 100644 --- a/src/gtm/client/fe-protocol.c +++ b/src/gtm/client/fe-protocol.c @@ -350,12 +350,22 @@ gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result) break; case TXN_BEGIN_GETGXID_RESULT: + if (gtmpqGetnchar((char *)&result->gr_resdata.grd_gxid_tp.gxid, + sizeof (GlobalTransactionId), conn)) + { + result->gr_status = -1; + break; + } + if (gtmpqGetnchar((char *)&result->gr_resdata.grd_gxid_tp.timestamp, + sizeof (GTM_Timestamp), conn)) + result->gr_status = -1; + break; case TXN_BEGIN_GETGXID_AUTOVACUUM_RESULT: case TXN_PREPARE_RESULT: if (gtmpqGetnchar((char *)&result->gr_resdata.grd_gxid, sizeof (GlobalTransactionId), conn)) result->gr_status = -1; - break; + break; case TXN_COMMIT_RESULT: case TXN_ROLLBACK_RESULT: @@ -393,9 +403,11 @@ gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result) result->gr_status = -1; break; } + if (gtmpqGetnchar((char *)&result->gr_resdata.grd_txn_get_multi.timestamp, + sizeof (GTM_Timestamp), conn)) + result->gr_status = -1; break; - case TXN_COMMIT_MULTI_RESULT: case TXN_ROLLBACK_MULTI_RESULT: if (gtmpqGetnchar((char *)&result->gr_resdata.grd_txn_rc_multi.txn_count, diff --git a/src/gtm/client/gtm_client.c b/src/gtm/client/gtm_client.c index 9df28c7..35f81ae 100644 --- a/src/gtm/client/gtm_client.c +++ b/src/gtm/client/gtm_client.c @@ -48,7 +48,7 @@ disconnect_gtm(GTM_Conn *conn) * Transaction Management API */ GlobalTransactionId -begin_transaction(GTM_Conn *conn, GTM_IsolationLevel isolevel) +begin_transaction(GTM_Conn *conn, GTM_IsolationLevel isolevel, GTM_Timestamp *timestamp) { bool txn_read_only = false; GTM_Result *res = NULL; @@ -78,7 +78,12 @@ begin_transaction(GTM_Conn *conn, GTM_IsolationLevel isolevel) goto receive_failed; if (res->gr_status == 0) - return res->gr_resdata.grd_gxid; + { + if (timestamp) + *timestamp = res->gr_resdata.grd_gxid_tp.timestamp; + + return res->gr_resdata.grd_gxid_tp.gxid; + } else return InvalidGlobalTransactionId; diff --git a/src/gtm/main/Makefile b/src/gtm/main/Makefile index 7fcdf82..5d8aaea 100644 --- a/src/gtm/main/Makefile +++ b/src/gtm/main/Makefile @@ -3,7 +3,7 @@ top_build_dir=../.. 
include $(top_build_dir)/gtm/Makefile.global -OBJS=main.o gtm_thread.o gtm_txn.o gtm_seq.o gtm_snap.o ../common/libgtm.a ../libpq/libpqcomm.a ../path/libgtmpath.a +OBJS=main.o gtm_thread.o gtm_txn.o gtm_seq.o gtm_snap.o gtm_time.o ../common/libgtm.a ../libpq/libpqcomm.a ../path/libgtmpath.a LDFLAGS=-L$(top_build_dir)/common -L$(top_build_dir)/libpq LIBS=-lpthread diff --git a/src/gtm/main/gtm_time.c b/src/gtm/main/gtm_time.c new file mode 100644 index 0000000..ea795af --- /dev/null +++ b/src/gtm/main/gtm_time.c @@ -0,0 +1,41 @@ +/*------------------------------------------------------------------------- + * + * gtm_time.c + * Timestamp handling on GTM + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * + * IDENTIFICATION + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ + +#include "gtm/gtm.h" +#include "gtm/gtm_c.h" +#include "gtm/gtm_time.h" +#include <time.h> +#include <sys/time.h> + +GTM_Timestamp +GTM_TimestampGetCurrent(void) +{ + struct timeval tp; + GTM_Timestamp result; + + gettimeofday(&tp, NULL); + + result = (GTM_Timestamp) tp.tv_sec - + ((GTM_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY); + +#ifdef HAVE_INT64_TIMESTAMP + result = (result * USECS_PER_SEC) + tp.tv_usec; +#else + result = result + (tp.tv_usec / 1000000.0); +#endif + + return result; +} diff --git a/src/gtm/main/gtm_txn.c b/src/gtm/main/gtm_txn.c index 6090ae1..dec0a63 100644 --- a/src/gtm/main/gtm_txn.c +++ b/src/gtm/main/gtm_txn.c @@ -18,6 +18,8 @@ #include "gtm/palloc.h" #include "gtm/gtm.h" #include "gtm/gtm_txn.h" +#include "gtm/gtm_c.h" +#include "gtm/gtm_time.h" #include "gtm/assert.h" #include "gtm/stringinfo.h" #include "gtm/libpq.h" @@ -840,6 +842,7 @@ ProcessBeginTransactionCommand(Port *myport, StringInfo message) bool txn_read_only; StringInfoData buf; GTM_TransactionHandle txn; + GTM_Timestamp timestamp; MemoryContext oldContext; txn_isolation_level = pq_getmsgint(message, sizeof (GTM_IsolationLevel)); @@ -860,6 +863,9 @@ ProcessBeginTransactionCommand(Port *myport, StringInfo message) MemoryContextSwitchTo(oldContext); + /* GXID has been received, now it's time to get a GTM timestamp */ + timestamp = GTM_TimestampGetCurrent(); + pq_beginmessage(&buf, 'S'); pq_sendint(&buf, TXN_BEGIN_RESULT, 4); if (myport->is_proxy) @@ -869,6 +875,7 @@ ProcessBeginTransactionCommand(Port *myport, StringInfo message) pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); } pq_sendbytes(&buf, (char *)&txn, sizeof(txn)); + pq_sendbytes(&buf, (char *)×tamp, sizeof (GTM_Timestamp)); pq_endmessage(myport, &buf); if (!myport->is_proxy) @@ -1003,6 +1010,7 @@ ProcessBeginTransactionGetGXIDCommandMulti(Port *myport, StringInfo message) StringInfoData buf; GTM_TransactionHandle txn[GTM_MAX_GLOBAL_TRANSACTIONS]; GlobalTransactionId gxid, end_gxid; + GTM_Timestamp timestamp; GTMProxy_ConnID txn_connid[GTM_MAX_GLOBAL_TRANSACTIONS]; MemoryContext oldContext; int count; @@ -1042,6 +1050,9 @@ ProcessBeginTransactionGetGXIDCommandMulti(Port *myport, StringInfo message) MemoryContextSwitchTo(oldContext); + /* GXID has been received, now it's time to get a GTM timestamp */ + timestamp = GTM_TimestampGetCurrent(); + end_gxid = gxid + txn_count; if (end_gxid < gxid) end_gxid += FirstNormalGlobalTransactionId; @@ -1058,6 +1069,7 @@ ProcessBeginTransactionGetGXIDCommandMulti(Port *myport, 
StringInfo message) } pq_sendbytes(&buf, (char *)&txn_count, sizeof(txn_count)); pq_sendbytes(&buf, (char *)&gxid, sizeof(gxid)); + pq_sendbytes(&buf, (char *)&(timestamp), sizeof (GTM_Timestamp)); pq_endmessage(myport, &buf); if (!myport->is_proxy) diff --git a/src/gtm/proxy/proxy_main.c b/src/gtm/proxy/proxy_main.c index f5f6e65..66b1594 100644 --- a/src/gtm/proxy/proxy_main.c +++ b/src/gtm/proxy/proxy_main.c @@ -988,6 +988,7 @@ ProcessResponse(GTMProxy_ThreadInfo *thrinfo, GTMProxy_CommandInfo *cmdinfo, { StringInfoData buf; GlobalTransactionId gxid; + GTM_Timestamp timestamp; switch (cmdinfo->ci_mtype) { @@ -1011,9 +1012,13 @@ ProcessResponse(GTMProxy_ThreadInfo *thrinfo, GTMProxy_CommandInfo *cmdinfo, if (gxid < res->gr_resdata.grd_txn_get_multi.start_gxid) gxid += FirstNormalGlobalTransactionId; + /* Send back to each client the same timestamp value asked in this message */ + timestamp = res->gr_resdata.grd_txn_get_multi.timestamp; + pq_beginmessage(&buf, 'S'); pq_sendint(&buf, TXN_BEGIN_GETGXID_RESULT, 4); pq_sendbytes(&buf, (char *)&gxid, sizeof (GlobalTransactionId)); + pq_sendbytes(&buf, (char *)×tamp, sizeof (GTM_Timestamp)); pq_endmessage(cmdinfo->ci_conn->con_port, &buf); pq_flush(cmdinfo->ci_conn->con_port); } diff --git a/src/include/access/gtm.h b/src/include/access/gtm.h index 3831f09..4878d92 100644 --- a/src/include/access/gtm.h +++ b/src/include/access/gtm.h @@ -20,7 +20,7 @@ extern int GtmCoordinatorId; extern bool IsGTMConnected(void); extern void InitGTM(void); extern void CloseGTM(void); -extern GlobalTransactionId BeginTranGTM(void); +extern GlobalTransactionId BeginTranGTM(GTM_Timestamp *timestamp); extern GlobalTransactionId BeginTranAutovacuumGTM(void); extern int CommitTranGTM(GlobalTransactionId gxid); extern int RollbackTranGTM(GlobalTransactionId gxid); diff --git a/src/include/access/transam.h b/src/include/access/transam.h index 6c2a5b8..d7c7b7b 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -16,7 +16,9 @@ #define TRANSAM_H #include "access/xlogdefs.h" - +#ifdef PGXC +#include "gtm/gtm_c.h" +#endif /* ---------------- * Special transaction ID values @@ -157,8 +159,10 @@ extern XLogRecPtr TransactionIdGetCommitLSN(TransactionId xid); extern void SetNextTransactionId(TransactionId xid); extern void SetForceXidFromGTM(bool value); extern bool GetForceXidFromGTM(void); -#endif /* PGXC */ +extern TransactionId GetNewTransactionId(bool isSubXact, bool *timestamp_received, GTM_Timestamp *timestamp); +#else extern TransactionId GetNewTransactionId(bool isSubXact); +#endif /* PGXC */ extern TransactionId ReadNewTransactionId(void); extern void SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Name oldest_datname); diff --git a/src/include/access/xact.h b/src/include/access/xact.h index 5bd157b..01fb498 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -157,6 +157,10 @@ extern TimestampTz GetCurrentTransactionStartTimestamp(void); extern TimestampTz GetCurrentStatementStartTimestamp(void); extern TimestampTz GetCurrentTransactionStopTimestamp(void); extern void SetCurrentStatementStartTimestamp(void); +#ifdef PGXC +extern TimestampTz GetCurrentGTMStartTimestamp(void); +extern void SetCurrentGTMDeltaTimestamp(TimestampTz timestamp); +#endif extern int GetCurrentTransactionNestLevel(void); extern bool TransactionIdIsCurrentTransactionId(TransactionId xid); extern void CommandCounterIncrement(void); diff --git a/src/include/gtm/gtm_c.h b/src/include/gtm/gtm_c.h index 1a04064..0a4c941 100644 --- 
a/src/include/gtm/gtm_c.h +++ b/src/include/gtm/gtm_c.h @@ -55,6 +55,12 @@ typedef int32 GTM_TransactionHandle; #define InvalidTransactionHandle -1 +/* + * As GTM and Postgres-XC packages are separated, GTM and XC's API + * use different type names for timestamps and sequences, but they have to be the same! + */ +typedef int64 GTM_Timestamp; /* timestamp data is 64-bit based */ + typedef int64 GTM_Sequence; /* a 64-bit sequence */ typedef struct GTM_SequenceKeyData { diff --git a/src/include/gtm/gtm_client.h b/src/include/gtm/gtm_client.h index 05e44bf..9db6884 100644 --- a/src/include/gtm/gtm_client.h +++ b/src/include/gtm/gtm_client.h @@ -21,8 +21,14 @@ typedef union GTM_ResultData { GTM_TransactionHandle grd_txnhandle; /* TXN_BEGIN */ - GlobalTransactionId grd_gxid; /* TXN_BEGIN_GETGXID - * TXN_PREPARE + + struct + { + GlobalTransactionId gxid; + GTM_Timestamp timestamp; + } grd_gxid_tp; /* TXN_BEGIN_GETGXID */ + + GlobalTransactionId grd_gxid; /* TXN_PREPARE * TXN_COMMIT * TXN_ROLLBACK */ @@ -47,6 +53,7 @@ typedef union GTM_ResultData { int txn_count; /* TXN_BEGIN_GETGXID_MULTI */ GlobalTransactionId start_gxid; + GTM_Timestamp timestamp; } grd_txn_get_multi; struct @@ -101,7 +108,7 @@ void disconnect_gtm(GTM_Conn *conn); /* * Transaction Management API */ -GlobalTransactionId begin_transaction(GTM_Conn *conn, GTM_IsolationLevel isolevel); +GlobalTransactionId begin_transaction(GTM_Conn *conn, GTM_IsolationLevel isolevel, GTM_Timestamp *timestamp); GlobalTransactionId begin_transaction_autovacuum(GTM_Conn *conn, GTM_IsolationLevel isolevel); int commit_transaction(GTM_Conn *conn, GlobalTransactionId gxid); int abort_transaction(GTM_Conn *conn, GlobalTransactionId gxid); diff --git a/src/include/gtm/gtm_time.h b/src/include/gtm/gtm_time.h new file mode 100644 index 0000000..b3d7005 --- /dev/null +++ b/src/include/gtm/gtm_time.h @@ -0,0 +1,37 @@ +/*------------------------------------------------------------------------- + * + * gtm_time.h + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ + +#ifndef GTM_TIME_H +#define GTM_TIME_H + +/* Julian-date equivalents of Day 0 in Unix and GTM reckoning */ +#define UNIX_EPOCH_JDATE 2440588 /* == date2j(1970, 1, 1) */ +#define GTM_EPOCH_JDATE 2451545 /* == date2j(2000, 1, 1) */ + +#define SECS_PER_YEAR (36525 * 864) /* avoid floating-point computation */ +#define SECS_PER_DAY 86400 +#define SECS_PER_HOUR 3600 +#define SECS_PER_MINUTE 60 +#define MINS_PER_HOUR 60 + +#ifdef HAVE_INT64_TIMESTAMP +#define USECS_PER_DAY INT64CONST(86400000000) +#define USECS_PER_HOUR INT64CONST(3600000000) +#define USECS_PER_MINUTE INT64CONST(60000000) +#define USECS_PER_SEC INT64CONST(1000000) +#endif + +GTM_Timestamp GTM_TimestampGetCurrent(void); + +#endif diff --git a/src/include/pgxc/datanode.h b/src/include/pgxc/datanode.h index 849d84a..4202e2e 100644 --- a/src/include/pgxc/datanode.h +++ b/src/include/pgxc/datanode.h @@ -18,6 +18,7 @@ #define DATANODE_H #include "postgres.h" #include "gtm/gtm_c.h" +#include "utils/timestamp.h" #include "nodes/pg_list.h" #include "utils/snapshot.h" #include <unistd.h> @@ -88,6 +89,7 @@ extern int ensure_out_buffer_capacity(size_t bytes_needed, DataNodeHandle * hand extern int data_node_send_query(DataNodeHandle * handle, const char *query); extern 
int data_node_send_gxid(DataNodeHandle * handle, GlobalTransactionId gxid); extern int data_node_send_snapshot(DataNodeHandle * handle, Snapshot snapshot); +extern int data_node_send_timestamp(DataNodeHandle * handle, TimestampTz timestamp); extern int data_node_receive(const int conn_count, DataNodeHandle ** connections, struct timeval * timeout); diff --git a/src/include/utils/timestamp.h b/src/include/utils/timestamp.h index 906ceb6..801e89b 100644 --- a/src/include/utils/timestamp.h +++ b/src/include/utils/timestamp.h @@ -23,6 +23,10 @@ #include "utils/int8.h" #endif +#ifdef PGXC +#include "pgxc/pgxc.h" +#endif + /* * Timestamp represents absolute time. * @@ -45,6 +49,11 @@ #ifdef HAVE_INT64_TIMESTAMP +/* + * PGXC note: GTM and Postgres-XC packages have to be separated. + * Both use use different type names for timestamp, but those types have to be the same! + */ + typedef int64 Timestamp; typedef int64 TimestampTz; typedef int64 TimeOffset; @@ -190,6 +199,10 @@ typedef struct #define TimestampTzPlusMilliseconds(tz,ms) ((tz) + ((ms) / 1000.0)) #endif +#ifdef PGXC +#define InvalidGlobalTimestamp ((TimestampTz) 0) +#define GlobalTimestampIsValid(timestamp) ((TimestampTz) (timestamp)) != InvalidGlobalTimestamp +#endif /* Set at postmaster start */ extern TimestampTz PgStartTime; ----------------------------------------------------------------------- Summary of changes: src/backend/access/transam/gtm.c | 6 +- src/backend/access/transam/varsup.c | 22 +++++-- src/backend/access/transam/xact.c | 117 ++++++++++++++++++++++++++++++++++- src/backend/pgxc/pool/datanode.c | 42 +++++++++++++ src/backend/pgxc/pool/execRemote.c | 58 ++++++++++++++++- src/backend/tcop/postgres.c | 18 +++++- src/backend/utils/adt/timestamp.c | 4 + src/gtm/client/fe-protocol.c | 16 ++++- src/gtm/client/gtm_client.c | 9 ++- src/gtm/main/Makefile | 2 +- src/gtm/main/gtm_time.c | 41 ++++++++++++ src/gtm/main/gtm_txn.c | 12 ++++ src/gtm/proxy/proxy_main.c | 5 ++ src/include/access/gtm.h | 2 +- src/include/access/transam.h | 8 ++- src/include/access/xact.h | 4 + src/include/gtm/gtm_c.h | 6 ++ src/include/gtm/gtm_client.h | 13 +++- src/include/gtm/gtm_time.h | 37 +++++++++++ src/include/pgxc/datanode.h | 2 + src/include/utils/timestamp.h | 13 ++++ 21 files changed, 410 insertions(+), 27 deletions(-) create mode 100644 src/gtm/main/gtm_time.c create mode 100644 src/include/gtm/gtm_time.h hooks/post-receive -- Postgres-XC |
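The two mechanisms in the commit above are easy to lose in the diff, so here is the arithmetic in isolation. When BEGIN returns a GXID, GTM also returns its own clock value; each node stores the difference between that value and its local clock (the role played by GTMdeltaTimestamp) and shifts every timestamp it reports onto the GTM timeline. A minimal standalone sketch of that idea, with illustrative names rather than the actual Postgres-XC symbols:

#include <stdint.h>
#include <stdio.h>
#include <sys/time.h>

typedef int64_t ts_us;              /* microseconds, like int64 timestamps */

static ts_us gtm_delta_us = 0;      /* plays the role of GTMdeltaTimestamp */

static ts_us local_now_us(void)
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (ts_us) tv.tv_sec * 1000000 + tv.tv_usec;
}

/* Called when BEGIN brings back a GXID plus the GTM clock value. */
static void set_gtm_delta(ts_us gtm_ts)
{
    gtm_delta_us = gtm_ts - local_now_us();
}

/* Every timestamp the node reports is shifted onto the GTM timeline. */
static ts_us adjusted(ts_us local_ts)
{
    return local_ts + gtm_delta_us;
}

int main(void)
{
    set_gtm_delta(local_now_us() + 1500000);  /* pretend GTM runs 1.5 s ahead */
    printf("statement timestamp on the GTM timeline: %lld\n",
           (long long) adjusted(local_now_us()));
    return 0;
}

This also explains why clock_timestamp() and timeofday() stay local in the commit: they read the node's clock directly, no delta is applied, and their results can therefore differ between Datanodes inside one transaction.

The wire format used by the new data_node_send_timestamp() is the usual technique of sending a 64-bit value as two 32-bit halves in network byte order, high half first. The encoding alone, with hypothetical buffer handling:

#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>

static size_t put_int64_be(unsigned char *buf, int64_t v)
{
    uint32_t hi = htonl((uint32_t) ((uint64_t) v >> 32)); /* high half first */
    uint32_t lo = htonl((uint32_t) v);                    /* then low half */
    memcpy(buf,     &hi, 4);
    memcpy(buf + 4, &lo, 4);
    return 8;
}

int main(void)
{
    unsigned char buf[8];
    (void) put_int64_be(buf, 1234567890123LL);
    return 0;
}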
From: mason_s <ma...@us...> - 2010-08-23 06:25:06
Project "Postgres-XC". The branch, master has been updated
       via  ba6f32f142cf8731ba29e5495e0f97f3b0455da0 (commit)
      from  d97c52965478dafe7f5f2ccabc588c6279c117e7 (commit)

- Log -----------------------------------------------------------------
commit ba6f32f142cf8731ba29e5495e0f97f3b0455da0
Author: Mason Sharp <ma...@us...>
Date:   Mon Aug 23 15:22:28 2010 +0900

    Fix a visibility warning due to not taking into account transactions
    that are running globally across all nodes in the cluster.

diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index be24657..5f97320 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -625,6 +625,11 @@ GetOldestXmin(bool allDbs, bool ignoreVacuum)
 	TransactionId result;
 	int			index;
 
+#ifdef PGXC
+	if (TransactionIdIsValid(RecentGlobalXmin))
+		return RecentGlobalXmin;
+#endif
+
 	LWLockAcquire(ProcArrayLock, LW_SHARED);
 
 	/*
-----------------------------------------------------------------------

Summary of changes:
 src/backend/storage/ipc/procarray.c |    5 +++++
 1 files changed, 5 insertions(+), 0 deletions(-)

hooks/post-receive
--
Postgres-XC
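The patch is five lines, but the reasoning deserves spelling out: on a Postgres-XC node, an xmin horizon computed only from the local ProcArray cannot see transactions that are still running on other nodes, so when the cluster-wide value maintained from the GTM snapshot (RecentGlobalXmin) is available it must win. The shape of the change as a self-contained sketch, with stand-in types rather than backend code:

#include <stdint.h>
#include <stdio.h>

typedef uint32_t TransactionId;
#define InvalidTransactionId ((TransactionId) 0)

/* In the server this is maintained from the global snapshot sent by GTM. */
static TransactionId RecentGlobalXmin = InvalidTransactionId;

static TransactionId local_oldest_xmin(void)
{
    return 100;                       /* stand-in for the ProcArray scan */
}

static TransactionId get_oldest_xmin(void)
{
    /* Prefer the cluster-wide horizon: a purely local computation cannot
     * account for transactions still running on other nodes. */
    if (RecentGlobalXmin != InvalidTransactionId)
        return RecentGlobalXmin;
    return local_oldest_xmin();
}

int main(void)
{
    printf("horizon: %u\n", (unsigned) get_oldest_xmin());
    return 0;
}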
From: mason_s <ma...@us...> - 2010-08-23 04:27:42
Project "Postgres-XC". The branch, master has been updated
       via  d97c52965478dafe7f5f2ccabc588c6279c117e7 (commit)
      from  b6602543d5dd6dfa4005db41c73d0136f74af13e (commit)

- Log -----------------------------------------------------------------
commit d97c52965478dafe7f5f2ccabc588c6279c117e7
Author: Mason Sharp <ma...@us...>
Date:   Mon Aug 23 13:24:57 2010 +0900

    In Postgres-XC, when extending the clog, the status assertion
    occasionally fails under very heavy load in long tests. We break the
    two assertions out and make the second one a warning instead of an
    assertion for now.

diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 68e3869..919e146 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -555,8 +555,22 @@ SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
 
 	/* Re-acquire control lock and update page state */
 	LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
 
+#ifdef PGXC
+	/*
+	 * In Postgres-XC the status assertion occasionally fails
+	 * under very heavy load in long tests.
+	 * We break the two assertions out and make the second one
+	 * a warning instead of an assertion for now.
+	 */
+	Assert(shared->page_number[slotno] == pageno);
+
+	if (shared->page_status[slotno] != SLRU_PAGE_WRITE_IN_PROGRESS)
+		elog(WARNING, "Unexpected page status in SimpleLruWritePage(), status = %d, was expecting 3 (SLRU_PAGE_WRITE_IN_PROGRESS) for page %d",
+			 shared->page_status[slotno], shared->page_number[slotno]);
+#else
 	Assert(shared->page_number[slotno] == pageno &&
 		   shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS);
+#endif
 
 	/* If we failed to write, mark the page dirty again */
 	if (!ok)
-----------------------------------------------------------------------

Summary of changes:
 src/backend/access/transam/slru.c |   14 ++++++++++++++
 1 files changed, 14 insertions(+), 0 deletions(-)

hooks/post-receive
--
Postgres-XC
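The pattern in this commit is worth naming: a compound Assert(A && B) gives no clue which half tripped, so the commit splits it, keeps the half that must always hold as a hard assertion, and downgrades the load-sensitive half to a runtime warning that carries the observed value. In miniature, with plain C stand-ins for the backend's Assert and elog:

#include <assert.h>
#include <stdio.h>

static void check_page_state(int page_number, int expected_page,
                             int page_status, int expected_status)
{
    /* Was: assert(page_number == expected_page &&
     *             page_status == expected_status);  -- which half failed? */
    assert(page_number == expected_page);        /* must always hold */

    if (page_status != expected_status)          /* downgraded to a warning */
        fprintf(stderr,
                "WARNING: unexpected page status %d (expected %d) for page %d\n",
                page_status, expected_status, page_number);
}

int main(void)
{
    check_page_state(7, 7, 2, 3);    /* page ok, status mismatch -> warning */
    return 0;
}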
From: mason_s <ma...@us...> - 2010-08-23 04:14:09
Project "Postgres-XC". The branch, master has been updated via b6602543d5dd6dfa4005db41c73d0136f74af13e (commit) from cfb29183b57e811e0dfcf3641c5cc58458b2584a (commit) - Log ----------------------------------------------------------------- commit b6602543d5dd6dfa4005db41c73d0136f74af13e Author: M S <masonsharp@5.105.180.203.e.iijmobile.jp> Date: Mon Aug 23 13:11:31 2010 +0900 Initial support for multi-step queries, including cross-node joins. Note that this is a "version 1.0" implementation, borrowing some code from the SQL/MED patch. This means that all cross-node joins take place on a Coordinator by pulling up data from the data nodes. Some queries will therefore execute quite slowly, but they will at least execute. In this patch, all columns are SELECTed from the remote table, but at least simple WHERE clauses are pushed down to the remote nodes. We will optimize query processing in the future. Note that the same connections to remote nodes are used in multiple steps. To get around that problem, we just add a materialization node above each RemoteQuery node, and force all results to be fetched first on the Coordinator. This patch also allows UNION, EXCEPT and INTERSECT, and other more complex SELECT statements to run now. It includes a fix for single-step, multi-node LIMIT and OFFSET. It also includes EXPLAIN output from the Coordinator's point of view. Adding these changes introduced a problem with AVG(), which is currently not working. diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 950a2f1..aa92917 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -565,6 +565,11 @@ explain_outNode(StringInfo str, case T_WorkTableScan: pname = "WorkTable Scan"; break; +#ifdef PGXC + case T_RemoteQuery: + pname = "Data Node Scan"; + break; +#endif case T_Material: pname = "Materialize"; break; @@ -668,6 +673,9 @@ explain_outNode(StringInfo str, case T_SeqScan: case T_BitmapHeapScan: case T_TidScan: +#ifdef PGXC + case T_RemoteQuery: +#endif if (((Scan *) plan)->scanrelid > 0) { RangeTblEntry *rte = rt_fetch(((Scan *) plan)->scanrelid, @@ -686,6 +694,26 @@ explain_outNode(StringInfo str, appendStringInfo(str, " %s", quote_identifier(rte->eref->aliasname)); } +#ifdef PGXC + if (IsA(plan, RemoteQuery)) + { + RemoteQuery *remote_query = (RemoteQuery *) plan; + + /* if it is a single-step plan, print out the sql being used */ + if (remote_query->sql_statement) + { + char *realsql = NULL; + realsql = strcasestr(remote_query->sql_statement, "explain"); + if (!realsql) + realsql = remote_query->sql_statement; + else + realsql += 8; /* skip "EXPLAIN" */ + + appendStringInfo(str, " %s", + quote_identifier(realsql)); + } + } +#endif break; case T_BitmapIndexScan: appendStringInfo(str, " on %s", @@ -854,6 +882,9 @@ explain_outNode(StringInfo str, case T_ValuesScan: case T_CteScan: case T_WorkTableScan: +#ifdef PGXC + case T_RemoteQuery: +#endif show_scan_qual(plan->qual, "Filter", ((Scan *) plan)->scanrelid, diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 22fd416..c8b1456 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -2925,6 +2925,20 @@ ATRewriteTables(List **wqueue) } } +#ifdef PGXC + /* + * In PGXC, do not check the FK constraints on the Coordinator, and just return + * That is because a SELECT is generated whose plan will try and use + * the data nodes. 
We (currently) do not want to do that on the Coordinator, + * when the command is passed down to the data nodes it will + * peform the check locally. + * This issue was introduced when we added multi-step handling, + * it caused foreign key constraints to fail. + * PGXCTODO - issue for pg_catalog or any other cases? + */ + if (IS_PGXC_COORDINATOR) + return; +#endif /* * Foreign key constraints are checked in a final pass, since (a) it's * generally best to examine each one separately, and (b) it's at least diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c index 25f350f..01e2548 100644 --- a/src/backend/executor/execAmi.c +++ b/src/backend/executor/execAmi.c @@ -44,6 +44,9 @@ #include "executor/nodeWindowAgg.h" #include "executor/nodeWorktablescan.h" #include "nodes/nodeFuncs.h" +#ifdef PGXC +#include "pgxc/execRemote.h" +#endif #include "utils/syscache.h" @@ -183,6 +186,11 @@ ExecReScan(PlanState *node, ExprContext *exprCtxt) ExecWorkTableScanReScan((WorkTableScanState *) node, exprCtxt); break; +#ifdef PGXC + case T_RemoteQueryState: + ExecRemoteQueryReScan((RemoteQueryState *) node, exprCtxt); + break; +#endif case T_NestLoopState: ExecReScanNestLoop((NestLoopState *) node, exprCtxt); break; diff --git a/src/backend/executor/nodeMaterial.c b/src/backend/executor/nodeMaterial.c index 446b400..2cd3298 100644 --- a/src/backend/executor/nodeMaterial.c +++ b/src/backend/executor/nodeMaterial.c @@ -24,6 +24,9 @@ #include "executor/executor.h" #include "executor/nodeMaterial.h" #include "miscadmin.h" +#ifdef PGXC +#include "pgxc/pgxc.h" +#endif /* ---------------------------------------------------------------- * ExecMaterial @@ -56,9 +59,24 @@ ExecMaterial(MaterialState *node) /* * If first time through, and we need a tuplestore, initialize it. */ +#ifdef PGXC + /* + * For PGXC, temporarily always create the storage. + * This allows us to easily use the same connection to + * in multiple steps of the plan. + */ + if ((IS_PGXC_COORDINATOR && tuplestorestate == NULL) + || (IS_PGXC_DATANODE && tuplestorestate == NULL && node->eflags != 0)) +#else if (tuplestorestate == NULL && node->eflags != 0) +#endif { tuplestorestate = tuplestore_begin_heap(true, false, work_mem); +#ifdef PGXC + if (IS_PGXC_COORDINATOR) + /* Note that we will rescan these results */ + node->eflags |= EXEC_FLAG_REWIND; +#endif tuplestore_set_eflags(tuplestorestate, node->eflags); if (node->eflags & EXEC_FLAG_MARK) { @@ -73,6 +91,26 @@ ExecMaterial(MaterialState *node) Assert(ptrno == 1); } node->tuplestorestate = tuplestorestate; + +#ifdef PGXC + if (IS_PGXC_COORDINATOR) + { + TupleTableSlot *outerslot; + PlanState *outerNode = outerPlanState(node); + + /* We want to always materialize first temporarily in PG-XC */ + while (!node->eof_underlying) + { + outerslot = ExecProcNode(outerNode); + if (TupIsNull(outerslot)) + node->eof_underlying = true; + else + /* Append a copy of the returned tuple to tuplestore. 
*/ + tuplestore_puttupleslot(tuplestorestate, outerslot); + } + tuplestore_rescan(node->tuplestorestate); + } +#endif } /* diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index a1aa660..0e9aa43 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -17,6 +17,7 @@ #include <math.h> +#include "catalog/pg_namespace.h" #include "nodes/nodeFuncs.h" #ifdef OPTIMIZER_DEBUG #include "nodes/print.h" @@ -33,7 +34,11 @@ #include "optimizer/var.h" #include "parser/parse_clause.h" #include "parser/parsetree.h" +#ifdef PGXC +#include "pgxc/pgxc.h" +#endif #include "rewrite/rewriteManip.h" +#include "utils/lsyscache.h" /* These parameters are set by GUC */ @@ -254,6 +259,18 @@ set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) * least one dimension of cost or sortedness. */ +#ifdef PGXC + /* + * If we are on the coordinator, we always want to use + * the remote query path unless it is a pg_catalog table. + */ + if (IS_PGXC_COORDINATOR + && get_rel_namespace(rte->relid) != PG_CATALOG_NAMESPACE) + add_path(rel, create_remotequery_path(root, rel)); + else + { +#endif + /* Consider sequential scan */ add_path(rel, create_seqscan_path(root, rel)); @@ -262,6 +279,9 @@ set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) /* Consider TID scans */ create_tidscan_paths(root, rel); +#ifdef PGXC + } +#endif /* Now find the cheapest of the paths for this rel */ set_cheapest(rel); diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 6e5c251..337f17b 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -32,6 +32,9 @@ #include "optimizer/var.h" #include "parser/parse_clause.h" #include "parser/parsetree.h" +#ifdef PGXC +#include "pgxc/planner.h" +#endif #include "utils/lsyscache.h" @@ -66,6 +69,10 @@ static CteScan *create_ctescan_plan(PlannerInfo *root, Path *best_path, List *tlist, List *scan_clauses); static WorkTableScan *create_worktablescan_plan(PlannerInfo *root, Path *best_path, List *tlist, List *scan_clauses); +#ifdef PGXC +static RemoteQuery *create_remotequery_plan(PlannerInfo *root, Path *best_path, + List *tlist, List *scan_clauses); +#endif static NestLoop *create_nestloop_plan(PlannerInfo *root, NestPath *best_path, Plan *outer_plan, Plan *inner_plan); static MergeJoin *create_mergejoin_plan(PlannerInfo *root, MergePath *best_path, @@ -101,6 +108,10 @@ static CteScan *make_ctescan(List *qptlist, List *qpqual, Index scanrelid, int ctePlanId, int cteParam); static WorkTableScan *make_worktablescan(List *qptlist, List *qpqual, Index scanrelid, int wtParam); +#ifdef PGXC +static RemoteQuery *make_remotequery(List *qptlist, RangeTblEntry *rte, + List *qpqual, Index scanrelid); +#endif static BitmapAnd *make_bitmap_and(List *bitmapplans); static BitmapOr *make_bitmap_or(List *bitmapplans); static NestLoop *make_nestloop(List *tlist, @@ -162,6 +173,9 @@ create_plan(PlannerInfo *root, Path *best_path) case T_ValuesScan: case T_CteScan: case T_WorkTableScan: +#ifdef PGXC + case T_RemoteQuery: +#endif plan = create_scan_plan(root, best_path); break; case T_HashJoin: @@ -207,6 +221,9 @@ create_scan_plan(PlannerInfo *root, Path *best_path) List *tlist; List *scan_clauses; Plan *plan; +#ifdef PGXC + Plan *matplan; +#endif /* * For table scans, rather than using the relation targetlist (which is @@ -298,6 +315,23 @@ create_scan_plan(PlannerInfo *root, Path *best_path) scan_clauses); 
break; +#ifdef PGXC + case T_RemoteQuery: + plan = (Plan *) create_remotequery_plan(root, + best_path, + tlist, + scan_clauses); + + /* + * Insert a materialization plan above this temporarily + * until we better handle multiple steps using the same connection. + */ + matplan = (Plan *) make_material(plan); + copy_plan_costsize(matplan, plan); + matplan->total_cost += cpu_tuple_cost * matplan->plan_rows; + plan = matplan; + break; +#endif default: elog(ERROR, "unrecognized node type: %d", (int) best_path->pathtype); @@ -420,6 +454,9 @@ disuse_physical_tlist(Plan *plan, Path *path) case T_ValuesScan: case T_CteScan: case T_WorkTableScan: +#ifdef PGXC + case T_RemoteQuery: +#endif plan->targetlist = build_relation_tlist(path->parent); break; default: @@ -1544,6 +1581,46 @@ create_worktablescan_plan(PlannerInfo *root, Path *best_path, return scan_plan; } +#ifdef PGXC +/* + * create_remotequery_plan + * Returns a remotequery plan for the base relation scanned by 'best_path' + * with restriction clauses 'scan_clauses' and targetlist 'tlist'. + */ +static RemoteQuery * +create_remotequery_plan(PlannerInfo *root, Path *best_path, + List *tlist, List *scan_clauses) +{ + RemoteQuery *scan_plan; + Index scan_relid = best_path->parent->relid; + RangeTblEntry *rte; + + + Assert(scan_relid > 0); + rte = planner_rt_fetch(scan_relid, root); + Assert(best_path->parent->rtekind == RTE_RELATION); + Assert(rte->rtekind == RTE_RELATION); + + /* Sort clauses into best execution order */ + scan_clauses = order_qual_clauses(root, scan_clauses); + + /* Reduce RestrictInfo list to bare expressions; ignore pseudoconstants */ + scan_clauses = extract_actual_clauses(scan_clauses, false); + + scan_plan = make_remotequery(tlist, + rte, + scan_clauses, + scan_relid); + + copy_path_costsize(&scan_plan->scan.plan, best_path); + + /* PGXCTODO - get better estimates */ + scan_plan->scan.plan.plan_rows = 1000; + + return scan_plan; +} +#endif + /***************************************************************************** * @@ -2615,6 +2692,28 @@ make_worktablescan(List *qptlist, return node; } +#ifdef PGXC +static RemoteQuery * +make_remotequery(List *qptlist, + RangeTblEntry *rte, + List *qpqual, + Index scanrelid) +{ + RemoteQuery *node = makeNode(RemoteQuery); + Plan *plan = &node->scan.plan; + + /* cost should be inserted by caller */ + plan->targetlist = qptlist; + plan->qual = qpqual; + plan->lefttree = NULL; + plan->righttree = NULL; + node->scan.scanrelid = scanrelid; + node->read_only = true; + + return node; +} +#endif + Append * make_append(List *appendplans, bool isTarget, List *tlist) { diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index d839c28..cab7fb4 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -22,6 +22,9 @@ #include "optimizer/clauses.h" #include "optimizer/planmain.h" #include "optimizer/tlist.h" +#ifdef PGXC +#include "pgxc/planner.h" +#endif #include "parser/parsetree.h" #include "utils/lsyscache.h" #include "utils/syscache.h" @@ -373,6 +376,19 @@ set_plan_refs(PlannerGlobal *glob, Plan *plan, int rtoffset) fix_scan_list(glob, splan->scan.plan.qual, rtoffset); } break; +#ifdef PGXC + case T_RemoteQuery: + { + RemoteQuery *splan = (RemoteQuery *) plan; + + splan->scan.scanrelid += rtoffset; + splan->scan.plan.targetlist = + fix_scan_list(glob, splan->scan.plan.targetlist, rtoffset); + splan->scan.plan.qual = + fix_scan_list(glob, splan->scan.plan.qual, rtoffset); + } + break; +#endif case T_NestLoop: case 
T_MergeJoin: case T_HashJoin: diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index bde351d..e5c6dac 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -1963,6 +1963,12 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params) bms_add_member(context.paramids, ((WorkTableScan *) plan)->wtParam); break; +#ifdef PGXC + case T_RemoteQuery: + //PGXCTODO + context.paramids = bms_add_members(context.paramids, valid_params); + break; +#endif case T_Append: { diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index b7c3d3c..cbf7618 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1310,6 +1310,28 @@ create_worktablescan_path(PlannerInfo *root, RelOptInfo *rel) return pathnode; } +#ifdef PGXC +/* + * create_remotequery_path + * Creates a path corresponding to a scan of a remote query, + * returning the pathnode. + */ +Path * +create_remotequery_path(PlannerInfo *root, RelOptInfo *rel) +{ + Path *pathnode = makeNode(Path); + + pathnode->pathtype = T_RemoteQuery; + pathnode->parent = rel; + pathnode->pathkeys = NIL; /* result is always unordered */ + + // PGXCTODO - set cost properly + cost_seqscan(pathnode, root, rel); + + return pathnode; +} +#endif + /* * create_nestloop_path * Creates a pathnode corresponding to a nestloop join between two diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index 1dcfc29..c8911b7 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -25,6 +25,7 @@ #include "nodes/nodes.h" #include "nodes/parsenodes.h" #include "optimizer/clauses.h" +#include "optimizer/planmain.h" #include "optimizer/planner.h" #include "optimizer/tlist.h" #include "parser/parse_agg.h" @@ -141,7 +142,7 @@ bool StrictSelectChecking = false; static Exec_Nodes *get_plan_nodes(Query *query, bool isRead); static bool get_plan_nodes_walker(Node *query_node, XCWalkerContext *context); static bool examine_conditions_walker(Node *expr_node, XCWalkerContext *context); - +static int handle_limit_offset(RemoteQuery *query_step, Query *query, PlannedStmt *plan_stmt); /* * True if both lists contain only one node and are the same @@ -1528,16 +1529,6 @@ get_simple_aggregates(Query * query) simple_agg_list = lappend(simple_agg_list, simple_agg); } - else - { - /* - * PGXCTODO relax this limit after adding GROUP BY support - * then support expressions of aggregates - */ - ereport(ERROR, - (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), - (errmsg("Query is not yet supported")))); - } column_pos++; } } @@ -1629,7 +1620,7 @@ reconstruct_step_query(List *rtable, bool has_order_by, List *extra_sort, { List *context; bool useprefix; - List *sub_tlist = step->plan.targetlist; + List *sub_tlist = step->scan.plan.targetlist; ListCell *l; StringInfo buf = makeStringInfo(); char *sql; @@ -1737,7 +1728,7 @@ make_simple_sort_from_sortclauses(Query *query, RemoteQuery *step) { List *sortcls = query->sortClause; List *distinctcls = query->distinctClause; - List *sub_tlist = step->plan.targetlist; + List *sub_tlist = step->scan.plan.targetlist; SimpleSort *sort; SimpleDistinct *distinct; ListCell *l; @@ -1978,6 +1969,100 @@ make_simple_sort_from_sortclauses(Query *query, RemoteQuery *step) } /* + * Special case optimization. + * Handle LIMIT and OFFSET for single-step queries on multiple nodes. + * + * Return non-zero if we need to fall back to the standard plan. 
+ */ +static int +handle_limit_offset(RemoteQuery *query_step, Query *query, PlannedStmt *plan_stmt) +{ + + /* check if no special handling needed */ + if (query_step && query_step->exec_nodes && + list_length(query_step->exec_nodes->nodelist) <= 1) + return 0; + + /* if order by and limit are present, do not optimize yet */ + if ((query->limitCount || query->limitOffset) && query->sortClause) + return 1; + + /* + * Note that query_step->is_single_step is set to true, but + * it is ok even if we add limit here. + * If OFFSET is set, we strip the final offset value and add + * it to the LIMIT passed down. If there is an OFFSET and no + * LIMIT, we just strip off OFFSET. + */ + if (query->limitOffset) + { + int64 newLimit = 0; + char *newpos; + char *pos; + char *limitpos; + char *newQuery; + char *newchar; + char *c; + + pos = NULL; + newpos = NULL; + + if (query->limitCount) + { + for (pos = query_step->sql_statement, newpos = pos; newpos != NULL; ) + { + pos = newpos; + newpos = strcasestr(pos+1, "LIMIT"); + } + limitpos = pos; + + if (IsA(query->limitCount, Const)) + newLimit = DatumGetInt64(((Const *) query->limitCount)->constvalue); + else + return 1; + } + + for (pos = query_step->sql_statement, newpos = pos; newpos != NULL; ) + { + pos = newpos; + newpos = strcasestr(pos+1, "OFFSET"); + } + + if (limitpos && limitpos < pos) + pos = limitpos; + + if (IsA(query->limitOffset, Const)) + newLimit += DatumGetInt64(((Const *) query->limitOffset)->constvalue); + else + return 1; + + if (!pos || pos == query_step->sql_statement) + elog(ERROR, "Could not handle LIMIT/OFFSET"); + + newQuery = (char *) palloc(strlen(query_step->sql_statement)+1); + newchar = newQuery; + + /* copy up until position where we found clause */ + for (c = &query_step->sql_statement[0]; c != pos && *c != '\0'; *newchar++ = *c++); + + if (query->limitCount) + sprintf(newchar, "LIMIT %I64d", newLimit); + else + *newchar = '\0'; + + pfree(query_step->sql_statement); + query_step->sql_statement = newQuery; + } + + /* Now add a limit execution node at the top of the plan */ + plan_stmt->planTree = (Plan *) make_limit(plan_stmt->planTree, + query->limitOffset, query->limitCount, 0, 0); + + return 0; +} + + +/* * Build up a QueryPlan to execute on. 
* * For the prototype, there will only be one step, @@ -1997,6 +2082,7 @@ pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) Plan *standardPlan = result->planTree; RemoteQuery *query_step = makeNode(RemoteQuery); + query_step->is_single_step = false; query_step->sql_statement = pstrdup(query->sql_statement); query_step->exec_nodes = NULL; query_step->combine_type = COMBINE_TYPE_NONE; @@ -2020,21 +2106,6 @@ pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) ereport(ERROR, (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), (errmsg("INTO clause not yet supported")))); - - if (query->setOperations) - ereport(ERROR, - (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), - (errmsg("UNION, INTERSECT and EXCEPT are not yet supported")))); - - if (query->hasRecursive) - ereport(ERROR, - (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), - (errmsg("WITH RECURSIVE not yet supported")))); - - if (query->hasWindowFuncs) - ereport(ERROR, - (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), - (errmsg("Window functions not yet supported")))); /* fallthru */ case T_InsertStmt: case T_UpdateStmt: @@ -2043,14 +2114,32 @@ pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) if (query_step->exec_nodes == NULL) { + /* Do not yet allow multi-node correlated UPDATE or DELETE */ + if ((query->nodeTag == T_UpdateStmt || query->nodeTag == T_DeleteStmt)) + { + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + (errmsg("Complex and correlated UPDATE and DELETE not yet supported")))); + } + /* - * Processing guery against catalog tables, restore - * standard plan + * Processing guery against catalog tables, or multi-step command. + * Restore standard plan */ result->planTree = standardPlan; return result; } + /* Do not yet allow multi-node correlated UPDATE or DELETE */ + if ((query->nodeTag == T_UpdateStmt || query->nodeTag == T_DeleteStmt) + && !query_step->exec_nodes + && list_length(query->rtable) > 1) + { + result->planTree = standardPlan; + return result; + } + + query_step->is_single_step = true; /* * PGXCTODO * When Postgres runs insert into t (a) values (1); against table @@ -2064,7 +2153,7 @@ pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) * then call standard planner and take targetList from the plan * generated by Postgres. */ - query_step->plan.targetlist = standardPlan->targetlist; + query_step->scan.plan.targetlist = standardPlan->targetlist; if (query_step->exec_nodes) query_step->combine_type = get_plan_combine_type( @@ -2075,39 +2164,36 @@ pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) query_step->simple_aggregates = get_simple_aggregates(query); /* - * Add sortring to the step + * Add sorting to the step */ if (list_length(query_step->exec_nodes->nodelist) > 1 && (query->sortClause || query->distinctClause)) make_simple_sort_from_sortclauses(query, query_step); - /* - * PG-XC cannot yet support some variations of SQL statements. - * We perform some checks to at least catch common cases - */ + /* Handle LIMIT and OFFSET for single-step queries on multiple nodes*/ + if (handle_limit_offset(query_step, query, result)) + { + /* complicated expressions, just fallback to standard plan */ + result->planTree = standardPlan; + return result; + } + /* + * Use standard plan if we have more than one data node with either + * group by, hasWindowFuncs, or hasRecursive + */ /* - * Check if we have multiple nodes and an unsupported clause. 
- * is temporary until we expand supported SQL
+ * PGXCTODO - this could be improved to check if the first
+ * group by expression is the partitioning column, in which
+ * case it is ok to treat as a single step.
 */
- if (query->nodeTag == T_SelectStmt)
+ if (query->nodeTag == T_SelectStmt
+ && query_step->exec_nodes
+ && list_length(query_step->exec_nodes->nodelist) > 1
+ && (query->groupClause || query->hasWindowFuncs || query->hasRecursive))
 {
- if (StrictStatementChecking && query_step->exec_nodes
- && list_length(query_step->exec_nodes->nodelist) > 1)
- {
- /*
- * PGXCTODO - this could be improved to check if the first
- * group by expression is the partitioning column
- */
- if (query->groupClause)
- ereport(ERROR,
- (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
- (errmsg("Multi-node GROUP BY not yet supported"))));
- if (query->limitCount && StrictSelectChecking)
- ereport(ERROR,
- (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
- (errmsg("Multi-node LIMIT not yet supported"))));
- }
+ result->planTree = standardPlan;
+ return result;
 }
 break;
 default:
diff --git a/src/backend/pgxc/pool/Makefile b/src/backend/pgxc/pool/Makefile
index e875303..c7e950a 100644
--- a/src/backend/pgxc/pool/Makefile
+++ b/src/backend/pgxc/pool/Makefile
@@ -14,6 +14,6 @@ subdir = src/backend/pgxc/pool
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
-OBJS = datanode.o execRemote.o poolmgr.o poolcomm.o
+OBJS = datanode.o execRemote.o poolmgr.o poolcomm.o postgresql_fdw.o
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c
index 0f16c51..43569e0 100644
--- a/src/backend/pgxc/pool/execRemote.c
+++ b/src/backend/pgxc/pool/execRemote.c
@@ -30,6 +30,8 @@
 #include "utils/tuplesort.h"
 #include "utils/snapmgr.h"
+extern char *deparseSql(RemoteQueryState *scanstate);
+
 /*
 * Buffer size does not affect performance significantly, just do not allow
 * the connection buffer to grow infinitely
 */
@@ -1461,8 +1463,8 @@ DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot, bool is_
 {
 if (need_tran)
 DataNodeCopyFinish(connections, 0, COMBINE_TYPE_NONE);
- else
- if (!PersistentConnections) release_handles();
+ else if (!PersistentConnections)
+ release_handles();
 }
 pfree(connections);
@@ -1812,21 +1814,44 @@ ExecCountSlotsRemoteQuery(RemoteQuery *node)
 RemoteQueryState *
 ExecInitRemoteQuery(RemoteQuery *node, EState *estate, int eflags)
 {
- RemoteQueryState *remotestate;
+ RemoteQueryState *remotestate;
+ Relation currentRelation;
+
 remotestate = CreateResponseCombiner(0, node->combine_type);
 remotestate->ss.ps.plan = (Plan *) node;
 remotestate->ss.ps.state = estate;
 remotestate->simple_aggregates = node->simple_aggregates;
+ remotestate->ss.ps.qual = (List *)
+ ExecInitExpr((Expr *) node->scan.plan.qual,
+ (PlanState *) remotestate);
+
 ExecInitResultTupleSlot(estate, &remotestate->ss.ps);
- if (node->plan.targetlist)
+ if (node->scan.plan.targetlist)
 {
- TupleDesc typeInfo = ExecCleanTypeFromTL(node->plan.targetlist, false);
+ TupleDesc typeInfo = ExecCleanTypeFromTL(node->scan.plan.targetlist, false);
 ExecSetSlotDescriptor(remotestate->ss.ps.ps_ResultTupleSlot, typeInfo);
 }
 ExecInitScanTupleSlot(estate, &remotestate->ss);
+
+ /*
+ * Initialize scan relation. Get the relation object id from the
+ * relid'th entry in the range table, open that relation and acquire
+ * appropriate lock on it.
+ * This is needed for deparseSql().
+ * We should remove these lines once we plan and deparse earlier.
+ */
+ if (!node->is_single_step)
+ {
+ currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid);
+ remotestate->ss.ss_currentRelation = currentRelation;
+ ExecAssignScanType(&remotestate->ss, RelationGetDescr(currentRelation));
+ }
+
+ remotestate->ss.ps.ps_TupFromTlist = false;
+
 /*
 * Tuple description for the scan slot will be set at runtime from
 * a RowDescription message
@@ -1991,7 +2016,6 @@
 TupleTableSlot *
 ExecRemoteQuery(RemoteQueryState *node)
 {
 RemoteQuery *step = (RemoteQuery *) node->ss.ps.plan;
- EState *estate = node->ss.ps.state;
 TupleTableSlot *resultslot = node->ss.ps.ps_ResultTupleSlot;
 TupleTableSlot *scanslot = node->ss.ss_ScanTupleSlot;
 bool have_tuple = false;
@@ -2092,6 +2116,11 @@
 data_node_begin(new_count, new_connections, gxid);
 }
+ /* Get the SQL string; only do this if not single-step */
+ if (!step->is_single_step)
+ step->sql_statement = deparseSql(node);
+
 /* See if we have a primary node; execute on it first, before the others */
 if (primaryconnection)
 {
@@ -2427,12 +2456,35 @@ ExecEndRemoteQuery(RemoteQueryState *node)
 if (outerPlanState(node))
 ExecEndNode(outerPlanState(node));
+ if (node->ss.ss_currentRelation)
+ ExecCloseScanRelation(node->ss.ss_currentRelation);
+
 if (node->tmp_ctx)
 MemoryContextDelete(node->tmp_ctx);
 CloseCombiner(node);
 }
+
+/* ----------------------------------------------------------------
+ * ExecRemoteQueryReScan
+ *
+ * Rescans the relation.
+ * ----------------------------------------------------------------
+ */
+void
+ExecRemoteQueryReScan(RemoteQueryState *node, ExprContext *exprCtxt)
+{
+ /* At the moment we materialize results for multi-step queries,
+ * so no need to support rescan.
+ // PGXCTODO - rerun Init?
+ //node->routine->ReOpen(node);
+
+ //ExecScanReScan((ScanState *) node);
+ */
+}
+
+
 /*
 * Execute utility statement on multiple data nodes
 * It does approximately the same as
diff --git a/src/backend/pgxc/pool/postgresql_fdw.c b/src/backend/pgxc/pool/postgresql_fdw.c
new file mode 100644
index 0000000..9e418be
--- /dev/null
+++ b/src/backend/pgxc/pool/postgresql_fdw.c
@@ -0,0 +1,335 @@
+/*-------------------------------------------------------------------------
+ *
+ * postgresql_fdw.c
+ * foreign-data wrapper for PostgreSQL
+ *
+ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "catalog/pg_operator.h"
+#include "catalog/pg_proc.h"
+#include "funcapi.h"
+//#include "libpq-fe.h"
+#include "mb/pg_wchar.h"
+#include "miscadmin.h"
+#include "nodes/nodeFuncs.h"
+#include "nodes/makefuncs.h"
+#include "optimizer/clauses.h"
+#include "parser/scansup.h"
+#include "pgxc/execRemote.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/syscache.h"
+
+//#include "dblink.h"
+
+#define DEBUG_FDW
+
+/*
+ * WHERE clause optimization level
+ */
+#define EVAL_QUAL_LOCAL 0 /* evaluate none in foreign, all in local */
+#define EVAL_QUAL_BOTH 1 /* evaluate some in foreign, all in local */
+#define EVAL_QUAL_FOREIGN 2 /* evaluate some in foreign, rest in local */
+
+#define OPTIMIZE_WHERE_CLAUSE EVAL_QUAL_FOREIGN
+
+
+
+/* deparse SQL from the request */
+static bool is_immutable_func(Oid funcid);
+static bool is_foreign_qual(ExprState *state);
+static bool foreign_qual_walker(Node *node, void *context);
+char *deparseSql(RemoteQueryState
*scanstate);
+
+
+/*
+ * Check whether the function is IMMUTABLE.
+ */
+static bool
+is_immutable_func(Oid funcid)
+{
+ HeapTuple tp;
+ bool isnull;
+ Datum datum;
+
+ tp = SearchSysCache(PROCOID, ObjectIdGetDatum(funcid), 0, 0, 0);
+ if (!HeapTupleIsValid(tp))
+ elog(ERROR, "cache lookup failed for function %u", funcid);
+
+ datum = SysCacheGetAttr(PROCOID, tp, Anum_pg_proc_provolatile, &isnull);
+
+#ifdef DEBUG_FDW
+ /* print function name and its immutability */
+ {
+ char *proname;
+ Datum namedatum;
+
+ namedatum = SysCacheGetAttr(PROCOID, tp, Anum_pg_proc_proname, &isnull);
+ proname = pstrdup(DatumGetName(namedatum)->data);
+ elog(DEBUG1, "func %s(%u) is%s immutable", proname, funcid,
+ (DatumGetChar(datum) == PROVOLATILE_IMMUTABLE) ? "" : " not");
+ pfree(proname);
+ }
+#endif
+
+ ReleaseSysCache(tp);
+
+ return (DatumGetChar(datum) == PROVOLATILE_IMMUTABLE);
+}
+
+/*
+ * Check whether the ExprState node should be evaluated in the foreign server.
+ *
+ * An expression which consists of expressions below will be evaluated in
+ * the foreign server.
+ * - constant value
+ * - variable (foreign table column)
+ * - external parameter (parameter of prepared statement)
+ * - array
+ * - bool expression (AND/OR/NOT)
+ * - NULL test (IS [NOT] NULL)
+ * - operator
+ * - IMMUTABLE only
+ * - It is required that the meaning of the operator be the same in the
+ * foreign server as on the local server.
+ * - function
+ * - IMMUTABLE only
+ * - It is required that the meaning of the function be the same in the
+ * foreign server as on the local server.
+ * - scalar array operator (ANY/ALL)
+ */
+static bool
+is_foreign_qual(ExprState *state)
+{
+ return !foreign_qual_walker((Node *) state->expr, NULL);
+}
+
+/*
+ * Return true if the node cannot be evaluated in the foreign server.
+ */
+static bool
+foreign_qual_walker(Node *node, void *context)
+{
+ if (node == NULL)
+ return false;
+
+ switch (nodeTag(node))
+ {
+ case T_Param:
+ /* TODO: pass internal parameters to the foreign server */
+ if (((Param *) node)->paramkind != PARAM_EXTERN)
+ return true;
+ break;
+ case T_DistinctExpr:
+ case T_OpExpr:
+ /*
+ * An operator which uses an IMMUTABLE function can be evaluated in
+ * the foreign server. It is not necessary to worry about oprrest
+ * and oprjoin here because they are invoked by the planner but not
+ * the executor. DistinctExpr is a typedef of OpExpr.
+ */
+ if (!is_immutable_func(((OpExpr*) node)->opfuncid))
+ return true;
+ break;
+ case T_ScalarArrayOpExpr:
+ if (!is_immutable_func(((ScalarArrayOpExpr*) node)->opfuncid))
+ return true;
+ break;
+ case T_FuncExpr:
+ /* an IMMUTABLE function can be evaluated in the foreign server */
+ if (!is_immutable_func(((FuncExpr*) node)->funcid))
+ return true;
+ break;
+ case T_TargetEntry:
+ case T_PlaceHolderVar:
+ case T_AppendRelInfo:
+ case T_PlaceHolderInfo:
+ /* TODO: research whether those complex nodes can be evaluated. */
+ return true;
+ default:
+ break;
+ }
+
+ return expression_tree_walker(node, foreign_qual_walker, context);
+}
+
+/*
+ * Deparse SQL string from query request.
+ *
+ * The expressions in Plan.qual are deparsed when they satisfy is_foreign_qual()
+ * and are removed.
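+ *
+ * For example (illustrative, not part of the original patch): given
+ * WHERE a > 10 AND random() < 0.5, the immutable comparison a > 10 can
+ * be shipped in the remote WHERE clause, while the volatile random()
+ * test has to stay in the local qual.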
+ */
+char *
+deparseSql(RemoteQueryState *scanstate)
+{
+ EState *estate = scanstate->ss.ps.state;
+ bool prefix;
+ List *context;
+ StringInfoData sql;
+ RemoteQuery *scan;
+ RangeTblEntry *rte;
+ Oid nspid;
+ char *nspname;
+ char *relname;
+ const char *nspname_q;
+ const char *relname_q;
+ const char *aliasname_q;
+ int i;
+ TupleDesc tupdesc;
+ bool first;
+
+elog(DEBUG2, "%s(%u) called", __FUNCTION__, __LINE__);
+
+ /* extract RemoteQuery and RangeTblEntry */
+ scan = (RemoteQuery *)scanstate->ss.ps.plan;
+ rte = list_nth(estate->es_range_table, scan->scan.scanrelid - 1);
+
+ /* prepare to deparse plan */
+ initStringInfo(&sql);
+ context = deparse_context_for_plan((Node *)scan, NULL,
+ estate->es_range_table, NULL);
+
+ /*
+ * Scanning multiple relations in a RemoteQuery node is not supported.
+ */
+ prefix = false;
+#if 0
+ prefix = list_length(estate->es_range_table) > 1;
+#endif
+
+ /* Get quoted names of schema, table and alias */
+ nspid = get_rel_namespace(rte->relid);
+ nspname = get_namespace_name(nspid);
+ relname = get_rel_name(rte->relid);
+ nspname_q = quote_identifier(nspname);
+ relname_q = quote_identifier(relname);
+ aliasname_q = quote_identifier(rte->eref->aliasname);
+
+ /* deparse SELECT clause */
+ appendStringInfo(&sql, "SELECT ");
+
+ /*
+ * TODO: omit (deparse to "NULL") columns which are not used in the
+ * original SQL.
+ *
+ * We must parse nodes parents of this RemoteQuery node to determine unused
+ * columns because some columns may be used only in parent Sort/Agg/Limit
+ * nodes.
+ */
+ tupdesc = scanstate->ss.ss_currentRelation->rd_att;
+ first = true;
+ for (i = 0; i < tupdesc->natts; i++)
+ {
+ /* skip dropped attributes */
+ if (tupdesc->attrs[i]->attisdropped)
+ continue;
+
+ if (!first)
+ appendStringInfoString(&sql, ", ");
+
+ if (prefix)
+ appendStringInfo(&sql, "%s.%s",
+ aliasname_q, tupdesc->attrs[i]->attname.data);
+ else
+ appendStringInfo(&sql, "%s", tupdesc->attrs[i]->attname.data);
+ first = false;
+ }
+
+ /* if target list is composed only of system attributes, add dummy column */
+ if (first)
+ appendStringInfo(&sql, "NULL");
+
+ /* deparse FROM clause */
+ appendStringInfo(&sql, " FROM ");
+ /*
+ * XXX: should use GENERIC OPTIONS like 'foreign_relname' or something for
+ * the foreign table name instead of the local name ?
+ */
+ appendStringInfo(&sql, "%s.%s %s", nspname_q, relname_q, aliasname_q);
+ pfree(nspname);
+ pfree(relname);
+ if (nspname_q != nspname)
+ pfree((char *) nspname_q);
+ if (relname_q != relname)
+ pfree((char *) relname_q);
+ if (aliasname_q != rte->eref->aliasname)
+ pfree((char *) aliasname_q);
+
+ /*
+ * deparse WHERE clause
+ *
+ * The expressions which satisfy is_foreign_qual() are deparsed into WHERE
+ * clause of result SQL string, and they could be removed from qual of
+ * PlanState to avoid duplicate evaluation at ExecScan().
+ *
+ * The Plan.qual is never changed, so multiple use of the Plan with
+ * PREPARE/EXECUTE works properly.
+ */
+#if OPTIMIZE_WHERE_CLAUSE > EVAL_QUAL_LOCAL
+ if (scanstate->ss.ps.plan->qual)
+ {
+ List *local_qual = NIL;
+ List *foreign_qual = NIL;
+ List *foreign_expr = NIL;
+ ListCell *lc;
+
+ /*
+ * Divide qual of PlanState into two lists, one for local evaluation
+ * and one for foreign evaluation.
+ */
+ foreach (lc, scanstate->ss.ps.qual)
+ {
+ ExprState *state = lfirst(lc);
+
+ if (is_foreign_qual(state))
+ {
+ elog(DEBUG1, "foreign qual: %s", nodeToString(state->expr));
+ foreign_qual = lappend(foreign_qual, state);
+ foreign_expr = lappend(foreign_expr, state->expr);
+ }
+ else
+ {
+ elog(DEBUG1, "local qual: %s", nodeToString(state->expr));
+ local_qual = lappend(local_qual, state);
+ }
+ }
+#if OPTIMIZE_WHERE_CLAUSE == EVAL_QUAL_FOREIGN
+ /*
+ * If the optimization level is EVAL_QUAL_FOREIGN, replace the original
+ * qual with the list of ExprStates which should be evaluated in the
+ * local server.
+ */
+ scanstate->ss.ps.qual = local_qual;
+#endif
+
+ /*
+ * Deparse quals to be evaluated in the foreign server if any.
+ * TODO: modify deparse_expression() to deparse conditions which use
+ * internal parameters.
+ */
+ if (foreign_expr != NIL)
+ {
+ Node *node;
+ node = (Node *) make_ands_explicit(foreign_expr);
+ appendStringInfo(&sql, " WHERE ");
+ appendStringInfoString(&sql,
+ deparse_expression(node, context, prefix, false));
+ /*
+ * The contents of the list MUST NOT be freed because they are
+ * referenced from the Plan.qual list.
+ */
+ list_free(foreign_expr);
+ }
+ }
+#endif
+
+ elog(DEBUG1, "deparsed SQL is \"%s\"", sql.data);
+
+ return sql.data;
+}
+
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 608755f..a6f4767 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -198,8 +198,6 @@ static void log_disconnections(int code, Datum arg);
 #ifdef PGXC
 /* PGXC_DATANODE */
-static void pgxc_transaction_stmt (Node *parsetree);
-
 /* ----------------------------------------------------------------
 * PG-XC routines
 * ----------------------------------------------------------------
diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c
index d1c01da..0c6208c 100644
--- a/src/backend/tcop/utility.c
+++ b/src/backend/tcop/utility.c
@@ -306,7 +306,6 @@ ProcessUtility(Node *parsetree,
 case TRANS_STMT_START:
 {
 ListCell *lc;
-
 #ifdef PGXC
 if (IS_PGXC_COORDINATOR)
 DataNodeBegin();
@@ -329,10 +328,6 @@ ProcessUtility(Node *parsetree,
 break;
 case TRANS_STMT_COMMIT:
-#ifdef PGXC
- if (IS_PGXC_COORDINATOR)
- DataNodeCommit();
-#endif
 if (!EndTransactionBlock())
 {
 /* report unsuccessful commit in completionTag */
@@ -361,10 +356,6 @@ ProcessUtility(Node *parsetree,
 break;
 case TRANS_STMT_ROLLBACK:
-#ifdef PGXC
- if (IS_PGXC_COORDINATOR)
- DataNodeBegin();
-#endif
 UserAbortTransactionBlock();
 break;
@@ -1055,21 +1046,16 @@ ProcessUtility(Node *parsetree,
 case T_ExplainStmt:
 ExplainQuery((ExplainStmt *) parsetree, queryString, params, dest);
-#ifdef PGXC
- if (IS_PGXC_COORDINATOR)
- {
- Exec_Nodes *nodes = (Exec_Nodes *) palloc0(sizeof(Exec_Nodes));
- nodes->nodelist = GetAnyDataNode();
- ExecUtilityStmtOnNodes(queryString, nodes, false);
- }
-#endif
 break;
 case T_VariableSetStmt:
 ExecSetVariableStmt((VariableSetStmt *) parsetree);
 #ifdef PGXC
+/* PGXCTODO - this currently causes an assertion failure.
+ We should change when we add SET handling properly if (IS_PGXC_COORDINATOR) ExecUtilityStmtOnNodes(queryString, NULL, false); +*/ #endif break; diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 11be226..29aed38 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -79,6 +79,9 @@ extern void cost_functionscan(Path *path, PlannerInfo *root, RelOptInfo *baserel); extern void cost_valuesscan(Path *path, PlannerInfo *root, RelOptInfo *baserel); +#ifdef PGXC +extern void cost_remotequery(Path *path, PlannerInfo *root, RelOptInfo *baserel); +#endif extern void cost_ctescan(Path *path, PlannerInfo *root, RelOptInfo *baserel); extern void cost_recursive_union(Plan *runion, Plan *nrterm, Plan *rterm); extern void cost_sort(Path *path, PlannerInfo *root, diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 0f4c52e..05efcaf 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -56,6 +56,9 @@ extern Path *create_functionscan_path(PlannerInfo *root, RelOptInfo *rel); extern Path *create_valuesscan_path(PlannerInfo *root, RelOptInfo *rel); extern Path *create_ctescan_path(PlannerInfo *root, RelOptInfo *rel); extern Path *create_worktablescan_path(PlannerInfo *root, RelOptInfo *rel); +#ifdef PGXC +extern Path *create_remotequery_path(PlannerInfo *root, RelOptInfo *rel); +#endif extern NestPath *create_nestloop_path(PlannerInfo *root, RelOptInfo *joinrel, diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index b7faa7d..143c8fa 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -95,6 +95,7 @@ extern void ExecRemoteUtility(RemoteQuery *node); extern int handle_response(DataNodeHandle * conn, RemoteQueryState *combiner); extern bool FetchTuple(RemoteQueryState *combiner, TupleTableSlot *slot); -extern int primary_data_node; +extern void ExecRemoteQueryReScan(RemoteQueryState *node, ExprContext *exprCtxt); -#endif \ No newline at end of file +extern int primary_data_node; +#endif diff --git a/src/include/pgxc/planner.h b/src/include/pgxc/planner.h index bf8f224..346dd65 100644 --- a/src/include/pgxc/planner.h +++ b/src/include/pgxc/planner.h @@ -58,7 +58,8 @@ typedef struct */ typedef struct { - Plan plan; + Scan scan; + bool is_single_step; /* special case, skip extra work */ char *sql_statement; Exec_Nodes *exec_nodes; CombineType combine_type; ----------------------------------------------------------------------- Summary of changes: src/backend/commands/explain.c | 31 +++ src/backend/commands/tablecmds.c | 14 ++ src/backend/executor/execAmi.c | 8 + src/backend/executor/nodeMaterial.c | 38 ++++ src/backend/optimizer/path/allpaths.c | 20 ++ src/backend/optimizer/plan/createplan.c | 99 +++++++++ src/backend/optimizer/plan/setrefs.c | 16 ++ src/backend/optimizer/plan/subselect.c | 6 + src/backend/optimizer/util/pathnode.c | 22 ++ src/backend/pgxc/plan/planner.c | 196 +++++++++++++----- src/backend/pgxc/pool/Makefile | 2 +- src/backend/pgxc/pool/execRemote.c | 64 ++++++- src/backend/pgxc/pool/postgresql_fdw.c | 335 +++++++++++++++++++++++++++++++ src/backend/tcop/postgres.c | 2 - src/backend/tcop/utility.c | 20 +-- src/include/optimizer/cost.h | 3 + src/include/optimizer/pathnode.h | 3 + src/include/pgxc/execRemote.h | 5 +- src/include/pgxc/planner.h | 3 +- 19 files changed, 803 insertions(+), 84 deletions(-) create mode 100644 src/backend/pgxc/pool/postgresql_fdw.c hooks/post-receive -- Postgres-XC |
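A note on the LIMIT/OFFSET pushdown in the commit above: when both clauses
are plain constants, handle_limit_offset() rewrites the statement shipped to
the data nodes so that each node returns up to limit + offset rows, and the
Limit node pushed on top of the plan re-applies the original OFFSET and LIMIT
to the merged stream on the coordinator. A minimal sketch of that arithmetic
(the helper name is illustrative, not part of the patch):

static int64
remote_row_target(int64 limit, int64 offset)
{
	/*
	 * For SELECT ... LIMIT 10 OFFSET 20 over several nodes, each node
	 * must be asked for up to 30 rows, because any single node could
	 * hold all 10 rows that survive the coordinator-side OFFSET 20.
	 */
	return limit + offset;
}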
From: mason_s <ma...@us...> - 2010-08-23 03:55:28
|
Project "Postgres-XC". The branch, master has been updated via cfb29183b57e811e0dfcf3641c5cc58458b2584a (commit) from fbaab7cc05f975cd6339918390fd22360744b08c (commit) - Log ----------------------------------------------------------------- commit cfb29183b57e811e0dfcf3641c5cc58458b2584a Author: M S <masonsharp@mason-sharps-macbook.local> Date: Mon Aug 23 12:53:55 2010 +0900 Portal integration changes. This integrates Postgres-XC code deeper into PostgreSQL. The Extended Query Protocol can now be used, which means that JDBC will now work. It also lays more groundwork for supporting multi-step queries (cross-node joins). Note that statements with parameters cannot yet be prepared and executed, only those without parameters will work. Note also that this patch introduces additional performance degradation because more processing occurs with each request. We will be working to address these issues in the coming weeks. Written by Andrei Martsinchyk diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 08e35ae..657413a 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -179,7 +179,6 @@ typedef struct CopyStateData /* Locator information */ RelationLocInfo *rel_loc; /* the locator key */ int hash_idx; /* index of the hash column */ - bool on_coord; DataNodeHandle **connections; /* Involved data node connections */ #endif @@ -800,31 +799,6 @@ CopyQuoteIdentifier(StringInfo query_buf, char *value) } #endif -#ifdef PGXC -/* - * In case there is no locator info available, copy to/from is launched in portal on coordinator. - * This happens for pg_catalog tables (not user defined ones) - * such as pg_catalog, pg_attribute, etc. - * This part is launched before the portal is activated, so check a first time if there - * some locator data for this relid and if no, return and launch the portal. - */ -bool -IsCoordPortalCopy(const CopyStmt *stmt) -{ - RelationLocInfo *rel_loc; /* the locator key */ - - /* In the case of a COPY SELECT, this is launched on datanodes */ - if(!stmt->relation) - return false; - - rel_loc = GetRelationLocInfo(RangeVarGetRelid(stmt->relation, true)); - - if (!rel_loc) - return true; - - return false; -} -#endif /* * DoCopy executes the SQL COPY statement @@ -857,11 +831,7 @@ IsCoordPortalCopy(const CopyStmt *stmt) * the table or the specifically requested columns. */ uint64 -#ifdef PGXC -DoCopy(const CopyStmt *stmt, const char *queryString, bool exec_on_coord_portal) -#else DoCopy(const CopyStmt *stmt, const char *queryString) -#endif { CopyState cstate; bool is_from = stmt->is_from; @@ -883,16 +853,6 @@ DoCopy(const CopyStmt *stmt, const char *queryString) /* Allocate workspace and zero all fields */ cstate = (CopyStateData *) palloc0(sizeof(CopyStateData)); -#ifdef PGXC - /* - * Copy to/from is initialized as being launched on datanodes - * This functionnality is particularly interesting to have a result for - * tables who have no locator informations such as pg_catalog, pg_class, - * and pg_attribute. - */ - cstate->on_coord = false; -#endif - /* Extract options from the statement node tree */ foreach(option, stmt->options) { @@ -1180,13 +1140,15 @@ DoCopy(const CopyStmt *stmt, const char *queryString) exec_nodes = (Exec_Nodes *) palloc0(sizeof(Exec_Nodes)); + /* + * If target table does not exists on nodes (e.g. system table) + * the location info returned is NULL. 
This is the criterion
+ * for running COPY on the coordinator
+ */
 cstate->rel_loc = GetRelationLocInfo(RelationGetRelid(cstate->rel));
- if (exec_on_coord_portal)
- cstate->on_coord = true;
-
 hash_att = GetRelationHashColumn(cstate->rel_loc);
- if (!cstate->on_coord)
+ if (cstate->rel_loc)
 {
 if (is_from || hash_att)
 exec_nodes->nodelist = list_copy(cstate->rel_loc->nodeList);
@@ -1481,7 +1443,7 @@ DoCopy(const CopyStmt *stmt, const char *queryString)
 * In the case of CopyOut, it is just necessary to pick up one node randomly.
 * This is done when rel_loc is found.
 */
- if (!cstate->on_coord)
+ if (cstate->rel_loc)
 {
 cstate->connections = DataNodeCopyBegin(cstate->query_buf.data,
 exec_nodes->nodelist,
@@ -1506,7 +1468,7 @@ DoCopy(const CopyStmt *stmt, const char *queryString)
 }
 PG_CATCH();
 {
- if (IS_PGXC_COORDINATOR && is_from && !cstate->on_coord)
+ if (IS_PGXC_COORDINATOR && is_from && cstate->rel_loc)
 {
 DataNodeCopyFinish(
 cstate->connections,
@@ -1519,18 +1481,13 @@ DoCopy(const CopyStmt *stmt, const char *queryString)
 PG_RE_THROW();
 }
 PG_END_TRY();
- if (IS_PGXC_COORDINATOR && is_from && !cstate->on_coord)
+ if (IS_PGXC_COORDINATOR && is_from && cstate->rel_loc)
 {
- if (cstate->rel_loc->locatorType == LOCATOR_TYPE_REPLICATED)
- cstate->processed = DataNodeCopyFinish(
- cstate->connections,
- primary_data_node,
- COMBINE_TYPE_SAME);
- else
- cstate->processed = DataNodeCopyFinish(
- cstate->connections,
- 0,
- COMBINE_TYPE_SUM);
+ bool replicated = cstate->rel_loc->locatorType == LOCATOR_TYPE_REPLICATED;
+ DataNodeCopyFinish(
+ cstate->connections,
+ replicated ? primary_data_node : 0,
+ replicated ? COMBINE_TYPE_SAME : COMBINE_TYPE_SUM);
 pfree(cstate->connections);
 pfree(cstate->query_buf.data);
 FreeRelationLocInfo(cstate->rel_loc);
@@ -1770,7 +1727,7 @@ CopyTo(CopyState cstate)
 }
 #ifdef PGXC
- if (IS_PGXC_COORDINATOR && !cstate->on_coord)
+ if (IS_PGXC_COORDINATOR && cstate->rel_loc)
 {
 cstate->processed = DataNodeCopyOut(
 GetRelationNodes(cstate->rel_loc, NULL, true),
@@ -2480,7 +2437,7 @@ CopyFrom(CopyState cstate)
 }
 #ifdef PGXC
- if (IS_PGXC_COORDINATOR && !cstate->on_coord)
+ if (IS_PGXC_COORDINATOR && cstate->rel_loc)
 {
 Datum *hash_value = NULL;
@@ -2494,6 +2451,7 @@ CopyFrom(CopyState cstate)
 ereport(ERROR,
 (errcode(ERRCODE_CONNECTION_EXCEPTION),
 errmsg("Copy failed on a data node")));
+ cstate->processed++;
 }
 else
 {
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index 131be22..847b556 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -858,6 +858,14 @@ InitPlan(QueryDesc *queryDesc, int eflags)
 {
 case CMD_SELECT:
 case CMD_INSERT:
+#ifdef PGXC
+ /*
+ * A PGXC RemoteQuery does not require the ctid junk field, so follow
+ * the standard procedure for UPDATE and DELETE
+ */
+ case CMD_UPDATE:
+ case CMD_DELETE:
+#endif
 foreach(tlist, plan->targetlist)
 {
 TargetEntry *tle = (TargetEntry *) lfirst(tlist);
@@ -869,10 +877,12 @@ InitPlan(QueryDesc *queryDesc, int eflags)
 }
 }
 break;
+#ifndef PGXC
 case CMD_UPDATE:
 case CMD_DELETE:
 junk_filter_needed = true;
 break;
+#endif
 default:
 break;
 }
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index 15af711..1affd6c 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -108,7 +108,9 @@
 #include "executor/nodeWindowAgg.h"
 #include "executor/nodeWorktablescan.h"
 #include "miscadmin.h"
-
+#ifdef PGXC
+#include "pgxc/execRemote.h"
+#endif
 /* ------------------------------------------------------------------------
* ExecInitNode @@ -286,6 +288,13 @@ ExecInitNode(Plan *node, EState *estate, int eflags) estate, eflags); break; +#ifdef PGXC + case T_RemoteQuery: + result = (PlanState *) ExecInitRemoteQuery((RemoteQuery *) node, + estate, eflags); + break; +#endif + default: elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); result = NULL; /* keep compiler quiet */ @@ -451,6 +460,12 @@ ExecProcNode(PlanState *node) result = ExecLimit((LimitState *) node); break; +#ifdef PGXC + case T_RemoteQueryState: + result = ExecRemoteQuery((RemoteQueryState *) node); + break; +#endif + default: elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); result = NULL; @@ -627,6 +642,11 @@ ExecCountSlotsNode(Plan *node) case T_Limit: return ExecCountSlotsLimit((Limit *) node); +#ifdef PGXC + case T_RemoteQuery: + return ExecCountSlotsRemoteQuery((RemoteQuery *) node); +#endif + default: elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); break; @@ -783,6 +803,12 @@ ExecEndNode(PlanState *node) ExecEndLimit((LimitState *) node); break; +#ifdef PGXC + case T_RemoteQueryState: + ExecEndRemoteQuery((RemoteQueryState *) node); + break; +#endif + default: elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); break; diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 0a8d783..8dd924d 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -38,6 +38,10 @@ #include "parser/parse_expr.h" #include "parser/parse_oper.h" #include "parser/parsetree.h" +#ifdef PGXC +#include "pgxc/pgxc.h" +#include "pgxc/planner.h" +#endif #include "utils/lsyscache.h" #include "utils/syscache.h" @@ -119,7 +123,12 @@ planner(Query *parse, int cursorOptions, ParamListInfo boundParams) if (planner_hook) result = (*planner_hook) (parse, cursorOptions, boundParams); else - result = standard_planner(parse, cursorOptions, boundParams); +#ifdef PGXC + if (IS_PGXC_COORDINATOR) + result = pgxc_planner(parse, cursorOptions, boundParams); + else +#endif + result = standard_planner(parse, cursorOptions, boundParams); return result; } diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index b5be190..5b2e03f 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -39,6 +39,11 @@ #include "parser/parse_target.h" #include "parser/parsetree.h" #include "rewrite/rewriteManip.h" +#ifdef PGXC +#include "pgxc/pgxc.h" +#include "pgxc/planner.h" +#include "tcop/tcopprot.h" +#endif #include "utils/rel.h" @@ -60,6 +65,10 @@ static Query *transformDeclareCursorStmt(ParseState *pstate, DeclareCursorStmt *stmt); static Query *transformExplainStmt(ParseState *pstate, ExplainStmt *stmt); +#ifdef PGXC +static Query *transformExecDirectStmt(ParseState *pstate, ExecDirectStmt *stmt); +#endif + static void transformLockingClause(ParseState *pstate, Query *qry, LockingClause *lc); static bool check_parameter_resolution_walker(Node *node, ParseState *pstate); @@ -206,6 +215,13 @@ transformStmt(ParseState *pstate, Node *parseTree) (ExplainStmt *) parseTree); break; +#ifdef PGXC + case T_ExecDirectStmt: + result = transformExecDirectStmt(pstate, + (ExecDirectStmt *) parseTree); + break; +#endif + default: /* @@ -270,6 +286,17 @@ analyze_requires_snapshot(Node *parseTree) result = true; break; +#ifdef PGXC + case T_ExecDirectStmt: + + /* + * We will parse/analyze/plan inner query, which probably will + * need a snapshot. Ensure it is set. 
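+ * (For example, an EXECUTE DIRECT wrapping a SELECT parses and plans
+ * that inner SELECT, which reads system catalogs and therefore needs
+ * an active snapshot.)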
+ */
+ result = true;
+ break;
+#endif
+
 default:

 /* utility statements don't have any active parse analysis */
 result = false;
@@ -2025,6 +2052,25 @@ transformExplainStmt(ParseState *pstate, ExplainStmt *stmt)
 return result;
 }

+#ifdef PGXC
+/*
+ * transformExecDirectStmt -
+ * transform an EXECUTE DIRECT Statement
+ *
+ * Handling depends on whether we should execute on the nodes or on the
+ * coordinator.
+ * To execute on the nodes we return a CMD_UTILITY query having one
+ * T_RemoteQuery node with the inner statement as its SQL command.
+ * If the statement is to run on the coordinator, we parse the inner
+ * statement and analyze the resulting query tree.
+ */
+static Query *
+transformExecDirectStmt(ParseState *pstate, ExecDirectStmt *stmt)
+{
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("Support for EXECUTE DIRECT is temporarily broken")));
+}
+#endif

 /* exported so planner can check again after rewriting, query pullup, etc */
 void
diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c
index 2608a3f..f47cc6a 100644
--- a/src/backend/parser/parse_utilcmd.c
+++ b/src/backend/parser/parse_utilcmd.c
@@ -52,6 +52,7 @@
 #ifdef PGXC
 #include "pgxc/locator.h"
 #include "pgxc/pgxc.h"
+#include "pgxc/planner.h"
 #endif
 #include "rewrite/rewriteManip.h"
@@ -261,9 +262,9 @@ transformCreateStmt(CreateStmt *stmt, const char *queryString)
 result = list_concat(result, save_alist);
 #ifdef PGXC
- /*
- * If the user did not specify any distribution clause and there is no
- * inherits clause, try and use PK or unique index
+ /*
+ * If the user did not specify any distribution clause and there is no
+ * inherits clause, try and use PK or unique index
 */
 if (!stmt->distributeby && !stmt->inhRelations && cxt.fallback_dist_col)
 {
@@ -271,6 +272,13 @@ transformCreateStmt(CreateStmt *stmt, const char *queryString)
 stmt->distributeby->disttype = DISTTYPE_HASH;
 stmt->distributeby->colname = cxt.fallback_dist_col;
 }
+ if (IS_PGXC_COORDINATOR)
+ {
+ RemoteQuery *step = makeNode(RemoteQuery);
+ step->combine_type = COMBINE_TYPE_SAME;
+ step->sql_statement = queryString;
+ result = lappend(result, step);
+ }
 #endif
 return result;
 }
@@ -1171,7 +1179,7 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt)
 {
 if (cxt->distributeby)
 isLocalSafe = CheckLocalIndexColumn (
- ConvertToLocatorType(cxt->distributeby->disttype),
+ ConvertToLocatorType(cxt->distributeby->disttype),
 cxt->distributeby->colname, key);
 }
 #endif
@@ -1273,7 +1281,7 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt)
 {
 /*
 * Set fallback distribution column.
- * If not set, set it to first column in index.
+ * If not set, set it to first column in index.
 * If primary key, we prefer that over a unique constraint.
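 * For example (illustrative): CREATE TABLE t (a int PRIMARY KEY, b int UNIQUE)
 * with no DISTRIBUTE BY clause falls back to hash distribution on a.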
*/ if (index->indexParams == NIL @@ -1281,7 +1289,7 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt) { cxt->fallback_dist_col = pstrdup(key); } - + /* Existing table, check if it is safe */ if (!cxt->distributeby && !isLocalSafe) isLocalSafe = CheckLocalIndexColumn ( @@ -1299,7 +1307,7 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt) index->indexParams = lappend(index->indexParams, iparam); } #ifdef PGXC - if (IS_PGXC_COORDINATOR && cxt->distributeby + if (IS_PGXC_COORDINATOR && cxt->distributeby && cxt->distributeby->disttype == DISTTYPE_HASH && !isLocalSafe) ereport(ERROR, (errcode(ERRCODE_INVALID_COLUMN_REFERENCE), @@ -1618,7 +1626,7 @@ transformRuleStmt(RuleStmt *stmt, const char *queryString, ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("Rule may not use NOTIFY, it is not yet supported"))); - + #endif /* * Since outer ParseState isn't parent of inner, have to pass down @@ -1956,7 +1964,15 @@ transformAlterTableStmt(AlterTableStmt *stmt, const char *queryString) result = lappend(cxt.blist, stmt); result = list_concat(result, cxt.alist); result = list_concat(result, save_alist); - +#ifdef PGXC + if (IS_PGXC_COORDINATOR) + { + RemoteQuery *step = makeNode(RemoteQuery); + step->combine_type = COMBINE_TYPE_SAME; + step->sql_statement = queryString; + result = lappend(result, step); + } +#endif return result; } diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index 002e710..1dcfc29 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -25,6 +25,7 @@ #include "nodes/nodes.h" #include "nodes/parsenodes.h" #include "optimizer/clauses.h" +#include "optimizer/planner.h" #include "optimizer/tlist.h" #include "parser/parse_agg.h" #include "parser/parse_coerce.h" @@ -116,7 +117,7 @@ typedef struct ColumnBase */ typedef struct XCWalkerContext { - Query *query; + Query *query; bool isRead; Exec_Nodes *exec_nodes; /* resulting execution nodes */ Special_Conditions *conditions; @@ -125,6 +126,7 @@ typedef struct XCWalkerContext int varno; bool within_or; bool within_not; + bool exec_on_coord; /* fallback to standard planner to have plan executed on coordinator only */ List *join_list; /* A list of List*'s, one for each relation. 
 */
} XCWalkerContext;
@@ -971,6 +973,7 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context)
 /* just pg_catalog tables */
 context->exec_nodes = (Exec_Nodes *) palloc0(sizeof(Exec_Nodes));
 context->exec_nodes->tableusagetype = TABLE_USAGE_TYPE_PGCATALOG;
+ context->exec_on_coord = true;
 return false;
 }
@@ -1087,6 +1090,7 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context)
 {
 context->exec_nodes = (Exec_Nodes *) palloc0(sizeof(Exec_Nodes));
 context->exec_nodes->tableusagetype = TABLE_USAGE_TYPE_PGCATALOG;
+ context->exec_on_coord = true;
 return false;
 }
@@ -1253,7 +1257,7 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context)
 static Exec_Nodes *
 get_plan_nodes(Query *query, bool isRead)
 {
- Exec_Nodes *result_nodes;
+ Exec_Nodes *result_nodes = NULL;
 XCWalkerContext context;
@@ -1267,13 +1271,16 @@ get_plan_nodes(Query *query, bool isRead)
 context.varno = 0;
 context.within_or = false;
 context.within_not = false;
+ context.exec_on_coord = false;
 context.join_list = NIL;
- if (get_plan_nodes_walker((Node *) query, &context))
- result_nodes = NULL;
- else
+ if (!get_plan_nodes_walker((Node *) query, &context))
 result_nodes = context.exec_nodes;
-
+ if (context.exec_on_coord && result_nodes)
+ {
+ pfree(result_nodes);
+ result_nodes = NULL;
+ }
 free_special_relations(context.conditions);
 free_join_list(context.join_list);
 return result_nodes;
}
@@ -1976,68 +1983,89 @@ make_simple_sort_from_sortclauses(Query *query, RemoteQuery *step)
 * For the prototype, there will only be one step,
 * and the nodelist will be NULL if it is not a PGXC-safe statement.
 */
-Query_Plan *
-GetQueryPlan(Node *parsetree, const char *sql_statement, List *querytree_list)
+PlannedStmt *
+pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams)
{
- Query_Plan *query_plan = palloc(sizeof(Query_Plan));
+ /*
+ * We waste some time invoking the standard planner, but it gives us a
+ * good enough PlannedStmt; we just need to replace the standard plan.
+ * In the future we may want to skip the standard_planner invocation and
+ * initialize the PlannedStmt here. At the moment not all queries work
+ * that way: e.g., there was a problem with INSERT into a subset of
+ * table columns
+ */
+ PlannedStmt *result = standard_planner(query, cursorOptions, boundParams);
+ Plan *standardPlan = result->planTree;
 RemoteQuery *query_step = makeNode(RemoteQuery);
- Query *query;
- query_step->sql_statement = (char *) palloc(strlen(sql_statement) + 1);
- strcpy(query_step->sql_statement, sql_statement);
+ query_step->sql_statement = pstrdup(query->sql_statement);
 query_step->exec_nodes = NULL;
 query_step->combine_type = COMBINE_TYPE_NONE;
 query_step->simple_aggregates = NULL;
- query_step->read_only = false;
+ /* Optimize multi-node handling */
+ query_step->read_only = query->nodeTag == T_SelectStmt;
 query_step->force_autocommit = false;
- query_plan->query_step_list = lappend(NULL, query_step);
+ result->planTree = (Plan *) query_step;

 /*
 * Determine where to execute the command, either at the Coordinator
 * level, Data Nodes, or both. By default we choose both. We should be
 * able to quickly expand this for more commands.
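 * (Statements we cannot ship fall through to the default case below, which
 * raises an error under strict checking and otherwise keeps the standard
 * plan.)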
 */
- switch (nodeTag(parsetree))
+ switch (query->nodeTag)
 {
 case T_SelectStmt:
- /* Optimize multi-node handling */
- query_step->read_only = true;
+ /* Perform some checks to make sure we can support the statement */
+ if (query->intoClause)
+ ereport(ERROR,
+ (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+ (errmsg("INTO clause not yet supported"))));
+
+ if (query->setOperations)
+ ereport(ERROR,
+ (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+ (errmsg("UNION, INTERSECT and EXCEPT are not yet supported"))));
+
+ if (query->hasRecursive)
+ ereport(ERROR,
+ (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+ (errmsg("WITH RECURSIVE not yet supported"))));
+
+ if (query->hasWindowFuncs)
+ ereport(ERROR,
+ (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+ (errmsg("Window functions not yet supported"))));
 /* fallthru */
 case T_InsertStmt:
 case T_UpdateStmt:
 case T_DeleteStmt:
- /* just use first one in querytree_list */
- query = (Query *) linitial(querytree_list);
- /* should copy instead ? */
- query_step->plan.targetlist = query->targetList;
+ query_step->exec_nodes = get_plan_nodes_command(query);
- /* Perform some checks to make sure we can support the statement */
- if (nodeTag(parsetree) == T_SelectStmt)
+ if (query_step->exec_nodes == NULL)
 {
- if (query->intoClause)
- ereport(ERROR,
- (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
- (errmsg("INTO clause not yet supported"))));
-
- if (query->setOperations)
- ereport(ERROR,
- (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
- (errmsg("UNION, INTERSECT and EXCEPT are not yet supported"))));
-
- if (query->hasRecursive)
- ereport(ERROR,
- (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
- (errmsg("WITH RECURSIVE not yet supported"))));
-
- if (query->hasWindowFuncs)
- ereport(ERROR,
- (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
- (errmsg("Window functions not yet supported"))));
+ /*
+ * Processing query against catalog tables, restore
+ * standard plan
+ */
+ result->planTree = standardPlan;
+ return result;
 }
- query_step->exec_nodes =
- get_plan_nodes_command(query);
+ /*
+ * PGXCTODO
+ * When Postgres runs insert into t (a) values (1); against table
+ * defined as create table t (a int, b int); the plan is looking
+ * like insert into t (a,b) values (1,null);
+ * Later executor is verifying plan, to make sure table has not
+ * been altered since plan has been created and comparing table
+ * definition with plan target list and output error if they do
+ * not match.
+ * I could not find a better way to generate targetList for pgxc plan
+ * than to call standard planner and take targetList from the plan
+ * generated by Postgres.
+ */
+ query_step->plan.targetlist = standardPlan->targetlist;
+
 if (query_step->exec_nodes)
 query_step->combine_type = get_plan_combine_type(
 query, query_step->exec_nodes->baselocatortype);
@@ -2047,37 +2075,9 @@ GetQueryPlan(Node *parsetree, const char *sql_statement, List *querytree_list)
 query_step->simple_aggregates = get_simple_aggregates(query);

 /*
- * See if it is a SELECT with no relations, like SELECT 1+1 or
- * SELECT nextval('fred'), and just use coord.
- */ - if (query_step->exec_nodes == NULL - && (query->jointree->fromlist == NULL - || query->jointree->fromlist->length == 0)) - /* Just execute it on Coordinator */ - query_plan->exec_loc_type = EXEC_ON_COORD; - else - { - if (query_step->exec_nodes != NULL - && query_step->exec_nodes->tableusagetype == TABLE_USAGE_TYPE_PGCATALOG) - { - /* pg_catalog query, run on coordinator */ - query_plan->exec_loc_type = EXEC_ON_COORD; - } - else - { - query_plan->exec_loc_type = EXEC_ON_DATA_NODES; - - /* If node list is NULL, execute on coordinator */ - if (!query_step->exec_nodes) - query_plan->exec_loc_type = EXEC_ON_COORD; - } - } - - /* * Add sortring to the step */ - if (query_plan->exec_loc_type == EXEC_ON_DATA_NODES && - list_length(query_step->exec_nodes->nodelist) > 1 && + if (list_length(query_step->exec_nodes->nodelist) > 1 && (query->sortClause || query->distinctClause)) make_simple_sort_from_sortclauses(query, query_step); @@ -2090,7 +2090,7 @@ GetQueryPlan(Node *parsetree, const char *sql_statement, List *querytree_list) * Check if we have multiple nodes and an unsupported clause. This * is temporary until we expand supported SQL */ - if (nodeTag(parsetree) == T_SelectStmt) + if (query->nodeTag == T_SelectStmt) { if (StrictStatementChecking && query_step->exec_nodes && list_length(query_step->exec_nodes->nodelist) > 1) @@ -2110,180 +2110,6 @@ GetQueryPlan(Node *parsetree, const char *sql_statement, List *querytree_list) } } break; - - /* Statements that we only want to execute on the Coordinator */ - case T_VariableShowStmt: - query_plan->exec_loc_type = EXEC_ON_COORD; - break; - - /* - * Statements that need to run in autocommit mode, on Coordinator - * and Data Nodes with suppressed implicit two phase commit. - */ - case T_CheckPointStmt: - case T_ClusterStmt: - case T_CreatedbStmt: - case T_DropdbStmt: - case T_VacuumStmt: - query_plan->exec_loc_type = EXEC_ON_COORD | EXEC_ON_DATA_NODES; - query_step->force_autocommit = true; - break; - - case T_DropPropertyStmt: - /* - * Triggers are not yet supported by PGXC - * all other queries are executed on both Coordinator and Datanode - * On the same point, assert also is not supported - */ - if (((DropPropertyStmt *)parsetree)->removeType == OBJECT_TRIGGER) - ereport(ERROR, - (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), - (errmsg("This command is not yet supported.")))); - else - query_plan->exec_loc_type = EXEC_ON_COORD | EXEC_ON_DATA_NODES; - break; - - case T_CreateStmt: - if (((CreateStmt *)parsetree)->relation->istemp) - ereport(ERROR, - (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), - (errmsg("Temp tables are not yet supported.")))); - - query_plan->exec_loc_type = EXEC_ON_COORD | EXEC_ON_DATA_NODES; - break; - - /* - * Statements that we execute on both the Coordinator and Data Nodes - */ - case T_AlterDatabaseStmt: - case T_AlterDatabaseSetStmt: - case T_AlterDomainStmt: - case T_AlterFdwStmt: - case T_AlterForeignServerStmt: - case T_AlterFunctionStmt: - case T_AlterObjectSchemaStmt: - case T_AlterOpFamilyStmt: - case T_AlterSeqStmt: - case T_AlterTableStmt: /* Can also be used to rename a sequence */ - case T_AlterTSConfigurationStmt: - case T_AlterTSDictionaryStmt: - case T_ClosePortalStmt: /* In case CLOSE ALL is issued */ - case T_CommentStmt: - case T_CompositeTypeStmt: - case T_ConstraintsSetStmt: - case T_CreateCastStmt: - case T_CreateConversionStmt: - case T_CreateDomainStmt: - case T_CreateEnumStmt: - case T_CreateFdwStmt: - case T_CreateForeignServerStmt: - case T_CreateFunctionStmt: /* Only global functions are supported */ - 
case T_CreateOpClassStmt: - case T_CreateOpFamilyStmt: - case T_CreatePLangStmt: - case T_CreateSeqStmt: - case T_CreateSchemaStmt: - case T_DeallocateStmt: /* Allow for DEALLOCATE ALL */ - case T_DiscardStmt: - case T_DropCastStmt: - case T_DropFdwStmt: - case T_DropForeignServerStmt: - case T_DropPLangStmt: - case T_DropStmt: - case T_IndexStmt: - case T_LockStmt: - case T_ReindexStmt: - case T_RemoveFuncStmt: - case T_RemoveOpClassStmt: - case T_RemoveOpFamilyStmt: - case T_RenameStmt: - case T_RuleStmt: - case T_TruncateStmt: - case T_VariableSetStmt: - case T_ViewStmt: - - /* - * Also support these, should help later with pg_restore, although - * not very useful because of the pooler using the same user - */ - case T_GrantStmt: - case T_GrantRoleStmt: - case T_CreateRoleStmt: - case T_AlterRoleStmt: - case T_AlterRoleSetStmt: - case T_AlterUserMappingStmt: - case T_CreateUserMappingStmt: - case T_DropRoleStmt: - case T_AlterOwnerStmt: - case T_DropOwnedStmt: - case T_DropUserMappingStmt: - case T_ReassignOwnedStmt: - case T_DefineStmt: /* used for aggregates, some types */ - query_plan->exec_loc_type = EXEC_ON_COORD | EXEC_ON_DATA_NODES; - break; - - case T_TransactionStmt: - switch (((TransactionStmt *) parsetree)->kind) - { - case TRANS_STMT_SAVEPOINT: - case TRANS_STMT_RELEASE: - case TRANS_STMT_ROLLBACK_TO: - ereport(ERROR, - (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), - (errmsg("This type of transaction statement not yet supported")))); - break; - - default: - break; /* keep compiler quiet */ - } - query_plan->exec_loc_type = EXEC_ON_COORD | EXEC_ON_DATA_NODES; - break; - - /* - * For now, pick one of the data nodes until we modify real - * planner It will give an approximate idea of what an isolated - * data node will do - */ - case T_ExplainStmt: - if (((ExplainStmt *) parsetree)->analyze) - ereport(ERROR, - (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), - (errmsg("ANALYZE with EXPLAIN is currently not supported.")))); - - query_step->exec_nodes = palloc0(sizeof(Exec_Nodes)); - query_step->exec_nodes->nodelist = GetAnyDataNode(); - query_step->exec_nodes->baselocatortype = LOCATOR_TYPE_RROBIN; - query_plan->exec_loc_type = EXEC_ON_DATA_NODES; - break; - - /* - * Trigger queries are not yet supported by PGXC. - * Tablespace queries are also not yet supported. - * Two nodes on the same servers cannot use the same tablespace. - */ - case T_CreateTableSpaceStmt: - case T_CreateTrigStmt: - case T_DropTableSpaceStmt: - ereport(ERROR, - (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), - (errmsg("This command is not yet supported.")))); - break; - - /* - * Other statements we do not yet want to handle. - * By default they would be fobidden, but we list these for reference. - * Note that there is not a 1-1 correspndence between - * SQL command and the T_*Stmt structures. 
- */ - case T_DeclareCursorStmt: - case T_ExecuteStmt: - case T_FetchStmt: - case T_ListenStmt: - case T_LoadStmt: - case T_NotifyStmt: - case T_PrepareStmt: - case T_UnlistenStmt: - /* fall through */ default: /* Allow for override */ if (StrictStatementChecking) @@ -2291,12 +2117,10 @@ GetQueryPlan(Node *parsetree, const char *sql_statement, List *querytree_list) (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), (errmsg("This command is not yet supported.")))); else - query_plan->exec_loc_type = EXEC_ON_COORD | EXEC_ON_DATA_NODES; - break; + result->planTree = standardPlan; } - - return query_plan; + return result; } @@ -2321,21 +2145,3 @@ free_query_step(RemoteQuery *query_step) list_free_deep(query_step->simple_aggregates); pfree(query_step); } - -/* - * Free Query_Plan struct - */ -void -FreeQueryPlan(Query_Plan *query_plan) -{ - ListCell *item; - - if (query_plan == NULL) - return; - - foreach(item, query_plan->query_step_list) - free_query_step((RemoteQuery *) lfirst(item)); - - pfree(query_plan->query_step_list); - pfree(query_plan); -} diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index bbedef0..0f16c51 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -168,9 +168,7 @@ CreateResponseCombiner(int node_count, CombineType combine_type) combiner->connections = NULL; combiner->conn_count = 0; combiner->combine_type = combine_type; - combiner->dest = NULL; combiner->command_complete_count = 0; - combiner->row_count = 0; combiner->request_type = REQUEST_TYPE_NOT_DEFINED; combiner->tuple_desc = NULL; combiner->description_count = 0; @@ -178,7 +176,6 @@ CreateResponseCombiner(int node_count, CombineType combine_type) combiner->copy_out_count = 0; combiner->errorMessage = NULL; combiner->query_Done = false; - combiner->completionTag = NULL; combiner->msg = NULL; combiner->msglen = 0; combiner->initAggregates = true; @@ -488,7 +485,8 @@ HandleCopyOutComplete(RemoteQueryState *combiner) static void HandleCommandComplete(RemoteQueryState *combiner, char *msg_body, size_t len) { - int digits = 0; + int digits = 0; + EState *estate = combiner->ss.ps.state; /* * If we did not receive description we are having rowcount or OK response @@ -496,7 +494,7 @@ HandleCommandComplete(RemoteQueryState *combiner, char *msg_body, size_t len) if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED) combiner->request_type = REQUEST_TYPE_COMMAND; /* Extract rowcount */ - if (combiner->combine_type != COMBINE_TYPE_NONE) + if (combiner->combine_type != COMBINE_TYPE_NONE && estate) { uint64 rowcount; digits = parse_row_count(msg_body, len, &rowcount); @@ -507,7 +505,7 @@ HandleCommandComplete(RemoteQueryState *combiner, char *msg_body, size_t len) { if (combiner->command_complete_count) { - if (rowcount != combiner->row_count) + if (rowcount != estate->es_processed) /* There is a consistency issue in the database with the replicated table */ ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), @@ -515,37 +513,15 @@ HandleCommandComplete(RemoteQueryState *combiner, char *msg_body, size_t len) } else /* first result */ - combiner->row_count = rowcount; + estate->es_processed = rowcount; } else - combiner->row_count += rowcount; + estate->es_processed += rowcount; } else combiner->combine_type = COMBINE_TYPE_NONE; } - if (++combiner->command_complete_count == combiner->node_count) - { - if (combiner->completionTag) - { - if (combiner->combine_type == COMBINE_TYPE_NONE) - { - /* ensure we do not go beyond buffer bounds */ - if (len > 
COMPLETION_TAG_BUFSIZE) - len = COMPLETION_TAG_BUFSIZE; - memcpy(combiner->completionTag, msg_body, len); - } - else - { - /* Truncate msg_body to get base string */ - msg_body[len - digits - 1] = '\0'; - snprintf(combiner->completionTag, - COMPLETION_TAG_BUFSIZE, - "%s" UINT64_FORMAT, - msg_body, - combiner->row_count); - } - } - } + combiner->command_complete_count++; } /* @@ -653,6 +629,9 @@ HandleCopyDataRow(RemoteQueryState *combiner, char *msg_body, size_t len) (errcode(ERRCODE_DATA_CORRUPTED), errmsg("Unexpected response from the data nodes for 'd' message, current request type %d", combiner->request_type))); + /* count the row */ + combiner->processed++; + /* If there is a copy file, data has to be sent to the local file */ if (combiner->copy_file) /* write data to the copy file */ @@ -881,7 +860,6 @@ ValidateAndResetCombiner(RemoteQueryState *combiner) combiner->command_complete_count = 0; combiner->connections = NULL; combiner->conn_count = 0; - combiner->row_count = 0; combiner->request_type = REQUEST_TYPE_NOT_DEFINED; combiner->tuple_desc = NULL; combiner->description_count = 0; @@ -1106,7 +1084,6 @@ data_node_begin(int conn_count, DataNodeHandle ** connections, } combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_NONE); - combiner->dest = None_Receiver; /* Receive responses */ if (data_node_receive_responses(conn_count, connections, timeout, combiner)) @@ -1225,7 +1202,6 @@ data_node_commit(int conn_count, DataNodeHandle ** connections) } combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_NONE); - combiner->dest = None_Receiver; /* Receive responses */ if (data_node_receive_responses(conn_count, connections, timeout, combiner)) result = EOF; @@ -1268,10 +1244,7 @@ data_node_commit(int conn_count, DataNodeHandle ** connections) } if (!combiner) - { combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_NONE); - combiner->dest = None_Receiver; - } /* Receive responses */ if (data_node_receive_responses(conn_count, connections, timeout, combiner)) result = EOF; @@ -1336,7 +1309,6 @@ data_node_rollback(int conn_count, DataNodeHandle ** connections) } combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_NONE); - combiner->dest = None_Receiver; /* Receive responses */ if (data_node_receive_responses(conn_count, connections, timeout, combiner)) return EOF; @@ -1480,7 +1452,6 @@ DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot, bool is_ * client runs console or file copy */ combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_NONE); - combiner->dest = None_Receiver; /* Receive responses */ if (data_node_receive_responses(conn_count, connections, timeout, combiner) @@ -1541,7 +1512,6 @@ DataNodeCopyIn(char *data_row, int len, Exec_Nodes *exec_nodes, DataNodeHandle** if (primary_handle->inStart < primary_handle->inEnd) { RemoteQueryState *combiner = CreateResponseCombiner(1, COMBINE_TYPE_NONE); - combiner->dest = None_Receiver; handle_response(primary_handle, combiner); if (!ValidateAndCloseCombiner(combiner)) return EOF; @@ -1603,7 +1573,6 @@ DataNodeCopyIn(char *data_row, int len, Exec_Nodes *exec_nodes, DataNodeHandle** if (handle->inStart < handle->inEnd) { RemoteQueryState *combiner = CreateResponseCombiner(1, COMBINE_TYPE_NONE); - combiner->dest = None_Receiver; handle_response(handle, combiner); if (!ValidateAndCloseCombiner(combiner)) return EOF; @@ -1670,13 +1639,13 @@ DataNodeCopyOut(Exec_Nodes *exec_nodes, DataNodeHandle** copy_connections, FILE* bool need_tran; List *nodelist; ListCell *nodeitem; - uint64 processed = 0; + 
uint64 processed; nodelist = exec_nodes->nodelist; need_tran = !autocommit || conn_count > 1; combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_SUM); - combiner->dest = None_Receiver; + combiner->processed = 0; /* If there is an existing file where to copy data, pass it to combiner */ if (copy_file) combiner->copy_file = copy_file; @@ -1712,7 +1681,7 @@ DataNodeCopyOut(Exec_Nodes *exec_nodes, DataNodeHandle** copy_connections, FILE* } } - processed = combiner->row_count; + processed = combiner->processed; if (!ValidateAndCloseCombiner(combiner)) { @@ -1730,7 +1699,7 @@ DataNodeCopyOut(Exec_Nodes *exec_nodes, DataNodeHandle** copy_connections, FILE* /* * Finish copy process on all connections */ -uint64 +void DataNodeCopyFinish(DataNodeHandle** copy_connections, int primary_data_node, CombineType combine_type) { @@ -1743,7 +1712,6 @@ DataNodeCopyFinish(DataNodeHandle** copy_connections, int primary_data_node, DataNodeHandle *connections[NumDataNodes]; DataNodeHandle *primary_handle = NULL; int conn_count = 0; - uint64 processed; for (i = 0; i < NumDataNodes; i++) { @@ -1786,8 +1754,7 @@ DataNodeCopyFinish(DataNodeHandle** copy_connections, int primary_data_node, } combiner = CreateResponseCombiner(conn_count + 1, combine_type); - combiner->dest = None_Receiver; - error = data_node_receive_responses(1, &primary_handle, timeout, combiner) || error; + error = (data_node_receive_responses(1, &primary_handle, timeout, combiner) != 0) || error; } for (i = 0; i < conn_count; i++) @@ -1823,22 +1790,25 @@ DataNodeCopyFinish(DataNodeHandle** copy_connections, int primary_data_node, need_tran = !autocommit || primary_handle || conn_count > 1; if (!combiner) - { combiner = CreateResponseCombiner(conn_count, combine_type); - combiner->dest = None_Receiver; - } error = (data_node_receive_responses(conn_count, connections, timeout, combiner) != 0) || error; - processed = combiner->row_count; - if (!ValidateAndCloseCombiner(combiner) || error) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Error while running COPY"))); +} - return processed; +#define REMOTE_QUERY_NSLOTS 2 +int +ExecCountSlotsRemoteQuery(RemoteQuery *node) +{ + return ExecCountSlotsNode(outerPlan((Plan *) node)) + + ExecCountSlotsNode(innerPlan((Plan *) node)) + + REMOTE_QUERY_NSLOTS; } + RemoteQueryState * ExecInitRemoteQuery(RemoteQuery *node, EState *estate, int eflags) { @@ -1876,6 +1846,9 @@ ExecInitRemoteQuery(RemoteQuery *node, EState *estate, int eflags) ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); } + if (outerPlan(node)) + outerPlanState(remotestate) = ExecInitNode(outerPlan(node), estate, eflags); + return remotestate; } @@ -1927,6 +1900,83 @@ copy_slot(RemoteQueryState *node, TupleTableSlot *src, TupleTableSlot *dst) } } +static void +get_exec_connections(Exec_Nodes *exec_nodes, + int *regular_conn_count, + int *total_conn_count, + DataNodeHandle ***connections, + DataNodeHandle ***primaryconnection) +{ + List *nodelist = NIL; + List *primarynode = NIL; + + if (exec_nodes) + { + nodelist = exec_nodes->nodelist; + primarynode = exec_nodes->primarynodelist; + } + + if (list_length(nodelist) == 0) + { + if (primarynode) + *regular_conn_count = NumDataNodes - 1; + else + *regular_conn_count = NumDataNodes; + } + else + { + *regular_conn_count = list_length(nodelist); + } + + *total_conn_count = *regular_conn_count; + + /* Get connection for primary node, if used */ + if (primarynode) + { + *primaryconnection = get_handles(primarynode); + if (!*primaryconnection) + ereport(ERROR, + 
(errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not obtain connection from pool"))); + (*total_conn_count)++; + } + + /* Get other connections (non-primary) */ + *connections = get_handles(nodelist); + if (!*connections) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not obtain connection from pool"))); + +} + +/* + * We would want to run 2PC if current transaction modified more then + * one node. So optimize little bit and do not look further if we + * already have more then one write nodes. + */ +static void +register_write_nodes(int conn_count, DataNodeHandle **connections) +{ + int i, j; + + for (i = 0; i < conn_count && write_node_count < 2; i++) + { + bool found = false; + + for (j = 0; j < write_node_count && !found; j++) + { + if (write_node_list[j] == connections[i]) + found = true; + } + if (!found) + { + /* Add to transaction wide-list */ + write_node_list[write_node_count++] = connections[i]; + } + } +} + /* * Execute step of PGXC plan. * The step specifies a command to be executed on specified nodes. @@ -1950,66 +2000,51 @@ ExecRemoteQuery(RemoteQueryState *node) if (!node->query_Done) { /* First invocation, initialize */ - Exec_Nodes *exec_nodes = step->exec_nodes; bool force_autocommit = step->force_autocommit; bool is_read_only = step->read_only; GlobalTransactionId gxid = InvalidGlobalTransactionId; Snapshot snapshot = GetActiveSnapshot(); DataNodeHandle **connections = NULL; DataNodeHandle **primaryconnection = NULL; - List *nodelist = NIL; - List *primarynode = NIL; int i; - int j; int regular_conn_count; int total_conn_count; bool need_tran; - if (exec_nodes) - { - nodelist = exec_nodes->nodelist; - primarynode = exec_nodes->primarynodelist; - } - - if (list_length(nodelist) == 0) - { - if (primarynode) - regular_conn_count = NumDataNodes - 1; - else - regular_conn_count = NumDataNodes; - } - else + /* + * If coordinator plan is specified execute it first. + * If the plan is returning we are returning these tuples immediately. + * If it is not returning or returned them all by current invocation + * we will go ahead and execute remote query. Then we will never execute + * the outer plan again because node->query_Done flag will be set and + * execution won't get to that place. 
+ */ + if (outerPlanState(node)) { - regular_conn_count = list_length(nodelist); + TupleTableSlot *slot = ExecProcNode(outerPlanState(node)); + if (!TupIsNull(slot)) + return slot; } - total_conn_count = regular_conn_count; - node->node_count = total_conn_count; + get_exec_connections(step->exec_nodes, + ®ular_conn_count, + &total_conn_count, + &connections, + &primaryconnection); - /* Get connection for primary node, if used */ - if (primarynode) - { - primaryconnection = get_handles(primarynode); - if (!primaryconnection) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Could not obtain connection from pool"))); - total_conn_count++; - } - - /* Get other connections (non-primary) */ - connections = get_handles(nodelist); - if (!connections) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Could not obtain connection from pool"))); + /* + * We save only regular connections, at the time we exit the function + * we finish with the primary connection and deal only with regular + * connections on subsequent invocations + */ + node->node_count = regular_conn_count; if (force_autocommit) need_tran = false; else need_tran = !autocommit || total_conn_count > 1; - elog(DEBUG1, "autocommit = %s, has primary = %s, regular_conn_count = %d, statement_need_tran = %s", autocommit ? "true" : "false", primarynode ? "true" : "false", regular_conn_count, need_tran ? "true" : "false"); + elog(DEBUG1, "autocommit = %s, has primary = %s, regular_conn_count = %d, need_tran = %s", autocommit ? "true" : "false", primaryconnection ? "true" : "false", regular_conn_count, need_tran ? "true" : "false"); stat_statement(); if (autocommit) @@ -2019,44 +2054,11 @@ ExecRemoteQuery(RemoteQueryState *node) clear_write_node_list(); } - /* Check status of connections */ - /* - * We would want to run 2PC if current transaction modified more then - * one node. So optimize little bit and do not look further if we - * already have two. - */ - if (!is_read_only && write_node_count < 2) + if (!is_read_only) { - bool found; - if (primaryconnection) - { - found = false; - for (j = 0; j < write_node_count && !found; j++) - { - if (write_node_list[j] == primaryconnection[0]) - found = true; - } - if (!found) - { - /* Add to transaction wide-list */ - write_node_list[write_node_count++] = primaryconnection[0]; - } - } - for (i = 0; i < regular_conn_count && write_node_count < 2; i++) - { - found = false; - for (j = 0; j < write_node_count && !found; j++) - { - if (write_node_list[j] == connections[i]) - found = true; - } - if (!found) - { - /* Add to transaction wide-list */ - write_node_list[write_node_count++] = connections[i]; - } - } + register_write_nodes(1, primaryconnection); + register_write_nodes(regular_conn_count, connections); } gxid = GetCurrentGlobalTransactionId(); @@ -2209,12 +2211,10 @@ ExecRemoteQuery(RemoteQueryState *node) { ExecSetSlotDescriptor(scanslot, node->tuple_desc); /* - * we should send to client not the tuple_desc we just - * received, but tuple_desc from the planner. 
- * Data node may be sending junk columns for sorting + * Now tuple table slot is responcible for freeing the + * descriptor */ - (*node->dest->rStartup) (node->dest, CMD_SELECT, - resultslot->tts_tupleDescriptor); + node->tuple_desc = NULL; if (step->sort) { SimpleSort *sort = step->sort; @@ -2228,7 +2228,7 @@ ExecRemoteQuery(RemoteQueryState *node) * be initialized */ node->tuplesortstate = tuplesort_begin_merge( - node->tuple_desc, + scanslot->tts_tupleDescriptor, sort->numCols, sort->sortColIdx, sort->sortOperators, @@ -2290,7 +2290,6 @@ ExecRemoteQuery(RemoteQueryState *node) } } copy_slot(node, scanslot, resultslot); - (*node->dest->receiveSlot) (resultslot, node->dest); break; } if (!have_tuple) @@ -2310,12 +2309,26 @@ ExecRemoteQuery(RemoteQueryState *node) { if (node->simple_aggregates) { - /* - * Advance aggregate functions and allow to read up next - * data row message and get tuple in the same slot on - * next iteration - */ - exec_simple_aggregates(node, scanslot); + if (node->simple_aggregates) + { + /* + * Advance aggregate functions and allow to read up next + * data row message and get tuple in the same slot on + * next iteration + */ + exec_simple_aggregates(node, scanslot); + } + else + { + /* + * Receive current slot and read up next data row + * message before exiting the loop. Next time when this + * function is invoked we will have either data row + * message ready or EOF + */ + copy_slot(node, scanslot, resultslot); + have_tuple = true; + } } else { @@ -2326,7 +2339,6 @@ ExecRemoteQuery(RemoteQueryState *node) * message ready or EOF */ copy_slot(node, scanslot, resultslot); - (*node->dest->receiveSlot) (resultslot, node->dest); have_tuple = true; } } @@ -2380,10 +2392,7 @@ ExecRemoteQuery(RemoteQueryState *node) { finish_simple_aggregates(node, resultslot); if (!TupIsNull(resultslot)) - { - (*node->dest->receiveSlot) (resultslot, node->dest); have_tuple = true; - } } if (!have_tuple) /* report end of scan */ @@ -2405,12 +2414,234 @@ ExecRemoteQuery(RemoteQueryState *node) void ExecEndRemoteQuery(RemoteQueryState *node) { - (*node->dest->rShutdown) (node->dest); + /* + * Release tuplesort resources + */ + if (node->tuplesortstate != NULL) + tuplesort_end((Tuplesortstate *) node->tuplesortstate); + node->tuplesortstate = NULL; + + /* + * shut down the subplan + */ + if (outerPlanState(node)) + ExecEndNode(outerPlanState(node)); + if (node->tmp_ctx) MemoryContextDelete(node->tmp_ctx); + CloseCombiner(node); } +/* + * Execute utility statement on multiple data nodes + * It does approximately the same as + * + * RemoteQueryState *state = ExecInitRemoteQuery(plan, estate, flags); + * Assert(TupIsNull(ExecRemoteQuery(state)); + * ExecEndRemoteQuery(state) + * + * But does not need an Estate instance and does not do some unnecessary work, + * like allocating tuple slots. 
+ */ +void +ExecRemoteUtility(RemoteQuery *node) +{ + RemoteQueryState *remotestate; + bool force_autocommit = node->force_autocommit; + bool is_read_only = node->read_only; + GlobalTransactionId gxid = InvalidGlobalTransactionId; + Snapshot snapshot = GetActiveSnapshot(); + DataNodeHandle **connections = NULL; + DataNodeHandle **primaryconnection = NULL; + int regular_conn_count; + int total_conn_count; + bool need_tran; + int i; + + remotestate = CreateResponseCombiner(0, node->combine_type); + + get_exec_connections(node->exec_nodes, + ®ular_conn_count, + &total_conn_count, + &connections, + &primaryconnection); + + if (force_autocommit) + need_tran = false; + else + need_tran = !autocommit || total_conn_count > 1; + + if (!is_read_only) + { + if (primaryconnection) + register_write_nodes(1, primaryconnection); + register_write_nodes(regular_conn_count, connections); + } + + gxid = GetCurrentGlobalTransactionId(); + if (!GlobalTransactionIdIsValid(gxid)) + { + if (primaryconnection) + pfree(primaryconnection); + pfree(connections); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to get next transaction ID"))); + } + + if (need_tran) + { + /* + * Check if data node connections are in transaction and start + * transactions on nodes where it is not started + */ + DataNodeHandle *new_connections[total_conn_count]; + int new_count = 0; + + if (primaryconnection && primaryconnection[0]->transaction_status != 'T') + new_connections[new_count++] = primaryconnection[0]; + for (i = 0; i < regular_conn_count; i++) + if (connections[i]->transaction_status != 'T') + new_connections[new_count++] = connections[i]; + + if (new_count) + data_node_begin(new_count, new_connections, gxid); + } + + /* See if we have a primary nodes, execute on it first before the others */ + if (primaryconnection) + { + /* If explicit transaction is needed gxid is already sent */ + if (!need_tran && data_node_send_gxid(primaryconnection[0], gxid)) + { + pfree(connections); + pfree(primaryconnection); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); + } + if (snapshot && data_node_send_snapshot(primaryconnection[0], snapshot)) + { + pfree(connections); + pfree(primaryconnection); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); + } + if (data_node_send_query(primaryconnection[0], node->sql_statement) != 0) + { + pfree(connections); + pfree(primaryconnection); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); + } + + Assert(remotestate->combine_type == COMBINE_TYPE_SAME); + + while (remotestate->command_complete_count < 1) + { + PG_TRY(); + { + data_node_receive(1, primaryconnection, NULL); + while (handle_response(primaryconnection[0], remotestate) == RESPONSE_EOF) + data_node_receive(1, primaryconnection, NULL); + if (remotestate->errorMessage) + { + char *code = remotestate->errorCode; + ereport(ERROR, + (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])), + errmsg("%s", remotestate->errorMessage))); + } + } + /* If we got an error response return immediately */ + PG_CATCH(); + { + pfree(primaryconnection); + pfree(connections); + PG_RE_THROW(); + } + PG_END_TRY(); + } + pfree(primaryconnection); + } + + for (i = 0; i < regular_conn_count; i++) + { + /* If explicit transaction is needed gxid is already sent */ + if (!need_tran && data_node_send_gxid(connections[i], gxid)) + { + pfree(connections); + ereport(ERROR, + 
(errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); + } + if (snapshot && data_node_send_snapshot(connections[i], snapshot)) + { + pfree(connections); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); + } + if (data_node_send_query(connections[i], node->sql_statement) != 0) + { + pfree(connections); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); + } + } + + /* + * Stop if all commands are completed or we got a data row and + * initialized state node for subsequent invocations + */ + while (regular_conn_count > 0) + { + int i = 0; + + data_node_receive(regular_conn_count, connections, NULL); + /* + * Handle input from the data nodes. + * If we got a RESPONSE_DATAROW we can break handling to wrap + * it into a tuple and return. Handling will be continued upon + * subsequent invocations. + * If we got 0, we exclude connection from the list. We do not + * expect more input from it. In case of non-SELECT query we quit + * the loop when all nodes finish their work and send ReadyForQuery + * with empty connections array. + * If we got EOF, move to the next connection, will receive more + * data on the next iteration. + */ + while (i < regular_conn_count) + { + int res = handle_response(connections[i], remotestate); + if (res == RESPONSE_EOF) + { + i++; + } + else if (res == RESPONSE_COMPLETE) + { + if (i < --regular_conn_count) + connections[i] = connections[regular_conn_count]; + } + else if (res == RESPONSE_TUPDESC) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from data node"))); + } + else if (res == RESPONSE_DATAROW) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from data node"))); + } + } + } +} + /* * Called when the backend is ending. 
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index ea66125..608755f 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -650,6 +650,20 @@ pg_analyze_and_rewrite(Node *parsetree, const char *query_string, */ querytree_list = pg_rewrite_query(query); +#ifdef PGXC + if (IS_PGXC_COORDINATOR) + { + ListCell *lc; + + foreach(lc, querytree_list) + { + Query *query = (Query *) lfirst(lc); + query->sql_statement = pstrdup(query_string); + query->nodeTag = nodeTag(parsetree); + } + } +#endif + TRACE_POSTGRESQL_QUERY_REWRITE_DONE(query_string); return querytree_list; @@ -900,9 +914,6 @@ exec_simple_query(const char *query_string) DestReceiver *receiver; int16 format; #ifdef PGXC - Query_Plan *query_plan; - RemoteQuery *query_step; - bool exec_on_coord; /* * By default we do not want data nodes to contact GTM directly, @@ -910,9 +921,6 @@ exec_simple_query(const char *query_string) */ if (IS_PGXC_DATANODE) SetForceXidFromGTM(false); - - exec_on_coord = true; - query_plan = NULL; #endif /* @@ -968,131 +976,11 @@ exec_simple_query(const char *query_string) querytree_list = pg_analyze_and_rewrite(parsetree, query_string, NULL, 0); -#ifdef PGXC /* PGXC_COORD */ - if (IS_PGXC_COORDINATOR) - { - if (IsA(parsetree, TransactionStmt)) - pgxc_transaction_stmt(parsetree); - - else if (IsA(parsetree, ExecDirectStmt)) - { - ExecDirectStmt *execdirect = (ExecDirectStmt *) parsetree; - List *inner_parse_tree_list; - - Assert(IS_PGXC_COORDINATOR); - - exec_on_coord = execdirect->coordinator; - - /* - * Switch to appropriate context for constructing parse and - * query trees (these must outlive the execution context). - */ - oldcontext = MemoryContextSwitchTo(MessageContext); - - inner_parse_tree_list = pg_parse_query(execdirect->query); - /* - * we do not support complex commands (expanded to multiple - * parse trees) within EXEC DIRECT - */ - if (list_length(parsetree_list) != 1) - { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("Can not execute %s with EXECUTE DIRECT", - execdirect->query))); - } - parsetree = linitial(inner_parse_tree_list); - - /* - * Set up a snapshot if parse analysis/planning will need - * one. 
- */ - if (analyze_requires_snapshot(parsetree)) - { - PushActiveSnapshot(GetTransactionSnapshot()); - snapshot_set = true; - } - - querytree_list = pg_analyze_and_rewrite(parsetree, - query_string, - NULL, - 0); - - if (execdirect->nodes) - { - ListCell *lc; - Query *query = (Query *) linitial(querytree_list); - - query_plan = (Query_Plan *) palloc0(sizeof(Query_Plan)); - query_step = makeNode(RemoteQuery); - query_step->plan.targetlist = query->targetList; - query_step->sql_statement = pstrdup(execdirect->query); - query_step->exec_nodes = (Exec_Nodes *) palloc0(sizeof(Exec_Nodes)); - foreach (lc, execdirect->nodes) - { - int node = intVal(lfirst(lc)); - query_step->exec_nodes->nodelist = lappend_int(query_step->exec_nodes->nodelist, node); - } - query_step->combine_type = COMBINE_TYPE_SAME; - - query_plan->query_step_list = lappend(NULL, query_step); - query_plan->exec_loc_type = EXEC_ON_DATA_NODES; - } - - /* Restore context */ - MemoryContextSwitchTo(oldcontext); - - } - else if (IsA(parsetree, CopyStmt)) - { - CopyStmt *copy = (CopyStmt *) parsetree; - uint64 processed; - /* Snapshot is needed for the Copy */ - if (!snapshot_set) - { - PushActiveSnapshot(GetTransactionSnapshot()); - snapshot_set = true; - } - /* - * A check on locator is made in DoCopy to determine if the copy can be launched on - * Datanode or on Coordinator. - * If a table has no locator data, then IsCoordPortalCopy returns false and copy is launched - * on Coordinator instead (e.g., using pg_catalog tables). - * If a table has some locator data (user tables), then copy was launched normally - * in Datanodes - */ - if (!IsCoordPortalCopy(copy)) - { - exec_on_coord = false; - processed = DoCopy(copy, query_string, false); - snprintf(completionTag, COMPLETION_TAG_BUFSIZE, - "COPY " UINT64_FORMAT, processed); - } - else - exec_on_coord = true; - } - else - { - query_plan = GetQueryPlan(parsetree, query_string, querytree_list); - - exec_on_coord = query_plan-... [truncated message content] |
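Buried in the refactoring above, get_exec_connections() and register_write_nodes() give the write-node tracking a single home. The bookkeeping reduces to one question answered at commit time: did this transaction write to more than one data node? Below is a minimal stand-alone sketch of that idea in plain C. Only the names write_node_list and write_node_count come from the patch; track_write_node and need_two_phase_commit are hypothetical helpers added here for illustration.

#include <stddef.h>

#define MAX_TRACKED_WRITERS 2	/* never any need to count past two */

static const void *write_node_list[MAX_TRACKED_WRITERS];
static int write_node_count = 0;

/* Record a connection that performed a write, ignoring duplicates. */
static void
track_write_node(const void *conn)
{
	int j;

	if (write_node_count >= MAX_TRACKED_WRITERS)
		return;			/* the 2PC decision is already made */
	for (j = 0; j < write_node_count; j++)
		if (write_node_list[j] == conn)
			return;		/* already on the list */
	write_node_list[write_node_count++] = conn;
}

/* Two-phase commit is needed exactly when more than one node was modified. */
static int
need_two_phase_commit(void)
{
	return write_node_count > 1;
}

Capping the count at two is the same small optimization the patch comments on: once a second writer has been seen, nothing that happens later can change the choice of commit protocol.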
From: mason_s <ma...@us...> - 2010-08-05 19:00:31
|
Project "Postgres-XC". The branch, master has been updated via fbaab7cc05f975cd6339918390fd22360744b08c (commit) from c7476b9cf075aba2dd2ed11ea57c632c1ad6721a (commit) - Log ----------------------------------------------------------------- commit fbaab7cc05f975cd6339918390fd22360744b08c Author: Mason S <mas...@ma...> Date: Thu Aug 5 14:55:55 2010 -0400 There is a race condition that could lead to problems for the CLOG and sub transactions. In Postgres-XC, multiple processes may decide to extend the CLOG at the same time. One will wait for the other, then afterwards re-zero out the page. Instead, once the lock is obtained, we re-check to make sure that another process did not already extend and create the page. If so, we just exit. diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 1028aae..2a0f245 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -590,6 +590,9 @@ ExtendCLOG(TransactionId newestXact) /* * The first condition makes sure we did not wrap around * The second checks if we are still using the same page + * Note that this value can change and we are not holding a lock, + * so we repeat the check below. We do it this way instead of + * grabbing the lock to avoid lock contention. */ if (ClogCtl->shared->latest_page_number - pageno <= CLOG_WRAP_CHECK_DELTA && pageno <= ClogCtl->shared->latest_page_number) @@ -604,6 +607,20 @@ ExtendCLOG(TransactionId newestXact) LWLockAcquire(CLogControlLock, LW_EXCLUSIVE); +#ifdef PGXC + /* + * We repeat the check. Another process may have written + * out the page already and advanced the latest_page_number + * while we were waiting for the lock. + */ + if (ClogCtl->shared->latest_page_number - pageno <= CLOG_WRAP_CHECK_DELTA + && pageno <= ClogCtl->shared->latest_page_number) + { + LWLockRelease(CLogControlLock); + return; + } +#endif + /* Zero the page and make an XLOG entry about it */ ZeroCLOGPage(pageno, true); diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index e3c9a64..35c9d83 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -294,7 +294,6 @@ CheckPointSUBTRANS(void) TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(true); } - /* * Make sure that SUBTRANS has room for a newly-allocated XID. * @@ -325,7 +324,10 @@ ExtendSUBTRANS(TransactionId newestXact) /* * The first condition makes sure we did not wrap around - * The second checks if we are still using the same page + * The second checks if we are still using the same page. + * Note that this value can change and we are not holding a lock, + * so we repeat the check below. We do it this way instead of + * grabbing the lock to avoid lock contention. */ if (SubTransCtl->shared->latest_page_number - pageno <= SUBTRANS_WRAP_CHECK_DELTA && pageno <= SubTransCtl->shared->latest_page_number) @@ -340,6 +342,20 @@ ExtendSUBTRANS(TransactionId newestXact) LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE); +#ifdef PGXC + /* + * We repeat the check. Another process may have written + * out the page already and advanced the latest_page_number + * while we were waiting for the lock. 
+ */ + if (SubTransCtl->shared->latest_page_number - pageno <= SUBTRANS_WRAP_CHECK_DELTA + && pageno <= SubTransCtl->shared->latest_page_number) + { + LWLockRelease(SubtransControlLock); + return; + } +#endif + /* Zero the page */ ZeroSUBTRANSPage(pageno); ----------------------------------------------------------------------- Summary of changes: src/backend/access/transam/clog.c | 17 +++++++++++++++++ src/backend/access/transam/subtrans.c | 20 ++++++++++++++++++-- 2 files changed, 35 insertions(+), 2 deletions(-) hooks/post-receive -- Postgres-XC |
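The pattern this commit applies in both ExtendCLOG() and ExtendSUBTRANS() is the classic check / lock / re-check. A compilable toy model follows, with the SLRU page machinery replaced by a plain counter and a pthread mutex; every name here is illustrative and none of it is PostgreSQL's actual API.

#include <pthread.h>

static pthread_mutex_t control_lock = PTHREAD_MUTEX_INITIALIZER;
static int latest_page_number = 0;

static void
zero_page(int pageno)
{
	/* Stands in for ZeroCLOGPage()/ZeroSUBTRANSPage(). */
	latest_page_number = pageno;
}

void
extend_to_page(int pageno)
{
	/* Unlocked fast path: cheap, but the value may be stale. */
	if (pageno <= latest_page_number)
		return;

	pthread_mutex_lock(&control_lock);

	/*
	 * Re-check under the lock: another process may have created the
	 * page while we were waiting. Without this, we would re-zero a
	 * page that already holds live transaction status bits.
	 */
	if (pageno <= latest_page_number)
	{
		pthread_mutex_unlock(&control_lock);
		return;
	}

	zero_page(pageno);
	pthread_mutex_unlock(&control_lock);
}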
From: mason_s <ma...@us...> - 2010-08-05 18:49:16
|
Project "Postgres-XC". The branch, master has been updated via c7476b9cf075aba2dd2ed11ea57c632c1ad6721a (commit) from 086c5c6be32d4ca9232523cd64caf6d29aaac42c (commit) - Log ----------------------------------------------------------------- commit c7476b9cf075aba2dd2ed11ea57c632c1ad6721a Author: Mason S <mas...@ma...> Date: Thu Aug 5 14:36:37 2010 -0400 Added more handling to deal with data node connection failures. This includes forcing the release of connections in an unexpected state and bug fixes. This was written by Andrei Martsinchyk, with some additional handling added by Mason. diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c index e390f8a..82eca8c 100644 --- a/contrib/pgbench/pgbench.c +++ b/contrib/pgbench/pgbench.c @@ -205,9 +205,9 @@ static char *tpc_b_bid = { "\\setrandom tid 1 :ntellers\n" "\\setrandom delta -5000 5000\n" "BEGIN;\n" - "UPDATE pgbench_accounts SET abalance = abalance + :delta WHERE aid = :aid AND bid = :bid;\n" - "SELECT abalance FROM pgbench_accounts WHERE aid = :aid AND bid = :bid\n" - "UPDATE pgbench_tellers SET tbalance = tbalance + :delta WHERE tid = :tid AND bid = :bid;\n" + "UPDATE pgbench_accounts SET abalance = abalance + :delta WHERE aid = :aid;\n" + "SELECT abalance FROM pgbench_accounts WHERE aid = :aid\n" + "UPDATE pgbench_tellers SET tbalance = tbalance + :delta WHERE tid = :tid;\n" "UPDATE pgbench_branches SET bbalance = bbalance + :delta WHERE bid = :bid;\n" "INSERT INTO pgbench_history (tid, bid, aid, delta, mtime) VALUES (:tid, :bid, :aid, :delta, CURRENT_TIMESTAMP);\n" "END;\n" diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 491d0d5..673aad1 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -135,7 +135,7 @@ typedef struct TransactionStateData { TransactionId transactionId; /* my XID, or Invalid if none */ #ifdef PGXC /* PGXC_COORD */ - GlobalTransactionId globalTransactionId; /* my GXID, or Invalid if none */ + GlobalTransactionId globalTransactionId; /* my GXID, or Invalid if none */ #endif SubTransactionId subTransactionId; /* my subxact ID */ char *name; /* savepoint name, if any */ @@ -314,7 +314,7 @@ GetCurrentGlobalTransactionId(void) * GetGlobalTransactionId * * This will return the GXID of the specified transaction, - * getting one from the GTM if it's not yet set. + * getting one from the GTM if it's not yet set. */ static GlobalTransactionId GetGlobalTransactionId(TransactionState s) @@ -469,7 +469,7 @@ AssignTransactionId(TransactionState s) if (IS_PGXC_COORDINATOR) { s->transactionId = (TransactionId) GetGlobalTransactionId(s); - elog(DEBUG1, "New transaction id assigned = %d, isSubXact = %s", + elog(DEBUG1, "New transaction id assigned = %d, isSubXact = %s", s->transactionId, isSubXact ? "true" : "false"); } else @@ -1679,6 +1679,14 @@ CommitTransaction(void) */ AtEOXact_UpdateFlatFiles(true); +#ifdef PGXC + /* + * There can be error on the data nodes. 
So go to data nodes before + * changing transaction state and local clean up + */ + DataNodeCommit(); +#endif + /* Prevent cancel/die interrupt while cleaning up */ HOLD_INTERRUPTS(); @@ -1694,13 +1702,13 @@ CommitTransaction(void) latestXid = RecordTransactionCommit(); TRACE_POSTGRESQL_TRANSACTION_COMMIT(MyProc->lxid); + #ifdef PGXC + /* + * Now we can let GTM know about transaction commit + */ if (IS_PGXC_COORDINATOR) { - /* Make sure this committed on the DataNodes, - * if so it will just return - */ - DataNodeCommit(DestNone); CommitTranGTM(s->globalTransactionId); latestXid = s->globalTransactionId; } @@ -1712,7 +1720,7 @@ CommitTransaction(void) CommitTranGTM((GlobalTransactionId) latestXid); } #endif - + /* * Let others know about no transaction in progress by me. Note that this * must be done _before_ releasing locks we hold and _after_ @@ -1808,7 +1816,7 @@ CommitTransaction(void) s->nChildXids = 0; s->maxChildXids = 0; -#ifdef PGXC +#ifdef PGXC if (IS_PGXC_COORDINATOR) s->globalTransactionId = InvalidGlobalTransactionId; else if (IS_PGXC_DATANODE) @@ -2143,10 +2151,10 @@ AbortTransaction(void) #ifdef PGXC if (IS_PGXC_COORDINATOR) { - /* Make sure this is rolled back on the DataNodes, - * if so it will just return + /* Make sure this is rolled back on the DataNodes, + * if so it will just return */ - DataNodeRollback(DestNone); + DataNodeRollback(); RollbackTranGTM(s->globalTransactionId); latestXid = s->globalTransactionId; } diff --git a/src/backend/pgxc/pool/datanode.c b/src/backend/pgxc/pool/datanode.c index 517b1e4..0f4072d 100644 --- a/src/backend/pgxc/pool/datanode.c +++ b/src/backend/pgxc/pool/datanode.c @@ -199,6 +199,9 @@ data_node_init(DataNodeHandle *handle, int sock, int nodenum) handle->sock = sock; handle->transaction_status = 'I'; handle->state = DN_CONNECTION_STATE_IDLE; +#ifdef DN_CONNECTION_DEBUG + handle->have_row_desc = false; +#endif handle->error = NULL; handle->outEnd = 0; handle->inStart = 0; @@ -211,7 +214,7 @@ data_node_init(DataNodeHandle *handle, int sock, int nodenum) * Wait while at least one of specified connections has data available and read * the data into the buffer */ -void +int data_node_receive(const int conn_count, DataNodeHandle ** connections, struct timeval * timeout) { @@ -239,7 +242,7 @@ data_node_receive(const int conn_count, * Return if we do not have connections to receive input */ if (nfds == 0) - return; + return 0; retry: res_select = select(nfds + 1, &readfds, NULL, NULL, timeout); @@ -249,27 +252,19 @@ retry: if (errno == EINTR || errno == EAGAIN) goto retry; - /* - * PGXCTODO - we may want to close the connections and notify the - * pooler that these are invalid. 
- */ if (errno == EBADF) { - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("select() bad file descriptor set"))); + elog(WARNING, "select() bad file descriptor set"); } - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("select() error: %d", errno))); + elog(WARNING, "select() error: %d", errno); + return errno; } if (res_select == 0) { /* Handle timeout */ - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("timeout while waiting for response"))); + elog(WARNING, "timeout while waiting for response"); + return EOF; } /* read data */ @@ -283,10 +278,9 @@ retry: if (read_status == EOF || read_status < 0) { - /* PGXCTODO - we should notify the pooler to destroy the connections */ - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("unexpected EOF on datanode connection"))); + add_error_message(conn, "unexpected EOF on datanode connection"); + elog(WARNING, "unexpected EOF on datanode connection"); + return EOF; } else { @@ -294,6 +288,7 @@ retry: } } } + return 0; } @@ -522,7 +517,7 @@ get_message(DataNodeHandle *conn, int *len, char **msg) * ensure_in_buffer_capacity() will immediately return */ ensure_in_buffer_capacity(5 + (size_t) *len, conn); - conn->state == DN_CONNECTION_STATE_QUERY; + conn->state = DN_CONNECTION_STATE_QUERY; conn->inCursor = conn->inStart; return '\0'; } @@ -539,19 +534,27 @@ void release_handles(void) { int i; + int discard[NumDataNodes]; + int ndisc = 0; if (node_count == 0) return; - PoolManagerReleaseConnections(); for (i = 0; i < NumDataNodes; i++) { DataNodeHandle *handle = &handles[i]; if (handle->sock != NO_SOCKET) + { + if (handle->state != DN_CONNECTION_STATE_IDLE) + { + elog(WARNING, "Connection to data node %d has unexpected state %d and will be dropped", handle->nodenum, handle->state); + discard[ndisc++] = handle->nodenum; + } data_node_free(handle); + } } - + PoolManagerReleaseConnections(ndisc, discard); node_count = 0; } @@ -897,7 +900,7 @@ void add_error_message(DataNodeHandle *handle, const char *message) { handle->transaction_status = 'E'; - handle->state = DN_CONNECTION_STATE_IDLE; + handle->state = DN_CONNECTION_STATE_ERROR_NOT_READY; if (handle->error) { /* PGXCTODO append */ diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index c6f9042..bbedef0 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -24,6 +24,7 @@ #include "miscadmin.h" #include "pgxc/execRemote.h" #include "pgxc/poolmgr.h" +#include "storage/ipc.h" #include "utils/datum.h" #include "utils/memutils.h" #include "utils/tuplesort.h" @@ -40,9 +41,10 @@ static bool autocommit = true; static DataNodeHandle **write_node_list = NULL; static int write_node_count = 0; -static int data_node_begin(int conn_count, DataNodeHandle ** connections, CommandDest dest, GlobalTransactionId gxid); -static int data_node_commit(int conn_count, DataNodeHandle ** connections, CommandDest dest); -static int data_node_rollback(int conn_count, DataNodeHandle ** connections, CommandDest dest); +static int data_node_begin(int conn_count, DataNodeHandle ** connections, + GlobalTransactionId gxid); +static int data_node_commit(int conn_count, DataNodeHandle ** connections); +static int data_node_rollback(int conn_count, DataNodeHandle ** connections); static void clear_write_node_list(); @@ -920,7 +922,7 @@ FetchTuple(RemoteQueryState *combiner, TupleTableSlot *slot) /* * Handle responses from the Data node connections */ -static void +static int 
data_node_receive_responses(const int conn_count, DataNodeHandle ** connections, struct timeval * timeout, RemoteQueryState *combiner) { @@ -940,7 +942,8 @@ data_node_receive_responses(const int conn_count, DataNodeHandle ** connections, { int i = 0; - data_node_receive(count, to_receive, timeout); + if (data_node_receive(count, to_receive, timeout)) + return EOF; while (i < count) { int result = handle_response(to_receive[i], combiner); @@ -959,12 +962,17 @@ data_node_receive_responses(const int conn_count, DataNodeHandle ** connections, break; default: /* Inconsistent responses */ - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected response from the data nodes, result = %d, request type %d", result, combiner->request_type))); + add_error_message(to_receive[i], "Unexpected response from the data nodes"); + elog(WARNING, "Unexpected response from the data nodes, result = %d, request type %d", result, combiner->request_type); + /* Stop tracking and move last connection in place */ + count--; + if (i < count) + to_receive[i] = to_receive[count]; } } } + + return 0; } /* @@ -990,6 +998,18 @@ handle_response(DataNodeHandle * conn, RemoteQueryState *combiner) if (conn->state == DN_CONNECTION_STATE_QUERY) return RESPONSE_EOF; + /* + * If we are in the process of shutting down, we + * may be rolling back, and the buffer may contain other messages. + * We want to avoid a procarray exception + * as well as an error stack overflow. + */ + if (proc_exit_inprogress) + { + conn->state = DN_CONNECTION_STATE_ERROR_FATAL; + return RESPONSE_EOF; + } + /* TODO handle other possible responses */ switch (get_message(conn, &msg_len, &msg)) { @@ -1005,10 +1025,17 @@ handle_response(DataNodeHandle * conn, RemoteQueryState *combiner) HandleCommandComplete(combiner, msg, msg_len); break; case 'T': /* RowDescription */ +#ifdef DN_CONNECTION_DEBUG + Assert(!conn->have_row_desc); + conn->have_row_desc = true; +#endif if (HandleRowDescription(combiner, msg, msg_len)) return RESPONSE_TUPDESC; break; case 'D': /* DataRow */ +#ifdef DN_CONNECTION_DEBUG + Assert(conn->have_row_desc); +#endif HandleDataRow(combiner, msg, msg_len); return RESPONSE_DATAROW; case 'G': /* CopyInResponse */ @@ -1042,6 +1069,9 @@ handle_response(DataNodeHandle * conn, RemoteQueryState *combiner) case 'Z': /* ReadyForQuery */ conn->transaction_status = msg[0]; conn->state = DN_CONNECTION_STATE_IDLE; +#ifdef DN_CONNECTION_DEBUG + conn->have_row_desc = false; +#endif return RESPONSE_COMPLETE; case 'I': /* EmptyQuery */ default: @@ -1058,7 +1088,8 @@ handle_response(DataNodeHandle * conn, RemoteQueryState *combiner) * Send BEGIN command to the Data nodes and receive responses */ static int -data_node_begin(int conn_count, DataNodeHandle ** connections, CommandDest dest, GlobalTransactionId gxid) +data_node_begin(int conn_count, DataNodeHandle ** connections, + GlobalTransactionId gxid) { int i; struct timeval *timeout = NULL; @@ -1078,7 +1109,8 @@ data_node_begin(int conn_count, DataNodeHandle ** connections, CommandDest dest, combiner->dest = None_Receiver; /* Receive responses */ - data_node_receive_responses(conn_count, connections, timeout, combiner); + if (data_node_receive_responses(conn_count, connections, timeout, combiner)) + return EOF; /* Verify status */ return ValidateAndCloseCombiner(combiner) ? 
0 : EOF; @@ -1109,12 +1141,12 @@ DataNodeBegin(void) /* - * Commit current transaction, use two-phase commit if necessary + * Commit current transaction on data nodes where it has been started */ -int -DataNodeCommit(CommandDest dest) +void +DataNodeCommit(void) { - int res; + int res = 0; int tran_count; DataNodeHandle *connections[NumDataNodes]; @@ -1128,7 +1160,7 @@ DataNodeCommit(CommandDest dest) if (tran_count == 0) goto finish; - res = data_node_commit(tran_count, connections, dest); + res = data_node_commit(tran_count, connections); finish: /* In autocommit mode statistics is collected in DataNodeExec */ @@ -1138,15 +1170,19 @@ finish: release_handles(); autocommit = true; clear_write_node_list(); - return res; + if (res != 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not commit connection on data nodes"))); } /* - * Send COMMIT or PREPARE/COMMIT PREPARED down to the Data nodes and handle responses + * Commit transaction on specified data node connections, use two-phase commit + * if more then on one node data have been modified during the transactioon. */ static int -data_node_commit(int conn_count, DataNodeHandle ** connections, CommandDest dest) +data_node_commit(int conn_count, DataNodeHandle ** connections) { int i; struct timeval *timeout = NULL; @@ -1191,13 +1227,12 @@ data_node_commit(int conn_count, DataNodeHandle ** connections, CommandDest dest combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_NONE); combiner->dest = None_Receiver; /* Receive responses */ - data_node_receive_responses(conn_count, connections, timeout, combiner); + if (data_node_receive_responses(conn_count, connections, timeout, combiner)) + result = EOF; /* Reset combiner */ if (!ValidateAndResetCombiner(combiner)) - { result = EOF; - } } if (!do2PC) @@ -1238,7 +1273,8 @@ data_node_commit(int conn_count, DataNodeHandle ** connections, CommandDest dest combiner->dest = None_Receiver; } /* Receive responses */ - data_node_receive_responses(conn_count, connections, timeout, combiner); + if (data_node_receive_responses(conn_count, connections, timeout, combiner)) + result = EOF; result = ValidateAndCloseCombiner(combiner) ? result : EOF; finish: @@ -1253,7 +1289,7 @@ finish: * Rollback current transaction */ int -DataNodeRollback(CommandDest dest) +DataNodeRollback(void) { int res = 0; int tran_count; @@ -1269,7 +1305,7 @@ DataNodeRollback(CommandDest dest) if (tran_count == 0) goto finish; - res = data_node_rollback(tran_count, connections, dest); + res = data_node_rollback(tran_count, connections); finish: /* In autocommit mode statistics is collected in DataNodeExec */ @@ -1287,24 +1323,23 @@ finish: * Send ROLLBACK command down to the Data nodes and handle responses */ static int -data_node_rollback(int conn_count, DataNodeHandle ** connections, CommandDest dest) +data_node_rollback(int conn_count, DataNodeHandle ** connections) { int i; struct timeval *timeout = NULL; - int result = 0; RemoteQueryState *combiner; /* Send ROLLBACK - */ for (i = 0; i < conn_count; i++) { - if (data_node_send_query(connections[i], "ROLLBACK")) - result = EOF; + data_node_send_query(connections[i], "ROLLBACK"); } combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_NONE); combiner->dest = None_Receiver; /* Receive responses */ - data_node_receive_responses(conn_count, connections, timeout, combiner); + if (data_node_receive_responses(conn_count, connections, timeout, combiner)) + return EOF; /* Verify status */ return ValidateAndCloseCombiner(combiner) ? 
0 : EOF; @@ -1404,7 +1439,7 @@ DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot, bool is_ if (new_count > 0 && need_tran) { /* Start transaction on connections where it is not started */ - if (data_node_begin(new_count, newConnections, DestNone, gxid)) + if (data_node_begin(new_count, newConnections, gxid)) { pfree(connections); pfree(copy_connections); @@ -1448,8 +1483,8 @@ DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot, bool is_ combiner->dest = None_Receiver; /* Receive responses */ - data_node_receive_responses(conn_count, connections, timeout, combiner); - if (!ValidateAndCloseCombiner(combiner)) + if (data_node_receive_responses(conn_count, connections, timeout, combiner) + || !ValidateAndCloseCombiner(combiner)) { if (autocommit) { @@ -1665,6 +1700,12 @@ DataNodeCopyOut(Exec_Nodes *exec_nodes, DataNodeHandle** copy_connections, FILE* ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), errmsg("unexpected EOF on datanode connection"))); + else + /* + * Set proper connection status - handle_response + * has changed it to DN_CONNECTION_STATE_QUERY + */ + handle->state = DN_CONNECTION_STATE_COPY_OUT; } /* There is no more data that can be read from connection */ } @@ -1746,7 +1787,7 @@ DataNodeCopyFinish(DataNodeHandle** copy_connections, int primary_data_node, combiner = CreateResponseCombiner(conn_count + 1, combine_type); combiner->dest = None_Receiver; - data_node_receive_responses(1, &primary_handle, timeout, combiner); + error = data_node_receive_responses(1, &primary_handle, timeout, combiner) || error; } for (i = 0; i < conn_count; i++) @@ -1786,30 +1827,14 @@ DataNodeCopyFinish(DataNodeHandle** copy_connections, int primary_data_node, combiner = CreateResponseCombiner(conn_count, combine_type); combiner->dest = None_Receiver; } - data_node_receive_responses(conn_count, connections, timeout, combiner); + error = (data_node_receive_responses(conn_count, connections, timeout, combiner) != 0) || error; processed = combiner->row_count; if (!ValidateAndCloseCombiner(combiner) || error) - { - if (autocommit) - { - if (need_tran) - DataNodeRollback(DestNone); - else - if (!PersistentConnections) release_handles(); - } - - return 0; - } - - if (autocommit) - { - if (need_tran) - DataNodeCommit(DestNone); - else - if (!PersistentConnections) release_handles(); - } + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Error while running COPY"))); return processed; } @@ -1882,9 +1907,17 @@ copy_slot(RemoteQueryState *node, TupleTableSlot *src, TupleTableSlot *dst) else { int i; + + /* + * Data node may be sending junk columns which are always at the end, + * but it must not be shorter then result slot. + */ + Assert(dst->tts_tupleDescriptor->natts <= src->tts_tupleDescriptor->natts); ExecClearTuple(dst); slot_getallattrs(src); - /* PGXCTODO revisit: probably incorrect */ + /* + * PGXCTODO revisit: if it is correct to copy Datums using assignment? + */ for (i = 0; i < dst->tts_tupleDescriptor->natts; i++) { dst->tts_values[i] = src->tts_values[i]; @@ -1911,6 +1944,8 @@ ExecRemoteQuery(RemoteQueryState *node) EState *estate = node->ss.ps.state; TupleTableSlot *resultslot = node->ss.ps.ps_ResultTupleSlot; TupleTableSlot *scanslot = node->ss.ss_ScanTupleSlot; + bool have_tuple = false; + if (!node->query_Done) { @@ -1974,7 +2009,7 @@ ExecRemoteQuery(RemoteQueryState *node) else need_tran = !autocommit || total_conn_count > 1; - elog(DEBUG1, "autocommit = %s, has primary = %s, regular_conn_count = %d, need_tran = %s", autocommit ? 
"true" : "false", primarynode ? "true" : "false", regular_conn_count, need_tran ? "true" : "false"); + elog(DEBUG1, "autocommit = %s, has primary = %s, regular_conn_count = %d, statement_need_tran = %s", autocommit ? "true" : "false", primarynode ? "true" : "false", regular_conn_count, need_tran ? "true" : "false"); stat_statement(); if (autocommit) @@ -2052,7 +2087,7 @@ ExecRemoteQuery(RemoteQueryState *node) new_connections[new_count++] = connections[i]; if (new_count) - data_node_begin(new_count, new_connections, DestNone, gxid); + data_node_begin(new_count, new_connections, gxid); } /* See if we have a primary nodes, execute on it first before the others */ @@ -2088,36 +2123,22 @@ ExecRemoteQuery(RemoteQueryState *node) while (node->command_complete_count < 1) { - PG_TRY(); - { - data_node_receive(1, primaryconnection, NULL); - while (handle_response(primaryconnection[0], node) == RESPONSE_EOF) - data_node_receive(1, primaryconnection, NULL); - if (node->errorMessage) - { - char *code = node->errorCode; + if (data_node_receive(1, primaryconnection, NULL)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to read response from data nodes"))); + while (handle_response(primaryconnection[0], node) == RESPONSE_EOF) + if (data_node_receive(1, primaryconnection, NULL)) ereport(ERROR, - (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])), - errmsg("%s", node->errorMessage))); - } - } - /* If we got an error response return immediately */ - PG_CATCH(); + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to read response from data nodes"))); + if (node->errorMessage) { - /* We are going to exit, so release combiner */ - if (autocommit) - { - if (need_tran) - DataNodeRollback(DestNone); - else if (!PersistentConnections) - release_handles(); - } - - pfree(primaryconnection); - pfree(connections); - PG_RE_THROW(); + char *code = node->errorCode; + ereport(ERROR, + (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])), + errmsg("%s", node->errorMessage))); } - PG_END_TRY(); } pfree(primaryconnection); } @@ -2148,8 +2169,6 @@ ExecRemoteQuery(RemoteQueryState *node) } } - PG_TRY(); - { /* * Stop if all commands are completed or we got a data row and * initialized state node for subsequent invocations @@ -2158,7 +2177,10 @@ ExecRemoteQuery(RemoteQueryState *node) { int i = 0; - data_node_receive(regular_conn_count, connections, NULL); + if (data_node_receive(regular_conn_count, connections, NULL)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to read response from data nodes"))); /* * Handle input from the data nodes. * If we got a RESPONSE_DATAROW we can break handling to wrap @@ -2234,185 +2256,148 @@ ExecRemoteQuery(RemoteQueryState *node) } } } - } - /* If we got an error response return immediately */ - PG_CATCH(); - { - /* We are going to exit, so release combiner */ - if (autocommit) - { - if (need_tran) - DataNodeRollback(DestNone); - else if (!PersistentConnections) - release_handles(); - } - PG_RE_THROW(); - } - PG_END_TRY(); + node->query_Done = true; - node->need_tran = need_tran; } - PG_TRY(); + if (node->tuplesortstate) { - bool have_tuple = false; - - if (node->tuplesortstate) + while (tuplesort_gettupleslot((Tuplesortstate *) node->tuplesortstate, + true, scanslot)) { - while (tuplesort_gettupleslot((Tuplesortstate *) node->tuplesortstate, - true, scanslot)) + have_tuple = true; + /* + * If DISTINCT is specified and current tuple matches to + * previous skip it and get next one. 
+ * Othervise return current tuple + */ + if (step->distinct) { - have_tuple = true; /* - * If DISTINCT is specified and current tuple matches to - * previous skip it and get next one. - * Othervise return current tuple + * Always receive very first tuple and + * skip to next if scan slot match to previous (result slot) */ - if (step->distinct) + if (!TupIsNull(resultslot) && + execTuplesMatch(scanslot, + resultslot, + step->distinct->numCols, + step->distinct->uniqColIdx, + node->eqfunctions, + node->tmp_ctx)) { - /* - * Always receive very first tuple and - * skip to next if scan slot match to previous (result slot) - */ - if (!TupIsNull(resultslot) && - execTuplesMatch(scanslot, - resultslot, - step->distinct->numCols, - step->distinct->uniqColIdx, - node->eqfunctions, - node->tmp_ctx)) - { - have_tuple = false; - continue; - } + have_tuple = false; + continue; } - copy_slot(node, scanslot, resultslot); - (*node->dest->receiveSlot) (resultslot, node->dest); - break; } - if (!have_tuple) - ExecClearTuple(resultslot); + copy_slot(node, scanslot, resultslot); + (*node->dest->receiveSlot) (resultslot, node->dest); + break; } - else + if (!have_tuple) + ExecClearTuple(resultslot); + } + else + { + while (node->conn_count > 0 && !have_tuple) { - while (node->conn_count > 0 && !have_tuple) - { - int i; + int i; - /* - * If combiner already has tuple go ahead and return it - * otherwise tuple will be cleared - */ - if (FetchTuple(node, scanslot) && !TupIsNull(scanslot)) + /* + * If combiner already has tuple go ahead and return it + * otherwise tuple will be cleared + */ + if (FetchTuple(node, scanslot) && !TupIsNull(scanslot)) + { + if (node->simple_aggregates) { - if (node->simple_aggregates) - { - /* - * Advance aggregate functions and allow to read up next - * data row message and get tuple in the same slot on - * next iteration - */ - exec_simple_aggregates(node, scanslot); - } - else - { - /* - * Receive current slot and read up next data row - * message before exiting the loop. Next time when this - * function is invoked we will have either data row - * message ready or EOF - */ - copy_slot(node, scanslot, resultslot); - (*node->dest->receiveSlot) (resultslot, node->dest); - have_tuple = true; - } + /* + * Advance aggregate functions and allow to read up next + * data row message and get tuple in the same slot on + * next iteration + */ + exec_simple_aggregates(node, scanslot); } - - /* - * Handle input to get next row or ensure command is completed, - * starting from connection next after current. If connection - * does not - */ - if ((i = node->current_conn + 1) == node->conn_count) - i = 0; - - for (;;) + else { - int res = handle_response(node->connections[i], node); - if (res == RESPONSE_EOF) - { - /* go to next connection */ - if (++i == node->conn_count) - i = 0; - /* if we cycled over all connections we need to receive more */ - if (i == node->current_conn) - data_node_receive(node->conn_count, node->connections, NULL); - } - else if (res == RESPONSE_COMPLETE) - { - if (--node->conn_count == 0) - break; - if (i == node->conn_count) - i = 0; - else - node->connections[i] = node->connections[node->conn_count]; - if (node->current_conn == node->conn_count) - node->current_conn = i; - } - else if (res == RESPONSE_DATAROW) - { - node->current_conn = i; - break; - } + /* + * Receive current slot and read up next data row + * message before exiting the loop. 
Next time when this + * function is invoked we will have either data row + * message ready or EOF + */ + copy_slot(node, scanslot, resultslot); + (*node->dest->receiveSlot) (resultslot, node->dest); + have_tuple = true; } } /* - * We may need to finalize aggregates + * Handle input to get next row or ensure command is completed, + * starting from connection next after current. If connection + * does not */ - if (!have_tuple && node->simple_aggregates) + if ((i = node->current_conn + 1) == node->conn_count) + i = 0; + + for (;;) { - finish_simple_aggregates(node, resultslot); - if (!TupIsNull(resultslot)) + int res = handle_response(node->connections[i], node); + if (res == RESPONSE_EOF) { - (*node->dest->receiveSlot) (resultslot, node->dest); - have_tuple = true; + /* go to next connection */ + if (++i == node->conn_count) + i = 0; + /* if we cycled over all connections we need to receive more */ + if (i == node->current_conn) + if (data_node_receive(node->conn_count, node->connections, NULL)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to read response from data nodes"))); + } + else if (res == RESPONSE_COMPLETE) + { + if (--node->conn_count == 0) + break; + if (i == node->conn_count) + i = 0; + else + node->connections[i] = node->connections[node->conn_count]; + if (node->current_conn == node->conn_count) + node->current_conn = i; + } + else if (res == RESPONSE_DATAROW) + { + node->current_conn = i; + break; } } - - if (!have_tuple) /* report end of scan */ - ExecClearTuple(resultslot); - } - if (node->errorMessage) + /* + * We may need to finalize aggregates + */ + if (!have_tuple && node->simple_aggregates) { - char *code = node->errorCode; - ereport(ERROR, - (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])), - errmsg("%s", node->errorMessage))); + finish_simple_aggregates(node, resultslot); + if (!TupIsNull(resultslot)) + { + (*node->dest->receiveSlot) (resultslot, node->dest); + have_tuple = true; + } } - /* - * If command is completed we should commit work. - */ - if (node->conn_count == 0 && autocommit && node->need_tran) - DataNodeCommit(DestNone); + if (!have_tuple) /* report end of scan */ + ExecClearTuple(resultslot); + } - /* If we got an error response return immediately */ - PG_CATCH(); + + if (node->errorMessage) { - /* We are going to exit, so release combiner */ - if (autocommit) - { - if (node->need_tran) - DataNodeRollback(DestNone); - else if (!PersistentConnections) - release_handles(); - } - PG_RE_THROW(); + char *code = node->errorCode; + ereport(ERROR, + (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])), + errmsg("%s", node->errorMessage))); } - PG_END_TRY(); return resultslot; } @@ -2436,7 +2421,7 @@ DataNodeCleanAndRelease(int code, Datum arg) /* Rollback on Data Nodes */ if (IsTransactionState()) { - DataNodeRollback(DestNone); + DataNodeRollback(); /* Rollback on GTM if transaction id opened. 
*/ RollbackTranGTM((GlobalTransactionId) GetCurrentTransactionIdIfAny()); diff --git a/src/backend/pgxc/pool/poolcomm.c b/src/backend/pgxc/pool/poolcomm.c index 4625261..7e4771c 100644 --- a/src/backend/pgxc/pool/poolcomm.c +++ b/src/backend/pgxc/pool/poolcomm.c @@ -22,7 +22,9 @@ #include <errno.h> #include <stddef.h> #include "c.h" +#include "postgres.h" #include "pgxc/poolcomm.h" +#include "storage/ipc.h" #include "utils/elog.h" #include "miscadmin.h" @@ -408,9 +410,16 @@ pool_flush(PoolPort *port) if (errno != last_reported_send_errno) { last_reported_send_errno = errno; - ereport(ERROR, - (errcode_for_socket_access(), - errmsg("could not send data to client: %m"))); + + /* + * Handle a seg fault that may later occur in proc array + * when this fails when we are already shutting down + * If shutting down already, do not call. + */ + if (!proc_exit_inprogress) + ereport(ERROR, + (errcode_for_socket_access(), + errmsg("could not send data to client: %m"))); } /* diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 6427da3..dbb8aed 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -93,7 +93,7 @@ static DatabasePool *find_database_pool(const char *database); static DatabasePool *remove_database_pool(const char *database); static int *agent_acquire_connections(PoolAgent *agent, List *nodelist); static DataNodePoolSlot *acquire_connection(DatabasePool *dbPool, int node); -static void agent_release_connections(PoolAgent *agent, bool clean); +static void agent_release_connections(PoolAgent *agent, List *discard); static void release_connection(DatabasePool *dbPool, DataNodePoolSlot *slot, int index, bool clean); static void destroy_slot(DataNodePoolSlot *slot); static void grow_pool(DatabasePool *dbPool, int index); @@ -587,7 +587,7 @@ agent_init(PoolAgent *agent, const char *database, List *nodes) /* disconnect if we still connected */ if (agent->pool) - agent_release_connections(agent, false); + agent_release_connections(agent, NULL); /* find database */ agent->pool = find_database_pool(database); @@ -612,7 +612,7 @@ agent_destroy(PoolAgent *agent) /* Discard connections if any remaining */ if (agent->pool) - agent_release_connections(agent, false); + agent_release_connections(agent, NULL); /* find agent in the list */ for (i = 0; i < agentCount; i++) @@ -700,11 +700,6 @@ static void agent_handle_input(PoolAgent * agent, StringInfo s) { int qtype; - const char *database; - int nodecount; - List *nodelist = NIL; - int *fds; - int i; qtype = pool_getbyte(&agent->port); /* @@ -712,6 +707,12 @@ agent_handle_input(PoolAgent * agent, StringInfo s) */ for (;;) { + const char *database; + int nodecount; + List *nodelist = NIL; + int *fds; + int i; + switch (qtype) { case 'c': /* CONNECT */ @@ -729,9 +730,7 @@ agent_handle_input(PoolAgent * agent, StringInfo s) pool_getmessage(&agent->port, s, 4 * NumDataNodes + 8); nodecount = pq_getmsgint(s, 4); for (i = 0; i < nodecount; i++) - { nodelist = lappend_int(nodelist, pq_getmsgint(s, 4)); - } pq_getmsgend(s); /* * In case of error agent_acquire_connections will log @@ -744,9 +743,13 @@ agent_handle_input(PoolAgent * agent, StringInfo s) pfree(fds); break; case 'r': /* RELEASE CONNECTIONS */ - pool_getmessage(&agent->port, s, 4); + pool_getmessage(&agent->port, s, 4 * NumDataNodes + 8); + nodecount = pq_getmsgint(s, 4); + for (i = 0; i < nodecount; i++) + nodelist = lappend_int(nodelist, pq_getmsgint(s, 4)); pq_getmsgend(s); - agent_release_connections(agent, true); + 
agent_release_connections(agent, nodelist); + list_free(nodelist); break; default: /* EOF or protocol violation */ agent_destroy(agent); @@ -831,11 +834,24 @@ agent_acquire_connections(PoolAgent *agent, List *nodelist) * Retun connections back to the pool */ void -PoolManagerReleaseConnections(void) +PoolManagerReleaseConnections(int ndisc, int* discard) { + uint32 n32; + uint32 buf[1 + ndisc]; + int i; + Assert(Handle); - pool_putmessage(&Handle->port, 'r', NULL, 0); + n32 = htonl((uint32) ndisc); + buf[0] = n32; + + for (i = 0; i < ndisc;) + { + n32 = htonl((uint32) discard[i++]); + buf[i] = n32; + } + pool_putmessage(&Handle->port, 'r', (char *) buf, + (1 + ndisc) * sizeof(uint32)); pool_flush(&Handle->port); } @@ -844,23 +860,40 @@ PoolManagerReleaseConnections(void) * Release connections */ static void -agent_release_connections(PoolAgent *agent, bool clean) +agent_release_connections(PoolAgent *agent, List *discard) { int i; + DataNodePoolSlot *slot; + if (!agent->connections) return; - /* Enumerate connections */ - for (i = 0; i < NumDataNodes; i++) + if (discard) { - DataNodePoolSlot *slot; + ListCell *lc; + foreach(lc, discard) + { + int node = lfirst_int(lc); + Assert(node > 0 && node <= NumDataNodes); + slot = agent->connections[node - 1]; + + /* Discard connection */ + if (slot) + release_connection(agent->pool, slot, node - 1, false); + agent->connections[node - 1] = NULL; + } + } + + /* Remaining connections are assumed to be clean */ + for (i = 0; i < NumDataNodes; i++) + { slot = agent->connections[i]; /* Release connection */ if (slot) - release_connection(agent->pool, slot, i, clean); + release_connection(agent->pool, slot, i, true); agent->connections[i] = NULL; } } diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 553a682..ea66125 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -4393,11 +4393,11 @@ pgxc_transaction_stmt (Node *parsetree) break; case TRANS_STMT_COMMIT: - DataNodeCommit(DestNone); + DataNodeCommit(); break; case TRANS_STMT_ROLLBACK: - DataNodeRollback(DestNone); + DataNodeRollback(); break; default: diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index 077a589..75f1f41 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -2875,12 +2875,16 @@ reversedirection_heap(Tuplesortstate *state) static unsigned int getlen_datanode(Tuplesortstate *state, int tapenum, bool eofOK) { + DataNodeHandle *conn = state->combiner->connections[tapenum]; for (;;) { - switch (handle_response(state->combiner->connections[tapenum], state->combiner)) + switch (handle_response(conn, state->combiner)) { case RESPONSE_EOF: - data_node_receive(1, state->combiner->connections + tapenum, NULL); + if (data_node_receive(1, &conn, NULL)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg(conn->error))); break; case RESPONSE_COMPLETE: if (eofOK) diff --git a/src/include/pgxc/datanode.h b/src/include/pgxc/datanode.h index ab95022..849d84a 100644 --- a/src/include/pgxc/datanode.h +++ b/src/include/pgxc/datanode.h @@ -50,6 +50,9 @@ struct data_node_handle /* Connection state */ char transaction_status; DNConnectionState state; +#ifdef DN_CONNECTION_DEBUG + bool have_row_desc; +#endif char *error; /* Output buffer */ char *outBuffer; @@ -86,7 +89,7 @@ extern int data_node_send_query(DataNodeHandle * handle, const char *query); extern int data_node_send_gxid(DataNodeHandle * handle, GlobalTransactionId gxid); extern int 
data_node_send_snapshot(DataNodeHandle * handle, Snapshot snapshot); -extern void data_node_receive(const int conn_count, +extern int data_node_receive(const int conn_count, DataNodeHandle ** connections, struct timeval * timeout); extern int data_node_read_data(DataNodeHandle * conn); extern int send_some(DataNodeHandle * handle, int len); diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index d99806a..e9b59cc 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -62,7 +62,6 @@ typedef struct RemoteQueryState char errorCode[5]; /* error code to send back to client */ char *errorMessage; /* error message to send back to client */ bool query_Done; /* query has been sent down to data nodes */ - bool need_tran; /* auto commit on nodes after completion */ char *completionTag; /* completion tag to present to caller */ char *msg; /* last data row message */ int msglen; /* length of the data row message */ @@ -81,8 +80,8 @@ typedef struct RemoteQueryState /* Multinode Executor */ extern void DataNodeBegin(void); -extern int DataNodeCommit(CommandDest dest); -extern int DataNodeRollback(CommandDest dest); +extern void DataNodeCommit(void); +extern int DataNodeRollback(void); extern DataNodeHandle** DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot, bool is_from); extern int DataNodeCopyIn(char *data_row, int len, Exec_Nodes *exec_nodes, DataNodeHandle** copy_connections); diff --git a/src/include/pgxc/poolmgr.h b/src/include/pgxc/poolmgr.h index 2c9128e..b7ac3ae 100644 --- a/src/include/pgxc/poolmgr.h +++ b/src/include/pgxc/poolmgr.h @@ -45,7 +45,7 @@ typedef struct char *connstr; int freeSize; /* available connections */ int size; /* total pool size */ - DataNodePoolSlot **slot; + DataNodePoolSlot **slot; } DataNodePool; /* All pools for specified database */ @@ -57,7 +57,7 @@ typedef struct databasepool struct databasepool *next; } DatabasePool; -/* Agent of client session (Pool Manager side) +/* Agent of client session (Pool Manager side) * Acts as a session manager, grouping connections together */ typedef struct @@ -125,6 +125,6 @@ extern void PoolManagerConnect(PoolHandle *handle, const char *database); extern int *PoolManagerGetConnections(List *nodelist); /* Retun connections back to the pool */ -extern void PoolManagerReleaseConnections(void); +extern void PoolManagerReleaseConnections(int ndisc, int* discard); #endif ----------------------------------------------------------------------- Summary of changes: contrib/pgbench/pgbench.c | 6 +- src/backend/access/transam/xact.c | 32 ++- src/backend/pgxc/pool/datanode.c | 49 ++-- src/backend/pgxc/pool/execRemote.c | 449 +++++++++++++++++------------------- src/backend/pgxc/pool/poolcomm.c | 15 +- src/backend/pgxc/pool/poolmgr.c | 71 ++++-- src/backend/tcop/postgres.c | 4 +- src/backend/utils/sort/tuplesort.c | 8 +- src/include/pgxc/datanode.h | 5 +- src/include/pgxc/execRemote.h | 5 +- src/include/pgxc/poolmgr.h | 6 +- 11 files changed, 347 insertions(+), 303 deletions(-) hooks/post-receive -- Postgres-XC |
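[Editorial example] The 'r' (RELEASE CONNECTIONS) message in the commit above no longer travels as a bare 4-byte header: the client now packs a uint32 node count followed by one uint32 per data node index to discard, all in network byte order, and the pool manager unpacks the same layout. Below is a minimal standalone sketch of that payload packing and unpacking, assuming plain POSIX C; pack_release_message and unpack_release_message are made-up names for the illustration, not the actual pool manager code, and the framing added by pool_putmessage (message type and length) is left out.

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>
    #include <arpa/inet.h>          /* htonl/ntohl */

    /*
     * Pack a release payload: a uint32 count, then one uint32 per data
     * node index to discard, all in network byte order.  The caller
     * frees the returned buffer; its size is (1 + ndisc) * 4 bytes.
     */
    static uint32_t *
    pack_release_message(const int *discard, int ndisc)
    {
        uint32_t   *buf = malloc((1 + ndisc) * sizeof(uint32_t));
        int         i;

        buf[0] = htonl((uint32_t) ndisc);
        for (i = 0; i < ndisc; i++)
            buf[i + 1] = htonl((uint32_t) discard[i]);
        return buf;
    }

    /* Unpack on the receiving side, as the pool manager does for 'r'. */
    static void
    unpack_release_message(const uint32_t *buf)
    {
        uint32_t    nodecount = ntohl(buf[0]);
        uint32_t    i;

        for (i = 0; i < nodecount; i++)
            printf("discard connection to node %u\n", ntohl(buf[i + 1]));
    }

    int
    main(void)
    {
        int         discard[] = {2, 5}; /* hypothetical data node indexes */
        uint32_t   *msg = pack_release_message(discard, 2);

        unpack_release_message(msg);
        free(msg);
        return 0;
    }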
From: Michael P <mic...@us...> - 2010-08-04 23:38:17
|
Project "Postgres-XC". The branch, master has been updated via 086c5c6be32d4ca9232523cd64caf6d29aaac42c (commit) from d7ca431066efe320107581186ab853b28fa5f7a7 (commit) - Log ----------------------------------------------------------------- commit 086c5c6be32d4ca9232523cd64caf6d29aaac42c Author: Michael P <mic...@us...> Date: Thu Aug 5 08:36:24 2010 +0900 Correction of bugs in pgxc_ddl reported by Bug report 3039166 in Source Forge Tracker. Those bugs were linked with string management problems in the script. diff --git a/src/bin/scripts/pgxc_ddl b/src/bin/scripts/pgxc_ddl index efc2f69..2442595 100644 --- a/src/bin/scripts/pgxc_ddl +++ b/src/bin/scripts/pgxc_ddl @@ -125,17 +125,17 @@ fi hosts=`cat $PGXC_CONF | grep coordinator_hosts | cut -d "'" -f 2` ports=`cat $PGXC_CONF | grep coordinator_ports | cut -d "'" -f 2` folders=`cat $PGXC_CONF | grep coordinator_folders | cut -d "'" -f 2` -if [ "hosts" = "" ] +if [ "$hosts" = "" ] then echo "coordinator_hosts not defined in pgxc.conf" exit 2 fi -if [ "ports" = "" ] +if [ "$ports" = "" ] then echo "coordinator_ports not defined in pgxc.conf" exit 2 fi -if [ "folders" = "" ] +if [ "$folders" = "" ] then echo "coordinator_folders not defined in pgxc.conf" exit 2 @@ -276,7 +276,7 @@ fi #Main process begins #Check if the database is defined, This could lead to coordinator being stopped uselessly -if [ $DB_NAME != "" ] +if [ "$DB_NAME" != "" ] then #Simply launch a fake SQL on the Database wanted $PSQL_CLIENT -h ${COORD_HOSTNAMES[$COORD_ORIG_INDEX]} -p ${COORD_PORTS[$COORD_ORIG_INDEX]} -c 'select now()' -d $DB_NAME; err=$? ----------------------------------------------------------------------- Summary of changes: src/bin/scripts/pgxc_ddl | 8 ++++---- 1 files changed, 4 insertions(+), 4 deletions(-) hooks/post-receive -- Postgres-XC |
From: Koichi S. <koi...@us...> - 2010-07-22 02:33:49
|
Project "website". The branch, master has been updated via 8c2cfec1e2cf6263a2a1bbea33d09643fb6a942a (commit) via 35cd128872b78192c961c85a3982cee3ec2c2ca0 (commit) via d481c92b1778158839d16077325eb3e6052b83d9 (commit) from b318cefa94aafbf4724f619d2a29941cb5e3d615 (commit) - Log ----------------------------------------------------------------- commit 8c2cfec1e2cf6263a2a1bbea33d09643fb6a942a Author: Koichi Suzuki <koichi@willey.(none)> Date: Thu Jul 22 11:35:09 2010 +0900 Modified: roadmap.html Corrected upcoming version numbers and schedule. diff --git a/roadmap.html b/roadmap.html index 65167e8..4f8802e 100755 --- a/roadmap.html +++ b/roadmap.html @@ -62,9 +62,9 @@ Upcoming Releases and Features Current plan of future releases and features are as follows: </p> -<!-- ==== For version 1.0 ==== --> +<!-- ==== For version 0.9.3 ==== --> <h4> -Version 1.0 (Late in September, 2010) +Version 0.9.3 (Late in September, 2010) </h4> <p class="inner"> @@ -82,9 +82,9 @@ Forward Cursor (w/o <code>ORDER BY</code>)<br> subqueries<br> </p> -<!-- ==== Beyond Version 1.0 ==== --> +<!-- ==== For Version 1.0 ==== --> <h4> -Beyond Version 1.0 +Version 1.0 (Late in December, 2010) </h4> <p class="inner"> @@ -106,6 +106,17 @@ Tuple relocation (distrubute key update)<br> Performance improvement <br> Regression tests </p> + +<!-- === Beyond Version 1.0 === ---> +<h4> +Beyond Version 1.0 +</h4> + +<p class="inner"> +HA Capability<br> +GTM-Standby<br> +</p> + </body> </html> commit 35cd128872b78192c961c85a3982cee3ec2c2ca0 Merge: d481c92 b318cef Author: Koichi Suzuki <koichi@willey.(none)> Date: Thu Jul 22 11:25:58 2010 +0900 Merge branch 'master' of ssh://postgres-xc.git.sourceforge.net/gitroot/postgres-xc/pgxcweb commit d481c92b1778158839d16077325eb3e6052b83d9 Author: Koichi Suzuki <koichi@willey.localdomain> Date: Wed Jun 16 11:00:40 2010 +0900 New file: materials/Postgres-XC_20100521.pdf Added a likn to the above file, PGCon2010 presentation materials. diff --git a/download.html b/download.html index cfe22b6..d2b6a44 100755 --- a/download.html +++ b/download.html @@ -159,6 +159,13 @@ Description of the outline of Postgres-XC internals.   </h4> +<!-- PGCon2010 Presentation Materials --> +<h4> +<a href="materials/Postgres-XC_20100521.pdf"> +Presentation material for PGCon2010. +</a> + +<!-- previous versions --> <h4> <a href="prev_vers/version0_9.html" target="main">Previous Versions</a>   diff --git a/download.html b/download.html~ similarity index 100% copy from download.html copy to download.html~ diff --git a/materials/Postgres-XC_20100521.pdf b/materials/Postgres-XC_20100521.pdf new file mode 100644 index 0000000..6915b3f Binary files /dev/null and b/materials/Postgres-XC_20100521.pdf differ ----------------------------------------------------------------------- Summary of changes: download.html | 7 +++ download.html => download.html~ | 78 ++++++++++++++++++------------------ materials/Postgres-XC_20100521.pdf | Bin 0 -> 1087782 bytes roadmap.html | 19 +++++++-- 4 files changed, 61 insertions(+), 43 deletions(-) copy download.html => download.html~ (66%) create mode 100644 materials/Postgres-XC_20100521.pdf hooks/post-receive -- website |
From: Michael P <mic...@us...> - 2010-07-22 02:10:58
|
Project "website". The branch, master has been updated via b318cefa94aafbf4724f619d2a29941cb5e3d615 (commit) via 84caa6fc61b61ea5db46269f4889615b1fa09f76 (commit) from 95fa0dca242148522ad78a1b0df1e1ebae9397fc (commit) - Log ----------------------------------------------------------------- commit b318cefa94aafbf4724f619d2a29941cb5e3d615 Author: Michael P <mic...@us...> Date: Thu Jul 22 11:09:25 2010 +0900 Events and current release number corrected. Some typing issues with members corrected diff --git a/events.html b/events.html index 52fee74..1cb57b5 100755 --- a/events.html +++ b/events.html @@ -12,24 +12,28 @@ ==== UPCOMING EVENTS ==== --> <h2 class="plain">Events</h2> -<!-- CHAR(10) --> +<p class="plain"> +Upcoming events to be decided soon! +</p> + +<!-- Event title --> +<!-- <p class="plain"> Jul.1 to 3, 2010, -<a href="http://www.char10.org/" target="_blank"> -CHAR(10) +<a href="http link to this event" target="_blank"> +Event title </a> -conference dedicated for PostgreSQL cluster. +Description of this event. </p> +--> <!-- UPDATES --> -<h2 class="plain"> -Updates -</h2> -<!-- Postgres-XC 0.9.1 download --> +<h2 class="plain">Updates</h2> +<!-- Postgres-XC 0.9.2 download --> <p class="plain"> -Postgres-XC 0.9.1 is now available!! Download -<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.1/pgxc_v0_9_1.tar.gz/download" target="_blank"> +Postgres-XC 0.9.2 is now available!! Download +<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.2/pgxc_v0_9_2.tar.gz/download" target="_blank"> here. </a> </p> diff --git a/members.html b/members.html index 17ae2a0..c230c54 100755 --- a/members.html +++ b/members.html @@ -26,31 +26,31 @@ Postgres-XC development team <h4>Koichi Suzuki</h4> <p class="inner"> -Project leader and architect. -His background includes object relational database engine (UniSQL) and +Project leader and architect.<br> +His background includes object relational database engine (UniSQL) and<br> PostgreSQL development. </p> <h4>Mason Sharp</h4> <p class="inner"> -Architect and development leader. -Coordinator developer. -He is also the main architect of GridSQL database cluster. +Architect and development leader.<br> +Coordinator developer.<br> +He is also the main architect of GridSQL database cluster.<br> </p> <h4>Pavan Deolasee</h4> <p class="inner"> -Global Transaction Manager developer. -He is well known as HOT developer in PostgreSQL. -He is also helping in source code review and PostgreSQL internals. +Global Transaction Manager developer.<br> +He is well known as HOT developer in PostgreSQL.<br> +He is also helping in source code review and PostgreSQL internals.<br> </p> <h4>Andrei Martsinchyk</h4> <p class="inner"> -Data Node and connection pooling developer. +Data Node and connection pooling developer.<br> He is also GridSQL developer and is now developping aggregate functions and other cross-node operation. </p> @@ -58,8 +58,9 @@ functions and other cross-node operation. <h4>Michael Paquier</h4> <p class="inner"> -Coordinator feature developer. Currently working on user-defined function and DDLs. -He helped in modifying DBT-1 benchmark for Postgres-XC. +Coordinator feature developer.<br> +Currently working on user-defined function, Sequence handling and Global values.<br> +He helped in modifying DBT-1 benchmark for Postgres-XC.<br> He also contributed to enhance pgbench and 2PC. 
</p> @@ -72,7 +73,7 @@ Test, performance evaluation and analysis, related documents and utilities. <h4>Devrim Gunduz</h4> <p class="inner"> -Binary buiding for releases. He is also developping binary packages of PostgreSQL. +Binary buiding for releases.<br> He is also developping binary packages of PostgreSQL. </p> </body> commit 84caa6fc61b61ea5db46269f4889615b1fa09f76 Author: Michael P <mic...@us...> Date: Thu Jul 22 10:55:52 2010 +0900 Website update with Postgres-XC 0.9.2 release diff --git a/download.html b/download.html index cfe22b6..584eab1 100755 --- a/download.html +++ b/download.html @@ -25,8 +25,8 @@ List of Release Materials </h3> <p> -The current release includes the following materials. -Please note that documentation is not included in the source material. +The current release includes the following materials.<br> +Please note that documentation is not included in the source material.<br> Please download documentation from <a href="https://sourceforge.net/projects/postgres-xc/files/" target="_blank"> the project download page. @@ -36,121 +36,121 @@ the project download page. Please also note tarball files do not include Postgres-XC documents. </p> -<!-- Documents of version 0.9.1 --> +<!-- Documents of version 0.9.2 --> <h4> -Version 0.9.1 +Version 0.9.2 </h4> <p> <ul> -<!-- tarball --> +<!-- tarball of 0.9.2, main download--> <li> -<code>pgxc_v0.9.1.tar.gz</code>:  -Latest version of Postgres-XC available. +<code>pgxc_v0.9.2.tar.gz</code>: <br> +Latest version of Postgres-XC available.<br> Please note that Postgres-XC documentation is not included in this file. ⇒ -<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.1/pgxc_v0_9_1.tar.gz/download" target="_blank"> +<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.2/pgxc_v0_9_2.tar.gz/download" target="_blank"> (download) </a> </li> <!-- tarball (diff) --> <li> -<code>PGXC_v0_9_1-PG_REL8_4_3.patch.gz</code>:  +<code>PGXC_v0_9_2-PG_REL8_4_3.patch.gz</code>: <br> The same material as above, but this file includes only the patch to apply -to the PostgreSQL 8.4.3 release source code. +to the PostgreSQL 8.4.3 release source code.<br> It is useful if you would like to see just a difference between PostgreSQL -and Postgres-XC. +and Postgres-XC.<br> No Postgres-XC documentation is included in this file either. ⇒ -<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.1/PGXC_v0_9_1-PG_REL8_4_3.patch.gz/download" target="_blank"> +<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.2/PGXC_v0_9_2-PG_REL8_4_3.patch.gz/download" target="_blank"> (download) </a> </li> <!-- License --> <li> -<code>COPYING</code>:  +<code>COPYING</code>: <br> License description. Postgres-XC is distributed under LGPL version 2.1 ⇒ -<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.1/COPYING/download" target="_blank"> +<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.2/COPYING/download" target="_blank"> (download) </a> </li> <!-- Files --> <li> -<code>FILES</code>:  -Description of files included in Postgres-XC 0.9.1 release. +<code>FILES</code>: <br> +Description of files included in Postgres-XC 0.9.2 release.
⇒ -<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.1/FILES/download" target="_blank"> -(download) -</a> -</li> - -<!-- Readme --> -<li> -<code>README</code>:  -Overview of the release. -⇒ -<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.1/README/download" target="_blank"> +<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.2/FILES/download" target="_blank"> (download) </a> </li> <!-- Reference Manual --> <li> -<code>PG-XC_ReferenceManual_v0_9_1.pdf</code>:  +<code>PG-XC_ReferenceManual_v0_9_2.pdf</code>: <br> Reference of Postgres-XC extension. ⇒ -<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.1/PG-XC_ReferenceManual_v0_9_1.pdf/download" target="_blank"> +<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.2/PG-XC_ReferenceManual_v0_9_2.pdf/download" target="_blank"> (download) </a> </li> <!-- pgbench Tutorial Manual --> <li> -<code>PG-XC_pgbench_Tutorial_v0_9_1.pdf</code>:  +<code>PG-XC_pgbench_Tutorial_v0_9_2.pdf</code>: <br> Step by step description how to build and configure pgbench to run with Postgres-XC. ⇒ -<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.1/PG-XC_pgbench_Tutorial_v0_9_1.pdf/download" target="_blank"> +<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.2/PG-XC_pgbench_Tutorial_v0_9_2.pdf/download" target="_blank"> (download) </a> </li> <!-- DBT-1 Tutorial Manual --> <li> -<code>PG-XC_DBT1_Tutorial_v0_9_1.pdf</code>:  +<code>PG-XC_DBT1_Tutorial_v0_9_2.pdf</code>: <br> Step by step description how to build and configure DBT-1 to run with Postgres-XC. ⇒ -<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.1/PG-XC_DBT1_Tutorial_v0_9_1.pdf/download" target="_blank"> +<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.2/PG-XC_DBT1_Tutorial_v0_9_2.pdf/download" target="_blank"> (download) </a> </li> <!-- Install Manual --> <li> -<code>PG-XC_InstallManual_v0_9_1.pdf</code>:  +<code>PG-XC_InstallManual_v0_9_2.pdf</code>: <br> Step by step description how to build, install and configure Postgres-XC. ⇒ -<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.1/PG-XC_InstallManual_v0_9_1.pdf/download" target="_blank"> +<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.2/PG-XC_InstallManual_v0_9_2.pdf/download" target="_blank"> +(download) +</a> +</li> + +<!-- SQL limitation manual --> +<li> +<code>PG-XC_SQL_Limitations_v0_9_2.pdf</code>: <br> +SQL restrictions available for Postgres-XC 0.9.2. +⇒ +<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.2/PG-XC_SQL_Limitations_v0_9_2.pdf/download" target="_blank"> (download) </a> </li> <!-- Architecture Document --> <li> -<code>PG-XC_Architecture_v0_9.pdf</code>:  +<code>PG-XC_Architecture_v0_9.pdf</code>: <br> Description of the outline of Postgres-XC internals. 
⇒ -<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9/PG-XC_Architecture.pdf/download" target="_blank"> +<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9/PG-XC_Architecture.pdf/download" target="_blank"> (download) </a> </li> -</ul> +</ul> </p> <!--div align="left" style="font-size:95%;"--> @@ -166,4 +166,4 @@ Description of the outline of Postgres-XC internals. </div> </body> -</html> \ No newline at end of file +</html> diff --git a/prev_vers/version0_9.html b/prev_vers/version0_9.html index 83e008d..4ed4cf1 100644 --- a/prev_vers/version0_9.html +++ b/prev_vers/version0_9.html @@ -32,8 +32,9 @@ Version 0.9.0 <ul> <!-- tarball --> <li> -<code>pgxc_v0.9.tar.gz</code>:  -This is a collection of source materials used to build the binaries. +<code>pgxc_v0.9.tar.gz</code>: <br> +Previous version of Postgres-XC released in April 2010.<br> +This is a collection of source materials used to build the binaries.<br> Please note that Postgres-XC documentation is not included in this file. ⇒ <a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9/pgxc_v0.9.tar.gz/download" target="_blank"> @@ -41,11 +42,12 @@ Please note that Postgres-XC documentation is not included in this file. </a> </li> -<code>PGXC-PG_REL8_4_3.patch.gz</code>:  +<li> +<code>PGXC-PG_REL8_4_3.patch.gz</code>: <br> The same material as above, but this file includes only the patch to apply -to the PostgreSQL 8.4.3 release source code. +to the PostgreSQL 8.4.3 release source code.<br> It is useful if you would like to see just a difference between PostgreSQL -and Postgres-XC. +and Postgres-XC.<br> No Postgres-XC documentation is included in this file either. ⇒ <a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9/PGXC-PG_REL8_4_3.patch.gz/download" target="_blank"> @@ -55,7 +57,7 @@ No Postgres-XC documentation is included in this file either. <!-- License --> <li> -<code>COPYING</code>:  +<code>COPYING</code>: <br> License description. Postgres-XC is distributed under LGPL version 2.1 ⇒ <a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9/COPYING/download" target="_blank"> @@ -65,7 +67,7 @@ License description. Postgres-XC is distributed under LGPL version 2.1 <!-- Readme --> <li> -<code>README</code>:  +<code>README</code>: <br> Overview of the release. ⇒ <a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9/README/download" target="_blank"> @@ -75,7 +77,7 @@ Overview of the release. <!-- Reference Manual --> <li> -<code>ReferenceManual.pdf</code>:  +<code>ReferenceManual.pdf</code>: <br> Reference of Postgres-XC extension. ⇒ <a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9/ReferenceManual.pdf/download" target="_blank"> @@ -85,7 +87,7 @@ Reference of Postgres-XC extension. <!-- Tutorial Manual --> <li> -<code>PG-XC_TutorialManual.pdf</code>:  +<code>PG-XC_TutorialManual.pdf</code>: <br> Step by step description how to build and configure DBT-1 to run with Postgres-XC. ⇒ @@ -96,7 +98,7 @@ Postgres-XC. <!-- Install Manual --> <li> -<code>PG-XC_InstallManual_Revision1.pdf</code>:  +<code>PG-XC_InstallManual_Revision1.pdf</code>: <br> Step by step description how to build, install and configure Postgres-XC. 
⇒ <a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9/PG-XC_InstallManual_Revision1.pdf/download" target="_blank"> @@ -106,14 +108,133 @@ Step by step description how to build, install and configure Postgres-XC. <!-- Architecture Document --> <li> -<code>PG-XC_Architecture.pdf</code>:  +<code>PG-XC_Architecture.pdf</code>: <br> +Description of the outline of Postgres-XC internals. +⇒ +<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9/PG-XC_Architecture.pdf/download" target="_blank"> +(download) +</a> +</li> +</ul> +</p> + +<!-- Documents of version 0.9.1 --> +<h4> +Version 0.9.1 +</h4> + +<p> +<ul> +<!-- tarball --> +<li> +<code>pgxc_v0.9.1.tar.gz</code>: <br> +Previous version of Postgres-XC released in May 2010.<br> +Please note that Postgres-XC documentation is not included in this file. +⇒ +<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.1/pgxc_v0_9_1.tar.gz/download" target="_blank"> +(download) +</a> +</li> + +<!-- tarball (diff) --> +<li> +<code>PGXC_v0_9_1-PG_REL8_4_3.patch.gz</code>: <br> +The same material as above, but this file includes only the patch to apply +to the PostgreSQL 8.4.3 release source code.<br> +It is useful if you would like to see just a difference between PostgreSQL +and Postgres-XC.<br> +No Postgres-XC documentation is included in this file either. +⇒ +<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.1/PGXC_v0_9_1-PG_REL8_4_3.patch.gz/download" target="_blank"> +(download) +</a> +</li> + +<!-- License --> +<li> +<code>COPYING</code>: <br> +License description. Postgres-XC is distributed under LGPL version 2.1 +⇒ +<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.1/COPYING/download" target="_blank"> +(download) +</a> +</li> + +<!-- Files --> +<li> +<code>FILES</code>: <br> +Description of files included in Postgres-XC 0.9.1 release. +⇒ +<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.1/FILES/download" target="_blank"> +(download) +</a> +</li> + +<!-- Readme --> +<li> +<code>README</code>: <br> +Overview of the release. +⇒ +<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.1/README/download" target="_blank"> +(download) +</a> +</li> + +<!-- Reference Manual --> +<li> +<code>PG-XC_ReferenceManual_v0_9_1.pdf</code>: <br> +Reference of Postgres-XC extension. +⇒ +<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.1/PG-XC_ReferenceManual_v0_9_1.pdf/download" target="_blank"> +(download) +</a> +</li> + +<!-- pgbench Tutorial Manual --> +<li> +<code>PG-XC_pgbench_Tutorial_v0_9_1.pdf</code>: <br> +Step by step description how to build and configure pgbench to run with +Postgres-XC. +⇒ +<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.1/PG-XC_pgbench_Tutorial_v0_9_1.pdf/download" target="_blank"> +(download) +</a> +</li> + + +<!-- DBT-1 Tutorial Manual --> +<li> +<code>PG-XC_DBT1_Tutorial_v0_9_1.pdf</code>: <br> +Step by step description how to build and configure DBT-1 to run with +Postgres-XC. 
+⇒ +<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.1/PG-XC_DBT1_Tutorial_v0_9_1.pdf/download" target="_blank"> +(download) +</a> +</li> + +<!-- Install Manual --> +<li> +<code>PG-XC_InstallManual_v0_9_1.pdf</code>: <br> +Step by step description how to build, install and configure Postgres-XC. +⇒ +<a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.1/PG-XC_InstallManual_v0_9_1.pdf/download" target="_blank"> +(download) +</a> +</li> + +<!-- Architecture Document --> +<li> +<code>PG-XC_Architecture_v0_9.pdf</code>: <br> Description of the outline of Postgres-XC internals. ⇒ <a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9/PG-XC_Architecture.pdf/download" target="_blank"> (download) </a> </li> + </ul> </p> + </body> -</html> \ No newline at end of file +</html> diff --git a/roadmap.html b/roadmap.html index 8299fbd..65167e8 100755 --- a/roadmap.html +++ b/roadmap.html @@ -33,11 +33,22 @@ similar to PostgreSQL, except for two phase commit (2PC) and savepoints. (XC uses 2PC for internal use). </p> <p> -On the other hand, Postgres-XC needs to enhance support for general statements. -As of Version 0.9.1, Postgres-XC supports statements which can be executed -on a single data node, or on multiple nodes but as a single step. -It does not support yet complex statements such as -subquery, view, ORDER BY, DISTINCT or +On the other hand, Postgres-XC needs to enhance support for general statements.<br> +As of Version 0.9.2, Postgres-XC supports statements which can be executed +on a single data node, or on multiple nodes but as a single step.<br> +This new version adds support for: +- views<br> +- extra DDLs<br> +- ORDER BY/DISTINCT<br> +- pg_dump, pg_restore<br> +- sequence full support with GTM<br> +- basic stored function support.<br> +- Cold synchronization of Coordinator's Catalog files<br> +However there are some limitations please refer to <a href="https://sourceforge.net/projects/postgres-xc/files/Version_0.9.2/PG-XC_SQL_Limitations_v0_9_2.pdf/download" target="_blank"> +SQL Limitations </a> document for further details. +</p> +<p> +There is no support yet for <code>SELECT</code> in <code>FROM</code> clause. Support for <code>CURSOR</code> is a future issue too. 
</p> @@ -53,41 +64,27 @@ Current plan of future releases and features are as follows: <!-- ==== For version 1.0 ==== --> <h4> -Version 1.0 (Late in July, 2010) +Version 1.0 (Late in September, 2010) </h4> -<p class="inner"> -<code>ORDER BY</code><br> -<code>DISTINCT</code><br> -Stored functions<br> -subqueries<br> -Views<br> -Rules<br> -DDLs<br> -Regression tests<br> -<p> - -<!-- ==== For version 1.1 ==== --> -<h4> -Version 1.1 (Late in September, 2010) -</h4> - -<p class="inner"> +<p class="inner"> Cluster-wide installer<br> Cluster-wide operation utilities<br> Regression tests<br> Logical backup/restore (pg_dump, pg_restore)<br> Basic cross-node operation<br> TEMP Table<br> +Cursor support<br> Extended Query Protocol (for JDBC)<br> Global timestamp<br> Driver support (ECPG, JDBC, PHP, etc.)<br> Forward Cursor (w/o <code>ORDER BY</code>)<br> +subqueries<br> </p> -<!-- ==== Beyond Version 1.1 ==== --> +<!-- ==== Beyond Version 1.0 ==== --> <h4> -Beyond Version 1.1 +Beyond Version 1.0 </h4> <p class="inner"> ----------------------------------------------------------------------- Summary of changes: download.html | 78 ++++++++++++------------ events.html | 24 ++++--- members.html | 25 ++++---- prev_vers/version0_9.html | 145 +++++++++++++++++++++++++++++++++++++++++---- roadmap.html | 47 +++++++-------- 5 files changed, 221 insertions(+), 98 deletions(-) hooks/post-receive -- website |
From: Michael P <mic...@us...> - 2010-07-22 00:17:49
|
Project "Postgres-XC". The tag, v0.9.2 has been created at d7ca431066efe320107581186ab853b28fa5f7a7 (commit) - Log ----------------------------------------------------------------- commit d7ca431066efe320107581186ab853b28fa5f7a7 Author: Michael P <mic...@us...> Date: Thu Jul 22 08:59:07 2010 +0900 Support for cold synchronization of catalog table of coordinator. This cold solution is temporary. Hot synchronization will be introduced in one of Postgres-XC's next release. Cold synchronization method means that once a DDL is launched, all the coordinators are stopped. And then the catalog copy begins from a coordinator chosen by the user. It is also possible to synchronize catalogs without launching a DDL file on one coordinator. Options possible to use for this script -D locate the data folder, necessary to find pgxc.conf, containing the characteristics of all the coordinators -l to locate the folder where applications are -f for a DDL file -d for a Database name -n coordinator number where to launch DDl, number based on the one written in pgxc.conf -t base name of folder where to save configuration files, by default /tmp/pgxc_config, completed by $$ Synchronization uses a new configuration file called pgxc.conf gathering all the coordinator data, such as port number, data folder and host for each one. Please refer to Postgres-XC 0.9.2 reference manual for further details. ----------------------------------------------------------------------- hooks/post-receive -- Postgres-XC |
From: Michael P <mic...@us...> - 2010-07-22 00:17:21
|
Project "Postgres-XC". The branch, REL0_9_2_STABLE has been created at d7ca431066efe320107581186ab853b28fa5f7a7 (commit) - Log ----------------------------------------------------------------- ----------------------------------------------------------------------- hooks/post-receive -- Postgres-XC |
From: Michael P <mic...@us...> - 2010-07-22 00:07:57
|
Project "Postgres-XC". The branch, master has been updated via d7ca431066efe320107581186ab853b28fa5f7a7 (commit) from 0fdcc0b44b395df2e546ba90feaa0d656ad58f4d (commit) - Log ----------------------------------------------------------------- commit d7ca431066efe320107581186ab853b28fa5f7a7 Author: Michael P <mic...@us...> Date: Thu Jul 22 08:59:07 2010 +0900 Support for cold synchronization of catalog table of coordinator. This cold solution is temporary. Hot synchronization will be introduced in one of Postgres-XC's next release. Cold synchronization method means that once a DDL is launched, all the coordinators are stopped. And then the catalog copy begins from a coordinator chosen by the user. It is also possible to synchronize catalogs without launching a DDL file on one coordinator. Options possible to use for this script -D locate the data folder, necessary to find pgxc.conf, containing the characteristics of all the coordinators -l to locate the folder where applications are -f for a DDL file -d for a Database name -n coordinator number where to launch DDl, number based on the one written in pgxc.conf -t base name of folder where to save configuration files, by default /tmp/pgxc_config, completed by $$ Synchronization uses a new configuration file called pgxc.conf gathering all the coordinator data, such as port number, data folder and host for each one. Please refer to Postgres-XC 0.9.2 reference manual for further details. diff --git a/src/backend/Makefile b/src/backend/Makefile index df707e7..984c951 100644 --- a/src/backend/Makefile +++ b/src/backend/Makefile @@ -195,6 +195,7 @@ endif $(INSTALL_DATA) $(srcdir)/libpq/pg_hba.conf.sample '$(DESTDIR)$(datadir)/pg_hba.conf.sample' $(INSTALL_DATA) $(srcdir)/libpq/pg_ident.conf.sample '$(DESTDIR)$(datadir)/pg_ident.conf.sample' $(INSTALL_DATA) $(srcdir)/utils/misc/postgresql.conf.sample '$(DESTDIR)$(datadir)/postgresql.conf.sample' + $(INSTALL_DATA) $(srcdir)/utils/misc/pgxc.conf.sample '$(DESTDIR)$(datadir)/pgxc.conf.sample' $(INSTALL_DATA) $(srcdir)/access/transam/recovery.conf.sample '$(DESTDIR)$(datadir)/recovery.conf.sample' install-bin: postgres $(POSTGRES_IMP) installdirs @@ -248,8 +249,9 @@ endif $(MAKE) -C catalog uninstall-data $(MAKE) -C tsearch uninstall-data rm -f '$(DESTDIR)$(datadir)/pg_hba.conf.sample' \ + '$(DESTDIR)$(datadir)/pgxc.conf.sample' \ '$(DESTDIR)$(datadir)/pg_ident.conf.sample' \ - '$(DESTDIR)$(datadir)/postgresql.conf.sample' \ + '$(DESTDIR)$(datadir)/postgresql.conf.sample' \ '$(DESTDIR)$(datadir)/recovery.conf.sample' diff --git a/src/backend/utils/misc/pgxc.conf.sample b/src/backend/utils/misc/pgxc.conf.sample new file mode 100644 index 0000000..9dcc0c7 --- /dev/null +++ b/src/backend/utils/misc/pgxc.conf.sample @@ -0,0 +1,20 @@ +# ----------------------------- +# Postgres-XC configuration file +# ----------------------------- +# +# This file consists of lines of the form: +# +# name = value +# +# It describes the list of coordinators used in the cluster + +#------------------------------------------------------------------------------ +# POSTGRES-XC COORDINATORS +#------------------------------------------------------------------------------ + +#coordinator_hosts = 'localhost' # Host names or addresses of data nodes + # (change requires restart) +#coordinator_ports = '5451,5452' # Port numbers of coordinators + # (change requires restart) +#coordinator_folders = '/pgxc/data' # List of Data folders of coordinators + # (change require restart) \ No newline at end of file diff --git a/src/bin/initdb/initdb.c 
b/src/bin/initdb/initdb.c index 2d0b244..b4dd50b 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -100,6 +100,9 @@ static char *shdesc_file; static char *hba_file; static char *ident_file; static char *conf_file; +#ifdef PGXC +static char *pgxc_conf_file; +#endif static char *conversion_file; static char *dictionary_file; static char *info_schema_file; @@ -1296,6 +1299,19 @@ setup_config(void) free(conflines); +#ifdef PGXC + /* pgxc.conf */ + + conflines = readfile(pgxc_conf_file); + + snprintf(path, sizeof(path), "%s/pgxc.conf", pg_data); + + writefile(path, conflines); + chmod(path, 0600); + + free(conflines); +#endif + check_ok(); } @@ -2810,6 +2826,9 @@ main(int argc, char *argv[]) set_input(&hba_file, "pg_hba.conf.sample"); set_input(&ident_file, "pg_ident.conf.sample"); set_input(&conf_file, "postgresql.conf.sample"); +#ifdef PGXC + set_input(&pgxc_conf_file, "pgxc.conf.sample"); +#endif set_input(&conversion_file, "conversion_create.sql"); set_input(&dictionary_file, "snowball_create.sql"); set_input(&info_schema_file, "information_schema.sql"); @@ -2826,12 +2845,18 @@ main(int argc, char *argv[]) "POSTGRES_SUPERUSERNAME=%s\nPOSTGRES_BKI=%s\n" "POSTGRES_DESCR=%s\nPOSTGRES_SHDESCR=%s\n" "POSTGRESQL_CONF_SAMPLE=%s\n" +#ifdef PGXC + "PGXC_CONF_SAMPLE=%s\n" +#endif "PG_HBA_SAMPLE=%s\nPG_IDENT_SAMPLE=%s\n", PG_VERSION, pg_data, share_path, bin_path, username, bki_file, desc_file, shdesc_file, conf_file, +#ifdef PGXC + pgxc_conf_file, +#endif hba_file, ident_file); if (show_setting) exit(0); @@ -2842,6 +2867,9 @@ main(int argc, char *argv[]) check_input(shdesc_file); check_input(hba_file); check_input(ident_file); +#ifdef PGXC + check_input(pgxc_conf_file); +#endif check_input(conf_file); check_input(conversion_file); check_input(dictionary_file); diff --git a/src/bin/scripts/Makefile b/src/bin/scripts/Makefile index c28a066..48f9c20 100644 --- a/src/bin/scripts/Makefile +++ b/src/bin/scripts/Makefile @@ -52,6 +52,8 @@ install: all installdirs $(INSTALL_PROGRAM) clusterdb$(X) '$(DESTDIR)$(bindir)'/clusterdb$(X) $(INSTALL_PROGRAM) vacuumdb$(X) '$(DESTDIR)$(bindir)'/vacuumdb$(X) $(INSTALL_PROGRAM) reindexdb$(X) '$(DESTDIR)$(bindir)'/reindexdb$(X) + $(INSTALL_PROGRAM) pgxc_ddl$(X) '$(DESTDIR)$(bindir)'/pgxc_ddl$(X) + chmod 555 '$(DESTDIR)$(bindir)'/pgxc_ddl$(X) installdirs: $(mkinstalldirs) '$(DESTDIR)$(bindir)' diff --git a/src/bin/scripts/pgxc_ddl b/src/bin/scripts/pgxc_ddl new file mode 100644 index 0000000..efc2f69 --- /dev/null +++ b/src/bin/scripts/pgxc_ddl @@ -0,0 +1,443 @@ +#!/bin/bash +# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + +#Scripts to launch DDL in PGXC cluster using a cold_backup method +#Be sure to have set a correct ssl environment in all the servers of the cluster + +#This script uses pgxc.conf as a base to find the settings of all the coordinators + +#Options possible to use for this script +# -D to locate the data folder, necessary to find pgxc.conf, containing the characteristics of all the coordinators +# -l to locate the folder where applications are +# -f for a DDL file +# -d for a Database name +# -n coordinator number where to launch DDl, number based on the one written in pgxc.conf +# -t base name of folder where to save configuration files, by default /tmp/pgxc_config, completed by $$ + +count=0 + +#Default options +#local folder used to save temporary the configuration files of coordinator's data folder being erased +CONFIG_FOLDER=/tmp/pgxc_config_files.$$ +PGXC_BASE= +#options to launch the coordinator +#don't forget 
to add -i as we are in a cluster :) +COORD_OPTIONS="-C -i" + +#----------------------------------------------------------------------- +# Option Management +#----------------------------------------------------------------------- +while getopts 'f:d:D:l:hn:t:' OPTION +do + count=$((count +2)) + case $OPTION in + d) #for a database name + DB_NAME="$OPTARG" + ;; + + D) #for a data folder, to find pgxc.conf + DATA_FOLDER="$OPTARG" + ;; + + f) #for a DDL file + DDL_FILE_NAME="$OPTARG" + ;; + + l) #To define folder where applications are if necessary + PGXC_BASE="$OPTARG"/ + ;; + + n) #for a coordinator number + COORD_NUM_ORIGIN="$OPTARG" + ;; + + h) printf "Usage: %s: [-d dbname] [-l bin folder] [-D data folder] [-n coord number] [-f ddl file] [-t save folder name in /tmp/]\n" $(basename $0) >&2 + exit 0 + ;; + t) #to set the name of the folder where to save conf files. All is mandatory saved in /tmp + CONFIG_FOLDER=/tmp/"$OPTARG" + ;; + + ?) printf "Usage: %s: [-d dbname] [-l bin folder] [-D data folder] [-n coord number] [-f ddl file] [-t save folder name in /tmp/]\n" $(basename $0) >&2 + exit 0 + ;; + esac +done + +if [ $# -lt "1" ] +then + echo "No arguments defined, you should try help -h" + exit 2 +fi + +#A couple of option checks +if [ "$count" -ne "$#" ] +then + echo "Arguments not correctly set, try -h for help" + exit 2 +fi + +if [ -z $COORD_NUM_ORIGIN ] +then + echo "Coordinator number not defined, mandatory -n argument missing" + exit 2 +fi +if [ -z $DATA_FOLDER ] +then + echo "Data folder not defined, mandatory -D argument missing" + exit 2 +fi + +#Check if Argument of -n is an integer +if [ ! $(echo "$COORD_NUM_ORIGIN" | grep -E "^[0-9]+$") ] + then + echo "Argument -n is not a valid integer" + exit 2 +fi + +#Check if DDL file exists +if [ "$DDL_FILE_NAME" != "" ] +then + if [ ! -e $DDL_FILE_NAME ] + then + echo "DDL file not defined" + exit 2 + fi + if [ -z $DB_NAME ] + then + echo "Dbname not defined, mandatory -d argument missing when using a ddl file" + exit 2 + fi +fi + +#----------------------------------------------------------------------- +# Begin to read the pgxc.conf to get coordinator characteristics +#----------------------------------------------------------------------- +PGXC_CONF=$DATA_FOLDER/pgxc.conf + +if [ ! 
-e $PGXC_CONF ] +then + echo "pgxc.conf not defined in the directory defined by -D" + exit 2 +fi + +#Find parameters +hosts=`cat $PGXC_CONF | grep coordinator_hosts | cut -d "'" -f 2` +ports=`cat $PGXC_CONF | grep coordinator_ports | cut -d "'" -f 2` +folders=`cat $PGXC_CONF | grep coordinator_folders | cut -d "'" -f 2` +if [ "hosts" = "" ] +then + echo "coordinator_hosts not defined in pgxc.conf" + exit 2 +fi +if [ "ports" = "" ] +then + echo "coordinator_ports not defined in pgxc.conf" + exit 2 +fi +if [ "folders" = "" ] +then + echo "coordinator_folders not defined in pgxc.conf" + exit 2 +fi + +#Check if the strings are using commas as separators +hosts_sep="${hosts//[^,]/}" +ports_sep="${ports//[^,]/}" +folders_sep="${folders//[^,]/}" +if [ "$hosts_sep" = "" ] +then + echo "coordinator_hosts should use commas as a separator" + exit 2 +fi +if [ "$ports_sep" = "" ] +then + echo "coordinator_ports should use commas as a separator" + exit 2 +fi +if [ "$folders_sep" = "" ] +then + echo "coordinator_folders should use commas as a separator" + exit 2 +fi + + +#----------------------------------------------------------------------- +# Fill in Arrays that are used for the process from pgxc configuration file +#----------------------------------------------------------------------- + +count=1 +#Coordinator list +host_local=`echo $hosts | cut -d "," -f $count` +while [ "$host_local" != "" ] +do + COORD_HOSTNAMES[$((count -1))]=`echo $host_local` + count=$((count +1)) + host_local=`echo $hosts | cut -d "," -f $count` +done +COORD_COUNT=${#COORD_HOSTNAMES[*]} + +#Port list corresponding to the coordinators +#If all the coordinators use the same port on different servers, +#it is possible to define that with a unique element array. +count=1 +port_local=`echo $ports | cut -d "," -f $count` +while [ "$port_local" != "" ] +do + COORD_PORTS[$((count -1))]=$port_local + count=$((count +1)) + port_local=`echo $ports | cut -d "," -f $count` +done +COORD_PORTS_COUNT=${#COORD_PORTS[*]} + +#Data folder list corresponding to the coordinators +#If all the coordinators use the same data folder name on different servers, +#it is possible to define that with a unique element array. +count=1 +folder_local=`echo $folders | cut -d "," -f $count` + +while [ "$folder_local" != "" ] +do + COORD_PGDATA[$((count -1))]=$folder_local + count=$((count +1)) + folder_local=`echo $folders | cut -d "," -f $count` +done +COORD_PGDATA_COUNT=${#COORD_PGDATA[*]} + + +#----------------------------------------------------------------------- +# Start DDL process +#----------------------------------------------------------------------- + +#It is supposed that the same bin folders are used among the servers +#to call postgres processes +#This can be customized by the user with option -l +COORD_SERVER_PROCESS=postgres +PGCTL_SERVER_PROCESS=pg_ctl +PSQL_CLIENT_PROCESS=psql + +COORD_SERVER=$PGXC_BASE$COORD_SERVER_PROCESS +PGCTL_SERVER=$PGXC_BASE$PGCTL_SERVER_PROCESS +PSQL_CLIENT=$PGXC_BASE$PSQL_CLIENT_PROCESS + +#reajust coord number with index number +COORD_NUM_ORIGIN=$((COORD_NUM_ORIGIN -1)) + +#check data validity +#Note: Add other checks here + +if [ $COORD_COUNT -eq "1" ] +then + echo "Are you sure you want to use this utility with one only coordinator??" 
+ exit 2 +fi + +if [ $COORD_PGDATA_COUNT -ne $COORD_COUNT ] +then + echo "Number of pgdata folders must be the same as coordinator server number" + exit 2 +fi + +if [ $COORD_PORTS_COUNT -ne $COORD_COUNT ] +then + echo "Number of coordinator ports defined must be the same as coordinator server number" + exit 2 +fi + +#Check if coordinator number is not outbounds +if [ $COORD_NUM_ORIGIN -gt $((COORD_COUNT -1)) ] +then + echo "coordinator number is out of bounds" + exit 2 +fi +COORD_ORIG_INDEX=$COORD_NUM_ORIGIN + +#Check if the data folders are defined +for index in ${!COORD_HOSTNAMES[*]} +do + targethost=${COORD_HOSTNAMES[$index]} + targetdata=${COORD_PGDATA[$index]} + if [[ `ssh $targethost test -d $targetdata && echo exists` ]] + then + echo "defined directory exists for "$targethost + else + echo "defined directory does not exist for "$targethost + exit 2 + fi +done + +#Origin Coordinator Index has been found? +if [ -z $COORD_ORIG_INDEX ] +then + echo "origin coordinator is not in the coordinator list" + exit 2 +fi + +#Main process begins + +#Check if the database is defined, This could lead to coordinator being stopped uselessly +if [ $DB_NAME != "" ] +then + #Simply launch a fake SQL on the Database wanted + $PSQL_CLIENT -h ${COORD_HOSTNAMES[$COORD_ORIG_INDEX]} -p ${COORD_PORTS[$COORD_ORIG_INDEX]} -c 'select now()' -d $DB_NAME; err=$? + if [ $err -gt "0" ] + then + echo "Database not defined" + exit 2 + fi +fi + +#1) stop all the coordinators +echo "Stopping all the coordinators" +for index in ${!COORD_HOSTNAMES[*]} +do + targethost=${COORD_HOSTNAMES[$index]} + targetdata=${COORD_PGDATA[$index]} + echo ssh $targethost $PGCTL_SERVER stop -D $targetdata + ssh $targethost $PGCTL_SERVER stop -D $targetdata; err=$? + if [ $err -gt "0" ] + then + "pg_ctl couldn't stop server" + exit 2 + fi +done + +#If a DDL file is not set by the user, just synchronize the catalogs with the catalog of the chosen coordinator +if [ "$DDL_FILE_NAME" != "" ] +then + echo "-f activated, DDL being launched" + + #2) restart the one we want to launch DDL to... + echo ssh ${COORD_HOSTNAMES[$COORD_ORIG_INDEX]} $COORD_SERVER $COORD_OPTIONS -p ${COORD_PORTS[$COORD_ORIG_INDEX]} -D ${COORD_PGDATA[$COORD_ORIG_INDEX]} + ssh ${COORD_HOSTNAMES[$COORD_ORIG_INDEX]} $COORD_SERVER $COORD_OPTIONS -p ${COORD_PORTS[$COORD_ORIG_INDEX]} -D ${COORD_PGDATA[$COORD_ORIG_INDEX]} & + + #wait a little bit to be sure it switched on + sleep 3 + + #3) launch the DDL + #This has to be done depending on if the user has defined a file or a command + echo $PSQL_CLIENT -h ${COORD_HOSTNAMES[$COORD_ORIG_INDEX]} -p ${COORD_PORTS[$COORD_ORIG_INDEX]} -f $DDL_FILE_NAME -d $DB_NAME + $PSQL_CLIENT -h ${COORD_HOSTNAMES[$COORD_ORIG_INDEX]} -p ${COORD_PORTS[$COORD_ORIG_INDEX]} -f $DDL_FILE_NAME -d $DB_NAME; err=$? + if [ $err -gt "0" ] + then + echo "psql error, is Database defined?" + exit 2 + fi + + #4) Stop again the origin coordinator as we cannot copy the lock files to other coordinators + echo ssh ${COORD_HOSTNAMES[$COORD_ORIG_INDEX]} $PGCTL_SERVER stop -D ${COORD_PGDATA[$COORD_ORIG_INDEX]} + ssh ${COORD_HOSTNAMES[$COORD_ORIG_INDEX]} $PGCTL_SERVER stop -D ${COORD_PGDATA[$COORD_ORIG_INDEX]}; err=$? 
+ if [ $err -gt "0" ] + then + "pg_ctl couldn't stop server" + exit 2 + fi +fi + +#5) before copying the catalogs, save the configuration files or they are erased by the catalog copy +#make a copy of them in a folder in /tmp/pgxc_conf (default folder) +if [ -d $CONFIG_FOLDER ] +then + rm -rf $CONFIG_FOLDER +fi +mkdir $CONFIG_FOLDER + +for index in ${!COORD_HOSTNAMES[*]} +do + if [ $index -ne $COORD_ORIG_INDEX ] + then + targethost=${COORD_HOSTNAMES[$index]} + targetdata=${COORD_PGDATA[$index]} + echo scp -pr $targethost:$targetdata/postgresql.conf $CONFIG_FOLDER/postgresql.conf.$index + echo scp -pr $targethost:$targetdata/pg_hba.conf $CONFIG_FOLDER/pg_hba.conf.$index + scp -pr $targethost:$targetdata/postgresql.conf $CONFIG_FOLDER/postgresql.conf.$index; err=$? + if [ $err -gt "0" ] + then + echo "deleting saved configuration files" + rm -rf $CONFIG_FOLDER + echo "scp failed with "$targethost + exit 2 + fi + scp -pr $targethost:$targetdata/pg_hba.conf $CONFIG_FOLDER/pg_hba.conf.$index; err=$? + if [ $err -gt "0" ] + then + echo "deleting saved configuration files" + rm -rf $CONFIG_FOLDER + echo "scp failed with "$targethost + exit 2 + fi + fi +done + +#6) copy catalog files to all coordinators but not to the origin one +for index in ${!COORD_HOSTNAMES[*]} +do + if [ $index -ne $COORD_ORIG_INDEX ] + then + srchost=${COORD_HOSTNAMES[$COORD_ORIG_INDEX]} + srcdata=${COORD_PGDATA[$COORD_ORIG_INDEX]} + targethost=${COORD_HOSTNAMES[$index]} + targetdata=${COORD_PGDATA[$index]} + #First erase the data to have a nice cleanup + echo ssh $targethost rm -rf $targetdata + ssh $targethost rm -rf $targetdata + + #Just to be sure that catalog files of origin coordinator are copied well + echo scp -pr $srchost:$srcdata $targethost:$targetdata + scp -pr $srchost:$srcdata $targethost:$targetdata; err=$? + if [ $err -gt "0" ] + then + echo "deleting saved configuration files" + rm -rf $CONFIG_FOLDER + echo "scp failed with "$targethost + exit 2 + fi + fi +done + +#7) copy back the configuration files to the corresponding fresh folders +#but not the configuration files of the origin coordinator +for index in ${!COORD_HOSTNAMES[*]} +do + if [ $index -ne $COORD_ORIG_INDEX ] + then + targethost=${COORD_HOSTNAMES[$index]} + targetdata=${COORD_PGDATA[$index]} + echo scp -pr $CONFIG_FOLDER/postgresql.conf.$index $targethost:$targetdata/postgresql.conf + echo scp -pr $CONFIG_FOLDER/pg_hba.conf.$index $targethost:$targetdata/pg_hba.conf + scp -pr $CONFIG_FOLDER/postgresql.conf.$index $targethost:$targetdata/postgresql.conf; err=$? + if [ $err -gt "0" ] + then + echo "deleting saved configuration files" + rm -rf $CONFIG_FOLDER + echo "scp failed with "$targethost + exit 2 + fi + scp -pr $CONFIG_FOLDER/pg_hba.conf.$index $targethost:$targetdata/pg_hba.conf; err=$? + if [ $err -gt "0" ] + then + echo "deleting saved configuration files" + rm -rf $CONFIG_FOLDER + echo "scp failed with "$targethost + exit 2 + fi + fi +done + +#8) wait a little bit... 
+sleep 1 + +#9) restart all the other coordinators, origin coordinator has been stopped after DDL run +for index in ${!COORD_HOSTNAMES[*]} +do + echo ssh ${COORD_HOSTNAMES[$index]} $COORD_SERVER $COORD_OPTIONS -p ${COORD_PORTS[$index]} -D ${COORD_PGDATA[$index]} & + ssh ${COORD_HOSTNAMES[$index]} $COORD_SERVER $COORD_OPTIONS -p ${COORD_PORTS[$index]} -D ${COORD_PGDATA[$index]} & +done + +sleep 2 + +#Clean also the folder in tmp keeping the configuration files +rm -rf $CONFIG_FOLDER + +#10) finished :p +exit \ No newline at end of file ----------------------------------------------------------------------- Summary of changes: src/backend/Makefile | 4 +- src/backend/utils/misc/pgxc.conf.sample | 20 ++ src/bin/initdb/initdb.c | 28 ++ src/bin/scripts/Makefile | 2 + src/bin/scripts/pgxc_ddl | 443 +++++++++++++++++++++++++++++++ 5 files changed, 496 insertions(+), 1 deletions(-) create mode 100644 src/backend/utils/misc/pgxc.conf.sample create mode 100644 src/bin/scripts/pgxc_ddl hooks/post-receive -- Postgres-XC |
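[Editorial example] The script's handling of pgxc.conf is worth isolating: each coordinator_* parameter holds a single-quoted, comma-separated list, which pgxc_ddl extracts with grep/cut and splits into a bash array. Below is a condensed sketch of that parsing technique, assuming a pgxc.conf in the current directory containing, for example, coordinator_hosts = 'node1,node2,node3' (a list with commas, as the script's own checks require):

    #!/bin/bash
    # Pull the quoted value out of pgxc.conf, as pgxc_ddl does.
    hosts=`cat pgxc.conf | grep coordinator_hosts | cut -d "'" -f 2`

    # Split the comma-separated list into an array, one cut per element;
    # cut returns an empty string past the last field, ending the loop.
    count=1
    host_local=`echo $hosts | cut -d "," -f $count`
    while [ "$host_local" != "" ]
    do
        COORD_HOSTNAMES[$((count - 1))]=$host_local
        count=$((count + 1))
        host_local=`echo $hosts | cut -d "," -f $count`
    done

    echo "found ${#COORD_HOSTNAMES[*]} coordinators: ${COORD_HOSTNAMES[*]}"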
From: Michael P <mic...@us...> - 2010-07-13 01:23:31
|
Project "Postgres-XC". The branch, master has been updated via 0fdcc0b44b395df2e546ba90feaa0d656ad58f4d (commit) from d1efc186e0272a095ae14f4230b8da9ba49a24b7 (commit) - Log ----------------------------------------------------------------- commit 0fdcc0b44b395df2e546ba90feaa0d656ad58f4d Author: Michael P <mic...@us...> Date: Tue Jul 13 10:04:01 2010 +0900 Support for RENAME/DROP SCHEMA with sequences Since commit for the support of ALTER SEQUENCE, sequences use a global name in GTM based on: db_name.schema_name.sequence_name This commit permits to rename sequences on GTM if their schema's name is modified. This patch permits also to drop a sequence on GTM in the case that its schema is being dropped in cascade. diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index 8090e2f..af57e68 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -53,6 +53,10 @@ #include "catalog/pg_user_mapping.h" #ifdef PGXC #include "catalog/pgxc_class.h" +#include "pgxc/pgxc.h" +#include "commands/sequence.h" +#include "gtm/gtm_c.h" +#include "access/gtm.h" #endif #include "commands/comment.h" #include "commands/dbcommands.h" @@ -338,6 +342,89 @@ performMultipleDeletions(const ObjectAddresses *objects, heap_close(depRel, RowExclusiveLock); } +#ifdef PGXC +/* + * Check type and class of the given object and rename it properly on GTM + */ +static void +doRename(const ObjectAddress *object, const char *oldname, const char *newname) +{ + switch (getObjectClass(object)) + { + case OCLASS_CLASS: + { + char relKind = get_rel_relkind(object->objectId); + + /* + * If we are here, a schema is being renamed, a sequence depends on it. + * as sequences' global name use the schema name, this sequence + * has also to be renamed on GTM. + */ + if (relKind == RELKIND_SEQUENCE && IS_PGXC_COORDINATOR) + { + Relation relseq = relation_open(object->objectId, AccessShareLock); + char *seqname = GetGlobalSeqName(relseq, NULL, oldname); + char *newseqname = GetGlobalSeqName(relseq, NULL, newname); + + /* We also need to rename this sequence on GTM, it has a global name ! */ + if (RenameSequenceGTM(seqname, newseqname) < 0) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("GTM error, could not rename sequence"))); + + pfree(seqname); + pfree(newseqname); + + relation_close(relseq, AccessShareLock); + } + } + default: + /* Nothing to do, this object has not to be renamed, end of the story... */ + break; + } +} + +/* + * performRename: used to rename objects + * on GTM depending on another object(s) + */ +void +performRename(const ObjectAddress *object, const char *oldname, const char *newname) +{ + Relation depRel; + ObjectAddresses *targetObjects; + int i; + + /* + * Check the dependencies on this object + * And rename object dependent if necessary + */ + + depRel = heap_open(DependRelationId, RowExclusiveLock); + + targetObjects = new_object_addresses(); + + findDependentObjects(object, + DEPFLAG_ORIGINAL, + NULL, /* empty stack */ + targetObjects, + NULL, + depRel); + + /* Check Objects one by one to see if some of them have to be renamed on GTM */ + for (i = 0; i < targetObjects->numrefs; i++) + { + ObjectAddress *thisobj = targetObjects->refs + i; + doRename(thisobj, oldname, newname); + } + + /* And clean up */ + free_object_addresses(targetObjects); + + heap_close(depRel, RowExclusiveLock); +} +#endif + /* * deleteWhatDependsOn: attempt to drop everything that depends on the * specified object, though not the object itself. 
Behavior is always @@ -1047,6 +1134,33 @@ doDeletion(const ObjectAddress *object) else heap_drop_with_catalog(object->objectId); } + +#ifdef PGXC + /* Drop the sequence on GTM */ + if (relKind == RELKIND_SEQUENCE && IS_PGXC_COORDINATOR) + { + /* + * The sequence has already been removed from coordinator, + * finish the stuff on GTM too + */ + /* PGXCTODO: allow the ability to rollback or abort dropping sequences. */ + + Relation relseq; + char *seqname; + /* + * A relation is opened to get the schema and database name as + * such data is not available before when dropping a function. + */ + relseq = relation_open(object->objectId, AccessShareLock); + seqname = GetGlobalSeqName(relseq, NULL, NULL); + + DropSequenceGTM(seqname); + pfree(seqname); + + /* Then close the relation opened previously */ + relation_close(relseq, AccessShareLock); + } +#endif /* PGXC */ break; } diff --git a/src/backend/commands/schemacmds.c b/src/backend/commands/schemacmds.c index 0d047cf..5704c99 100644 --- a/src/backend/commands/schemacmds.c +++ b/src/backend/commands/schemacmds.c @@ -31,6 +31,9 @@ #include "utils/lsyscache.h" #include "utils/syscache.h" +#ifdef PGXC +#include "pgxc/pgxc.h" +#endif static void AlterSchemaOwner_internal(HeapTuple tup, Relation rel, Oid newOwnerId); @@ -298,6 +301,26 @@ RenameSchema(const char *oldname, const char *newname) simple_heap_update(rel, &tup->t_self, tup); CatalogUpdateIndexes(rel, tup); +#ifdef PGXC + if (IS_PGXC_COORDINATOR) + { + ObjectAddress object; + Oid namespaceId; + + /* Check object dependency and see if there is a sequence. If yes rename it */ + namespaceId = GetSysCacheOid(NAMESPACENAME, + CStringGetDatum(oldname), + 0, 0, 0); + /* Create the object that will be checked for the dependencies */ + object.classId = NamespaceRelationId; + object.objectId = namespaceId; + object.objectSubId = 0; + + /* Rename all the objects depending on this schema */ + performRename(&object, oldname, newname); + } +#endif + heap_close(rel, NoLock); heap_freetuple(tup); } diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index ba30206..83ddbab 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -352,7 +352,7 @@ DefineSequence(CreateSeqStmt *seq) #ifdef PGXC /* PGXC_COORD */ if (IS_PGXC_COORDINATOR) { - char *seqname = GetGlobalSeqName(rel, NULL); + char *seqname = GetGlobalSeqName(rel, NULL, NULL); /* We also need to create it on the GTM */ if (CreateSequenceGTM(seqname, @@ -494,7 +494,7 @@ AlterSequenceInternal(Oid relid, List *options) #ifdef PGXC if (IS_PGXC_COORDINATOR) { - char *seqname = GetGlobalSeqName(seqrel, NULL); + char *seqname = GetGlobalSeqName(seqrel, NULL, NULL); /* We also need to create it on the GTM */ if (AlterSequenceGTM(seqname, @@ -587,7 +587,7 @@ nextval_internal(Oid relid) #ifdef PGXC /* PGXC_COORD */ if (IS_PGXC_COORDINATOR) { - char *seqname = GetGlobalSeqName(seqrel, NULL); + char *seqname = GetGlobalSeqName(seqrel, NULL, NULL); /* * Above, we still use the page as a locking mechanism to handle @@ -785,7 +785,7 @@ currval_oid(PG_FUNCTION_ARGS) #ifdef PGXC if (IS_PGXC_COORDINATOR) { - char *seqname = GetGlobalSeqName(seqrel, NULL); + char *seqname = GetGlobalSeqName(seqrel, NULL, NULL); result = (int64) GetCurrentValGTM(seqname); if (result < 0) @@ -911,7 +911,7 @@ do_setval(Oid relid, int64 next, bool iscalled) #ifdef PGXC if (IS_PGXC_COORDINATOR) { - char *seqname = GetGlobalSeqName(seqrel, NULL); + char *seqname = GetGlobalSeqName(seqrel, NULL, NULL); if (SetValGTM(seqname, next, 
iscalled) < 0) ereport(ERROR, @@ -1423,20 +1423,24 @@ init_params(List *options, bool isInit, */ char * -GetGlobalSeqName(Relation seqrel, const char *new_seqname) +GetGlobalSeqName(Relation seqrel, const char *new_seqname, const char *new_schemaname) { char *seqname, *dbname, *schemaname, *relname; int charlen; /* Get all the necessary relation names */ dbname = get_database_name(seqrel->rd_node.dbNode); - schemaname = get_namespace_name(RelationGetNamespace(seqrel)); if (new_seqname) - relname = new_seqname; + relname = (char *) new_seqname; else relname = RelationGetRelationName(seqrel); + if (new_schemaname) + schemaname = (char *) new_schemaname; + else + schemaname = get_namespace_name(RelationGetNamespace(seqrel)); + /* Calculate the global name size including the dots and \0 */ charlen = strlen(dbname) + strlen(schemaname) + strlen(relname) + 3; seqname = (char *) palloc(charlen); diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 33782c4..22fd416 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -768,29 +768,6 @@ RemoveRelations(DropStmt *drop) add_exact_object_address(&obj, objects); -#ifdef PGXC /* PGXC_COORD */ - /* PGXCTODO: allow the ability to rollback dropping sequences. */ - - /* Drop the sequence */ - if (IS_PGXC_COORDINATOR && classform->relkind == RELKIND_SEQUENCE) - { - Relation relseq; - char *seqname; - - /* - * A relation is opened to get the schema and database name as - * such data is not available before when dropping a function. - */ - relseq = relation_open(obj.objectId, AccessShareLock); - seqname = GetGlobalSeqName(relseq, NULL); - - DropSequenceGTM(seqname); - pfree(seqname); - - /* Then close the relation opened previously */ - relation_close(relseq, AccessShareLock); - } -#endif ReleaseSysCache(tuple); } @@ -2120,14 +2097,17 @@ RenameRelation(Oid myrelid, const char *newrelname, ObjectType reltype) if (IS_PGXC_COORDINATOR && (reltype == OBJECT_SEQUENCE || relkind == RELKIND_SEQUENCE)) /* It is possible to rename a sequence with ALTER TABLE */ { - char *seqname = GetGlobalSeqName(targetrelation, NULL); - char *newseqname = GetGlobalSeqName(targetrelation, newrelname); + char *seqname = GetGlobalSeqName(targetrelation, NULL, NULL); + char *newseqname = GetGlobalSeqName(targetrelation, newrelname, NULL); /* We also need to rename it on the GTM */ if (RenameSequenceGTM(seqname, newseqname) < 0) ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), errmsg("GTM error, could not rename sequence"))); + + pfree(seqname); + pfree(newseqname); } #endif diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 6965e2e..7dd2a7e 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -601,7 +601,6 @@ ProcessUtility(Node *parsetree, { uint64 processed; #ifdef PGXC - bool done; processed = DoCopy((CopyStmt *) parsetree, queryString, true); #else processed = DoCopy((CopyStmt *) parsetree, queryString): diff --git a/src/include/catalog/dependency.h b/src/include/catalog/dependency.h index a4049c3..74c6d15 100644 --- a/src/include/catalog/dependency.h +++ b/src/include/catalog/dependency.h @@ -162,6 +162,12 @@ extern void performDeletion(const ObjectAddress *object, extern void performMultipleDeletions(const ObjectAddresses *objects, DropBehavior behavior); +#ifdef PGXC +extern void performRename(const ObjectAddress *object, + const char *oldname, + const char *newname); +#endif + extern void deleteWhatDependsOn(const ObjectAddress *object, bool showNotices); diff 
--git a/src/include/commands/sequence.h b/src/include/commands/sequence.h index f54f74f..adb70ec 100644 --- a/src/include/commands/sequence.h +++ b/src/include/commands/sequence.h @@ -103,7 +103,7 @@ extern void seq_redo(XLogRecPtr lsn, XLogRecord *rptr); extern void seq_desc(StringInfo buf, uint8 xl_info, char *rec); #ifdef PGXC -extern char *GetGlobalSeqName(Relation rel, const char *new_seqname); +extern char *GetGlobalSeqName(Relation rel, const char *new_seqname, const char *new_schemaname); #endif #endif /* SEQUENCE_H */ ----------------------------------------------------------------------- Summary of changes: src/backend/catalog/dependency.c | 114 +++++++++++++++++++++++++++++++++++++ src/backend/commands/schemacmds.c | 23 ++++++++ src/backend/commands/sequence.c | 20 ++++--- src/backend/commands/tablecmds.c | 30 ++-------- src/backend/tcop/utility.c | 1 - src/include/catalog/dependency.h | 6 ++ src/include/commands/sequence.h | 2 +- 7 files changed, 161 insertions(+), 35 deletions(-) hooks/post-receive -- Postgres-XC |
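The commit above keys everything on the global sequence name, so it is worth seeing how small that scheme really is. Below is a minimal standalone sketch of the db_name.schema_name.sequence_name construction; the function and variable names are invented for illustration, and the real GetGlobalSeqName() works on an open Relation and allocates with palloc rather than malloc.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Build "dbname.schemaname.seqname", the global name used on GTM */
    static char *
    build_global_seq_name(const char *dbname, const char *schemaname,
                          const char *seqname)
    {
        /* room for the two dots and the terminating \0 */
        size_t len = strlen(dbname) + strlen(schemaname) + strlen(seqname) + 3;
        char  *result = malloc(len);

        if (result != NULL)
            snprintf(result, len, "%s.%s.%s", dbname, schemaname, seqname);
        return result;
    }

    int
    main(void)
    {
        char *name = build_global_seq_name("testdb", "public", "serial1");

        printf("%s\n", name);   /* prints: testdb.public.serial1 */
        free(name);
        return 0;
    }

With this scheme, renaming a schema reduces to computing the old and new global names and issuing one rename per dependent sequence on GTM, which is what doRename() does in the patch.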
From: Pavan D. <pa...@us...> - 2010-07-12 07:45:51
|
Project "Postgres-XC". The branch, master has been updated via d1efc186e0272a095ae14f4230b8da9ba49a24b7 (commit) from 8ce8906c2d45e0aa1164c9beaedb2637853a2e03 (commit) - Log ----------------------------------------------------------------- commit d1efc186e0272a095ae14f4230b8da9ba49a24b7 Author: Pavan Deolasee <pav...@gm...> Date: Mon Jul 12 13:12:57 2010 +0530 Handling ALTER SEQUENCE at the GTM proxy as well. Michael Paquier. diff --git a/src/gtm/proxy/proxy_main.c b/src/gtm/proxy/proxy_main.c index 75c7baf..f5f6e65 100644 --- a/src/gtm/proxy/proxy_main.c +++ b/src/gtm/proxy/proxy_main.c @@ -86,7 +86,7 @@ static void ProcessTransactionCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn, GTM_MessageType mtype, StringInfo message); static void ProcessSnapshotCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn, GTM_MessageType mtype, StringInfo message); -static void ProcessSeqeunceCommand(GTMProxy_ConnectionInfo *conninfo, +static void ProcessSequenceCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn, GTM_MessageType mtype, StringInfo message); static void GTMProxy_RegisterCoordinator(GTMProxy_ConnectionInfo *conninfo, @@ -579,7 +579,6 @@ GTMProxy_ThreadMain(void *argp) char gtm_connect_string[1024]; elog(DEBUG3, "Starting the connection helper thread"); - /* * Create the memory context we will use in the main loop. @@ -595,7 +594,7 @@ GTMProxy_ThreadMain(void *argp) ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE, false); - + /* * Set up connection with the GTM server */ @@ -808,7 +807,7 @@ GTMProxy_ThreadMain(void *argp) ProcessCommand(thrinfo->thr_conn, thrinfo->thr_gtm_conn, &input_message); break; - + case 'X': case EOF: /* @@ -917,7 +916,7 @@ GTMProxyAddConnection(Port *port) { ereport(ERROR, (ENOMEM, - errmsg("Out of memory"))); + errmsg("Out of memory"))); return STATUS_ERROR; } @@ -942,31 +941,35 @@ ProcessCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn, switch (mtype) { - case MSG_UNREGISTER_COORD: + case MSG_UNREGISTER_COORD: ProcessCoordinatorCommand(conninfo, gtm_conn, mtype, input_message); break; - case MSG_TXN_BEGIN: - case MSG_TXN_BEGIN_GETGXID: + case MSG_TXN_BEGIN: + case MSG_TXN_BEGIN_GETGXID: case MSG_TXN_BEGIN_GETGXID_AUTOVACUUM: - case MSG_TXN_PREPARE: - case MSG_TXN_COMMIT: - case MSG_TXN_ROLLBACK: + case MSG_TXN_PREPARE: + case MSG_TXN_COMMIT: + case MSG_TXN_ROLLBACK: case MSG_TXN_GET_GXID: ProcessTransactionCommand(conninfo, gtm_conn, mtype, input_message); break; - case MSG_SNAPSHOT_GET: + case MSG_SNAPSHOT_GET: case MSG_SNAPSHOT_GXID_GET: ProcessSnapshotCommand(conninfo, gtm_conn, mtype, input_message); break; - case MSG_SEQUENCE_INIT: + case MSG_SEQUENCE_INIT: case MSG_SEQUENCE_GET_CURRENT: case MSG_SEQUENCE_GET_NEXT: + case MSG_SEQUENCE_GET_LAST: + case MSG_SEQUENCE_SET_VAL: case MSG_SEQUENCE_RESET: case MSG_SEQUENCE_CLOSE: - ProcessSeqeunceCommand(conninfo, gtm_conn, mtype, input_message); + case MSG_SEQUENCE_RENAME: + case MSG_SEQUENCE_ALTER: + ProcessSequenceCommand(conninfo, gtm_conn, mtype, input_message); break; default: @@ -1104,16 +1107,20 @@ ProcessResponse(GTMProxy_ThreadInfo *thrinfo, GTMProxy_CommandInfo *cmdinfo, cmdinfo->ci_conn->con_pending_msg = MSG_TYPE_INVALID; break; - case MSG_TXN_BEGIN: + case MSG_TXN_BEGIN: case MSG_TXN_BEGIN_GETGXID_AUTOVACUUM: - case MSG_TXN_PREPARE: + case MSG_TXN_PREPARE: case MSG_TXN_GET_GXID: case MSG_SNAPSHOT_GXID_GET: - case MSG_SEQUENCE_INIT: + case MSG_SEQUENCE_INIT: case MSG_SEQUENCE_GET_CURRENT: case MSG_SEQUENCE_GET_NEXT: + case MSG_SEQUENCE_GET_LAST: + case 
MSG_SEQUENCE_SET_VAL: case MSG_SEQUENCE_RESET: case MSG_SEQUENCE_CLOSE: + case MSG_SEQUENCE_RENAME: + case MSG_SEQUENCE_ALTER: if ((res->gr_proxyhdr.ph_conid == InvalidGTMProxyConnID) || (res->gr_proxyhdr.ph_conid >= GTM_PROXY_MAX_CONNECTIONS) || (thrinfo->thr_all_conns[res->gr_proxyhdr.ph_conid] != cmdinfo->ci_conn)) @@ -1251,13 +1258,13 @@ ProcessTransactionCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn, switch (mtype) { - case MSG_TXN_BEGIN_GETGXID: + case MSG_TXN_BEGIN_GETGXID: cmd_data.cd_beg.iso_level = pq_getmsgint(message, sizeof (GTM_IsolationLevel)); cmd_data.cd_beg.rdonly = pq_getmsgbyte(message); - GTMProxy_CommandPending(conninfo, mtype, cmd_data); + GTMProxy_CommandPending(conninfo, mtype, cmd_data); break; - case MSG_TXN_COMMIT: + case MSG_TXN_COMMIT: case MSG_TXN_ROLLBACK: cmd_data.cd_rc.isgxid = pq_getmsgbyte(message); if (cmd_data.cd_rc.isgxid) @@ -1281,7 +1288,7 @@ ProcessTransactionCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn, memcpy(&cmd_data.cd_rc.handle, data, sizeof (GTM_TransactionHandle)); } pq_getmsgend(message); - GTMProxy_CommandPending(conninfo, mtype, cmd_data); + GTMProxy_CommandPending(conninfo, mtype, cmd_data); break; case MSG_TXN_BEGIN: @@ -1291,7 +1298,7 @@ ProcessTransactionCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn, case MSG_TXN_BEGIN_GETGXID_AUTOVACUUM: case MSG_TXN_PREPARE: - GTMProxy_ProxyCommand(conninfo, gtm_conn, mtype, message); + GTMProxy_ProxyCommand(conninfo, gtm_conn, mtype, message); break; default: @@ -1336,7 +1343,7 @@ ProcessSnapshotCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn, memcpy(&cmd_data.cd_snap.handle, data, sizeof (GTM_TransactionHandle)); } pq_getmsgend(message); - GTMProxy_CommandPending(conninfo, mtype, cmd_data); + GTMProxy_CommandPending(conninfo, mtype, cmd_data); } break; @@ -1351,7 +1358,7 @@ ProcessSnapshotCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn, } static void -ProcessSeqeunceCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn, +ProcessSequenceCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn, GTM_MessageType mtype, StringInfo message) { /* ----------------------------------------------------------------------- Summary of changes: src/gtm/proxy/proxy_main.c | 55 ++++++++++++++++++++++++------------------- 1 files changed, 31 insertions(+), 24 deletions(-) hooks/post-receive -- Postgres-XC |
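The proxy change above is mostly a dispatch-table extension: a new GTM message type must be recognized both where client requests are routed (ProcessCommand) and where server responses are matched back to connections (ProcessResponse), otherwise the proxy rejects it as unexpected. A minimal sketch of that pattern follows; the enum values are invented stand-ins for the real GTM_MessageType members.

    #include <stdio.h>

    typedef enum
    {
        EX_MSG_SEQUENCE_GET_NEXT,
        EX_MSG_SEQUENCE_RENAME,   /* newly routed by the patch */
        EX_MSG_SEQUENCE_ALTER,    /* newly routed by the patch */
        EX_MSG_UNKNOWN
    } ExampleMessageType;

    static void
    route_sequence_command(ExampleMessageType mtype)
    {
        switch (mtype)
        {
            case EX_MSG_SEQUENCE_GET_NEXT:
            case EX_MSG_SEQUENCE_RENAME:
            case EX_MSG_SEQUENCE_ALTER:
                printf("forwarding sequence command %d to GTM\n", mtype);
                break;

            default:
                /* an unlisted type falls through to an error, which is
                 * exactly what happened to RENAME/ALTER before the patch */
                printf("unexpected message type %d\n", mtype);
                break;
        }
    }

    int
    main(void)
    {
        route_sequence_command(EX_MSG_SEQUENCE_RENAME);
        return 0;
    }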
From: mason_s <ma...@us...> - 2010-07-07 13:34:41
|
Project "Postgres-XC". The branch, master has been updated via 8ce8906c2d45e0aa1164c9beaedb2637853a2e03 (commit) from 5800b1b7b84dac3759f25a4a37afcb2ed26a1a63 (commit) - Log ----------------------------------------------------------------- commit 8ce8906c2d45e0aa1164c9beaedb2637853a2e03 Author: Mason S <masonsharp@mason-sharps-macbook.local> Date: Wed Jul 7 15:33:45 2010 +0200 Fix a crash that may occur within the pooler when a data node crashes. Written by Andrei Matsinchyk diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 79106b5..6427da3 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -4,15 +4,15 @@ * * Connection pool manager handles connections to DataNodes * - * The pooler runs as a separate process and is forked off from a - * coordinator postmaster. If the coordinator needs a connection from a + * The pooler runs as a separate process and is forked off from a + * coordinator postmaster. If the coordinator needs a connection from a * data node, it asks for one from the pooler, which maintains separate * pools for each data node. A group of connections can be requested in - * a single request, and the pooler returns a list of file descriptors + * a single request, and the pooler returns a list of file descriptors * to use for the connections. * * Note the current implementation does not yet shrink the pool over time - * as connections are idle. Also, it does not queue requests; if a + * as connections are idle. Also, it does not queue requests; if a * connection is unavailable, it will simply fail. This should be implemented * one day, although there is a chance for deadlocks. For now, limiting * connections should be done between the application and coordinator. @@ -113,8 +113,8 @@ extern int pqReadReady(PGconn *conn); static volatile sig_atomic_t shutdown_requested = false; -/* - * Initialize internal structures +/* + * Initialize internal structures */ int PoolManagerInit() @@ -433,8 +433,8 @@ PoolManagerInit() } -/* - * Destroy internal structures +/* + * Destroy internal structures */ int PoolManagerDestroy(void) @@ -575,8 +575,8 @@ PoolManagerConnect(PoolHandle *handle, const char *database) } -/* - * Init PoolAgent +/* + * Init PoolAgent */ static void agent_init(PoolAgent *agent, const char *database, List *nodes) @@ -598,8 +598,8 @@ agent_init(PoolAgent *agent, const char *database, List *nodes) } -/* - * Destroy PoolAgent +/* + * Destroy PoolAgent */ static void agent_destroy(PoolAgent *agent) @@ -636,8 +636,8 @@ agent_destroy(PoolAgent *agent) } -/* - * Release handle to pool manager +/* + * Release handle to pool manager */ void PoolManagerDisconnect(PoolHandle *handle) @@ -653,8 +653,8 @@ PoolManagerDisconnect(PoolHandle *handle) } -/* - * Get pooled connections +/* + * Get pooled connections */ int * PoolManagerGetConnections(List *nodelist) @@ -759,7 +759,7 @@ agent_handle_input(PoolAgent * agent, StringInfo s) } -/* +/* * acquire connection */ static int * @@ -827,8 +827,8 @@ agent_acquire_connections(PoolAgent *agent, List *nodelist) } -/* - * Retun connections back to the pool +/* + * Retun connections back to the pool */ void PoolManagerReleaseConnections(void) @@ -972,8 +972,8 @@ destroy_database_pool(const char *database) } -/* - * Insert new database pool to the list +/* + * Insert new database pool to the list */ static void insert_database_pool(DatabasePool *databasePool) @@ -991,8 +991,8 @@ insert_database_pool(DatabasePool *databasePool) } -/* - * Find pool for specified database in the 
list +/* + * Find pool for specified database in the list */ static DatabasePool * @@ -1015,8 +1015,8 @@ find_database_pool(const char *database) } -/* - * Remove pool for specified database from the list +/* + * Remove pool for specified database from the list */ static DatabasePool * remove_database_pool(const char *database) @@ -1075,41 +1075,40 @@ acquire_connection(DatabasePool *dbPool, int node) } /* Check available connections */ - if (nodePool && nodePool->freeSize > 0) + while (nodePool && nodePool->freeSize > 0) { int poll_result; - while (nodePool->freeSize > 0) - { - slot = nodePool->slot[--(nodePool->freeSize)]; + slot = nodePool->slot[--(nodePool->freeSize)]; retry: - /* Make sure connection is ok */ - poll_result = pqReadReady(slot->conn); - - if (poll_result == 0) - break; /* ok, no data */ - else if (poll_result < 0) - { - if (errno == EAGAIN || errno == EINTR) - goto retry; + /* Make sure connection is ok */ + poll_result = pqReadReady(slot->conn); - elog(WARNING, "Error in checking connection, errno = %d", errno); - } - else - elog(WARNING, "Unexpected data on connection, cleaning."); + if (poll_result == 0) + break; /* ok, no data */ + else if (poll_result < 0) + { + if (errno == EAGAIN || errno == EINTR) + goto retry; - destroy_slot(slot); - /* Decrement current max pool size */ - (nodePool->size)--; - /* Ensure we are not below minimum size */ - grow_pool(dbPool, node - 1); + elog(WARNING, "Error in checking connection, errno = %d", errno); } + else + elog(WARNING, "Unexpected data on connection, cleaning."); + + destroy_slot(slot); + slot = NULL; + + /* Decrement current max pool size */ + (nodePool->size)--; + /* Ensure we are not below minimum size */ + grow_pool(dbPool, node - 1); } - else - ereport(LOG, - (errcode(ERRCODE_INSUFFICIENT_RESOURCES), - errmsg("connection pool is empty"))); + + if (slot == NULL) + elog(WARNING, "can not connect to data node %d", node); + return slot; } ----------------------------------------------------------------------- Summary of changes: src/backend/pgxc/pool/poolmgr.c | 101 +++++++++++++++++++-------------------- 1 files changed, 50 insertions(+), 51 deletions(-) hooks/post-receive -- Postgres-XC |
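The heart of the pooler fix is turning a take-one-slot branch into a validate-or-discard loop: each candidate connection popped from the free list is polled, dead ones are destroyed (and the pool regrown), and the first clean one is returned. Below is a simplified standalone sketch of that loop; the pool and slot types are invented in place of DatabasePool and the real slot structure, and a boolean stands in for the pqReadReady() poll.

    #include <stdio.h>
    #include <stdbool.h>

    #define POOL_SIZE 8

    typedef struct
    {
        int  fd;
        bool healthy;             /* stand-in for a pqReadReady() check */
    } ExampleSlot;

    typedef struct
    {
        ExampleSlot *slot[POOL_SIZE];
        int          freeSize;
    } ExamplePool;

    static ExampleSlot *
    acquire_connection(ExamplePool *pool)
    {
        ExampleSlot *slot = NULL;

        /* Loop instead of taking a single slot: a dead connection is
         * discarded and the next candidate tried, rather than failing
         * (or crashing) the whole request. */
        while (pool->freeSize > 0)
        {
            slot = pool->slot[--pool->freeSize];

            if (slot->healthy)
                break;            /* ok, no unexpected data pending */

            fprintf(stderr, "discarding bad connection fd=%d\n", slot->fd);
            slot = NULL;          /* destroy_slot() in the real code */
        }

        if (slot == NULL)
            fprintf(stderr, "can not connect to data node\n");
        return slot;
    }

    int
    main(void)
    {
        ExampleSlot a = {1, false}, b = {2, true};
        ExamplePool pool = {{&a, &b}, 2};   /* b is popped first */

        ExampleSlot *s = acquire_connection(&pool);
        if (s)
            printf("acquired fd=%d\n", s->fd);
        return 0;
    }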
From: mason_s <ma...@us...> - 2010-07-07 13:33:07
|
Project "Postgres-XC". The branch, master has been updated via 5800b1b7b84dac3759f25a4a37afcb2ed26a1a63 (commit) via 73249fbb42f4c05de85181428a7ae143c0c8d254 (commit) from 47f4b06f6e25426bb775d7a7372309b68c7e1f47 (commit) - Log ----------------------------------------------------------------- commit 5800b1b7b84dac3759f25a4a37afcb2ed26a1a63 Author: Mason S <masonsharp@mason-sharps-macbook.local> Date: Wed Jul 7 15:31:15 2010 +0200 In Postgres-XC, the error stack may overflow because AbortTransaction may be called multiple times, each time calling DataNodeRollback, which may fail again if a data node is down. Instead, if we are already in an abort state, we do not bother repeating abort actions. diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 757f99d..491d0d5 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -2697,6 +2697,20 @@ AbortCurrentTransaction(void) } } +#ifdef PGXC +/* + * AbortCurrentTransactionOnce + * + * Abort transaction, but only if we have not already. + */ +void +AbortCurrentTransactionOnce(void) +{ + if (CurrentTransactionState->state != TRANS_ABORT) + AbortCurrentTransaction(); +} +#endif + /* * PreventTransactionChain * diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 4cb0b27..553a682 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3768,7 +3768,15 @@ PostgresMain(int argc, char *argv[], const char *username) /* * Abort the current transaction in order to recover. */ +#ifdef PGXC + /* + * Temporarily do not abort if we are already in an abort state. + * This change tries to handle the case where the error data stack fills up. + */ + AbortCurrentTransactionOnce(); +#else AbortCurrentTransaction(); +#endif /* * Now return to normal top-level context and clear ErrorContext for diff --git a/src/include/access/xact.h b/src/include/access/xact.h index fe69611..5bd157b 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -163,6 +163,9 @@ extern void CommandCounterIncrement(void); extern void ForceSyncCommit(void); extern void StartTransactionCommand(void); extern void CommitTransactionCommand(void); +#ifdef PGXC +extern void AbortCurrentTransactionOnce(void); +#endif extern void AbortCurrentTransaction(void); extern void BeginTransactionBlock(void); extern bool EndTransactionBlock(void); commit 73249fbb42f4c05de85181428a7ae143c0c8d254 Author: Mason S <masonsharp@mason-sharps-macbook.local> Date: Wed Jul 7 15:30:16 2010 +0200 Changed some error messages so that they will not be duplicates to better pinpoint some issues. 
diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 1ee1d59..c6f9042 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -475,7 +475,7 @@ HandleCopyOutComplete(RemoteQueryState *combiner) /* Inconsistent responses */ ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("Unexpected response from the data nodes"))); + errmsg("Unexpected response from the data nodes for 'c' message, current request type %d", combiner->request_type))); /* Just do nothing, close message is managed by the coordinator */ combiner->copy_out_count++; } @@ -559,7 +559,7 @@ HandleRowDescription(RemoteQueryState *combiner, char *msg_body, size_t len) /* Inconsistent responses */ ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("Unexpected response from the data nodes"))); + errmsg("Unexpected response from the data nodes for 'T' message, current request type %d", combiner->request_type))); } /* Increment counter and check if it was first */ if (combiner->description_count++ == 0) @@ -583,7 +583,7 @@ HandleParameterStatus(RemoteQueryState *combiner, char *msg_body, size_t len) /* Inconsistent responses */ ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("Unexpected response from the data nodes"))); + errmsg("Unexpected response from the data nodes for 'S' message, current request type %d", combiner->request_type))); } /* Proxy last */ if (++combiner->description_count == combiner->node_count) @@ -605,7 +605,7 @@ HandleCopyIn(RemoteQueryState *combiner) /* Inconsistent responses */ ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("Unexpected response from the data nodes"))); + errmsg("Unexpected response from the data nodes for 'G' message, current request type %d", combiner->request_type))); } /* * The normal PG code will output an G message when it runs in the @@ -627,7 +627,7 @@ HandleCopyOut(RemoteQueryState *combiner) /* Inconsistent responses */ ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("Unexpected response from the data nodes"))); + errmsg("Unexpected response from the data nodes for 'H' message, current request type %d", combiner->request_type))); } /* * The normal PG code will output an H message when it runs in the @@ -649,7 +649,7 @@ HandleCopyDataRow(RemoteQueryState *combiner, char *msg_body, size_t len) if (combiner->request_type != REQUEST_TYPE_COPY_OUT) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("Unexpected response from the data nodes"))); + errmsg("Unexpected response from the data nodes for 'd' message, current request type %d", combiner->request_type))); /* If there is a copy file, data has to be sent to the local file */ if (combiner->copy_file) @@ -675,7 +675,7 @@ HandleDataRow(RemoteQueryState *combiner, char *msg_body, size_t len) /* Inconsistent responses */ ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("Unexpected response from the data nodes"))); + errmsg("Unexpected response from the data nodes for 'D' message, current request type %d", combiner->request_type))); } /* @@ -943,7 +943,8 @@ data_node_receive_responses(const int conn_count, DataNodeHandle ** connections, data_node_receive(count, to_receive, timeout); while (i < count) { - switch (handle_response(to_receive[i], combiner)) + int result = handle_response(to_receive[i], combiner); + switch (result) { case RESPONSE_EOF: /* have something to read, keep receiving */ i++; @@ -960,7 +961,7 @@ data_node_receive_responses(const int conn_count, DataNodeHandle ** connections, /* Inconsistent 
responses */ ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected response from the data nodes"))); + errmsg("Unexpected response from the data nodes, result = %d, request type %d", result, combiner->request_type))); } } } @@ -1679,7 +1680,7 @@ DataNodeCopyOut(Exec_Nodes *exec_nodes, DataNodeHandle** copy_connections, FILE* pfree(copy_connections); ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("Unexpected response from the data nodes"))); + errmsg("Unexpected response from the data nodes when combining, request type %d", combiner->request_type))); } return processed; ----------------------------------------------------------------------- Summary of changes: src/backend/access/transam/xact.c | 14 ++++++++++++++ src/backend/pgxc/pool/execRemote.c | 21 +++++++++++---------- src/backend/tcop/postgres.c | 8 ++++++++ src/include/access/xact.h | 3 +++ 4 files changed, 36 insertions(+), 10 deletions(-) hooks/post-receive -- Postgres-XC |
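The guard added by the first commit makes top-level abort idempotent: the (possibly failing) rollback path runs only when the transaction is not already in the abort state, so repeated error recovery cannot keep pushing onto the error stack. A toy model of that state check, with simplified names in place of the real TRANS_* states and data node rollback:

    #include <stdio.h>

    typedef enum { EX_TRANS_INPROGRESS, EX_TRANS_ABORT } ExampleTransState;

    static ExampleTransState current_state = EX_TRANS_INPROGRESS;

    static void
    abort_current_transaction(void)
    {
        current_state = EX_TRANS_ABORT;
        /* this is the step that can fail again if a data node is down */
        printf("rolling back on data nodes\n");
    }

    static void
    abort_current_transaction_once(void)
    {
        /* Skip the rollback if we are already in the abort state */
        if (current_state != EX_TRANS_ABORT)
            abort_current_transaction();
    }

    int
    main(void)
    {
        abort_current_transaction_once();   /* rolls back */
        abort_current_transaction_once();   /* no-op: already aborted */
        return 0;
    }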
From: andrei_mart <and...@us...> - 2010-07-05 06:12:31
|
Project "Postgres-XC". The branch, master has been updated via 47f4b06f6e25426bb775d7a7372309b68c7e1f47 (commit) from c61f6b7e606131d3963ed83bcfa40c000d2e0aab (commit) - Log ----------------------------------------------------------------- commit 47f4b06f6e25426bb775d7a7372309b68c7e1f47 Author: Andrei Martsinchyk <And...@en...> Date: Mon Jul 5 09:08:20 2010 +0300 Fixed a bug when searching terminating semicolon. Initial position was at \0, which is not considered as a witespace. Start from the character immediately before \0. diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index 461f96a..002e710 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -1675,11 +1675,11 @@ reconstruct_step_query(List *rtable, bool has_order_by, List *extra_sort, /* the same offset in the original string */ int offset = sql_from - sql; /* - * Remove terminating semicolon to be able to append extra - * order by entries. If query is submitted from client other than psql - * the terminator may not present. + * Truncate query at the position of terminating semicolon to be able + * to append extra order by entries. If query is submitted from client + * other than psql the terminator may not present. */ - char *end = step->sql_statement + strlen(step->sql_statement); + char *end = step->sql_statement + strlen(step->sql_statement) - 1; while(isspace((unsigned char) *end) && end > step->sql_statement) end--; if (*end == ';') ----------------------------------------------------------------------- Summary of changes: src/backend/pgxc/plan/planner.c | 8 ++++---- 1 files changed, 4 insertions(+), 4 deletions(-) hooks/post-receive -- Postgres-XC |
From: andrei_mart <and...@us...> - 2010-07-02 16:03:41
|
Project "Postgres-XC". The branch, master has been updated via c61f6b7e606131d3963ed83bcfa40c000d2e0aab (commit) from 5d83e22e3cabc3d1e5dc425f492e4459b30a67a0 (commit) - Log ----------------------------------------------------------------- commit c61f6b7e606131d3963ed83bcfa40c000d2e0aab Author: Andrei Martsinchyk <And...@en...> Date: Fri Jul 2 18:51:15 2010 +0300 If expressions should be added to ORDER BY clause of the step query we search for terminating semicolon to determine position where expressions should be added. Added handling for the case if query is not terminated with a semicolon. Also, small optimization - use sorting on coordinator only if step is going to be executed on two or more nodes. diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index 2cf488c..461f96a 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -51,7 +51,7 @@ typedef struct long constant; /* assume long PGXCTODO - should be Datum */ } Literal_Comparison; -/* Parent-Child joins for relations being joined on +/* Parent-Child joins for relations being joined on * their respective hash distribuion columns */ typedef struct @@ -114,7 +114,7 @@ typedef struct ColumnBase * the rtable for the particular query. This way we can use * varlevelsup to resolve Vars in nested queries */ -typedef struct XCWalkerContext +typedef struct XCWalkerContext { Query *query; bool isRead; @@ -325,7 +325,7 @@ get_numeric_constant(Expr *expr) * This is required because a RangeTblEntry may actually be another * type, like a join, and we need to then look at the joinaliasvars * to determine what the base table and column really is. - * + * * rtables is a List of rtable Lists. */ static ColumnBase* @@ -338,8 +338,8 @@ get_base_var(Var *var, XCWalkerContext *context) if (!AttrNumberIsForUserDefinedAttr(var->varattno)) return NULL; - /* - * Get the RangeTableEntry + /* + * Get the RangeTableEntry * We take nested subqueries into account first, * we may need to look further up the query tree. * The most recent rtable is at the end of the list; top most one is first. @@ -514,8 +514,8 @@ examine_conditions_walker(Node *expr_node, XCWalkerContext *context) *rel_loc_info2; Const *constant; Expr *checkexpr; - bool result = false; - bool is_and = false; + bool result = false; + bool is_and = false; Assert(context); @@ -534,7 +534,7 @@ examine_conditions_walker(Node *expr_node, XCWalkerContext *context) /* If we get here, that meant the previous call before recursing down did not * find the condition safe yet. * Since we pass down our context, this is the bit of code that will detect - * that we are using more than one relation in a condition which has not + * that we are using more than one relation in a condition which has not * already been deemed safe. 
*/ Var *var_node = (Var *) expr_node; @@ -591,7 +591,7 @@ examine_conditions_walker(Node *expr_node, XCWalkerContext *context) } } - /* + /* * Look for equality conditions on partiioned columns, but only do so * if we are not in an OR or NOT expression */ @@ -743,7 +743,7 @@ examine_conditions_walker(Node *expr_node, XCWalkerContext *context) && IsHashColumn(rel_loc_info2, column_base2->colname)) { /* We found a partitioned join */ - Parent_Child_Join *parent_child = (Parent_Child_Join *) + Parent_Child_Join *parent_child = (Parent_Child_Join *) palloc0(sizeof(Parent_Child_Join)); parent_child->rel_loc_info1 = rel_loc_info1; @@ -762,7 +762,7 @@ examine_conditions_walker(Node *expr_node, XCWalkerContext *context) /* * At this point, there is some other type of join that * can probably not be executed on only a single node. - * Just return, as it may be updated later. + * Just return, as it may be updated later. * Important: We preserve previous * pgxc_join->join_type value, there may be multiple * columns joining two tables, and we want to make sure at @@ -787,7 +787,7 @@ examine_conditions_walker(Node *expr_node, XCWalkerContext *context) /* save parent-child count */ if (context->exec_nodes) - save_parent_child_count = list_length(context->conditions->partitioned_parent_child); + save_parent_child_count = list_length(context->conditions->partitioned_parent_child); context->exec_nodes = NULL; context->multilevel_join = false; @@ -824,14 +824,14 @@ examine_conditions_walker(Node *expr_node, XCWalkerContext *context) if (same_single_node (context->exec_nodes->nodelist, save_exec_nodes->nodelist)) return false; } - else + else /* use old value */ context->exec_nodes = save_exec_nodes; } - } else + } else { if (context->exec_nodes->tableusagetype == TABLE_USAGE_TYPE_USER_REPLICATED) - return false; + return false; /* See if subquery safely joins with parent */ if (!is_multilevel) return true; @@ -993,8 +993,8 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context) from_subquery_count++; - /* - * Recursively call for subqueries. + /* + * Recursively call for subqueries. * Note this also works for views, which are rewritten as subqueries. */ context->rtables = lappend(context->rtables, current_rtable); @@ -1012,7 +1012,7 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context) if (current_nodes) current_usage_type = current_nodes->tableusagetype; - else + else /* could be complicated */ return true; @@ -1088,7 +1088,7 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context) context->exec_nodes = (Exec_Nodes *) palloc0(sizeof(Exec_Nodes)); context->exec_nodes->tableusagetype = TABLE_USAGE_TYPE_PGCATALOG; return false; - } + } /* Examine the WHERE clause, too */ if (examine_conditions_walker(query->jointree->quals, context)) @@ -1129,18 +1129,18 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context) { rte = (RangeTblEntry *) lfirst(lc); - /* - * If the query is rewritten (which can be due to rules or views), - * ignore extra stuff. Also ignore subqueries we have processed + /* + * If the query is rewritten (which can be due to rules or views), + * ignore extra stuff. 
Also ignore subqueries we have processed */ if ((!rte->inFromCl && query->commandType == CMD_SELECT) || rte->rtekind != RTE_RELATION) continue; /* PGXCTODO - handle RTEs that are functions */ if (rtesave) - /* - * Too complicated, we have multiple relations that still - * cannot be joined safely + /* + * Too complicated, we have multiple relations that still + * cannot be joined safely */ return true; @@ -1209,7 +1209,7 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context) */ Parent_Child_Join *parent_child; - parent_child = (Parent_Child_Join *) + parent_child = (Parent_Child_Join *) linitial(context->conditions->partitioned_parent_child); context->exec_nodes = GetRelationNodes(parent_child->rel_loc_info1, NULL, context->isRead); @@ -1218,7 +1218,7 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context) if (from_query_nodes) { - if (!context->exec_nodes) + if (!context->exec_nodes) { context->exec_nodes = from_query_nodes; return false; @@ -1229,9 +1229,9 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context) else if (from_query_nodes->tableusagetype == TABLE_USAGE_TYPE_USER_REPLICATED || (same_single_node(from_query_nodes->nodelist, context->exec_nodes->nodelist))) return false; - else + else { - /* We allow views, where the (rewritten) subquery may be on all nodes, + /* We allow views, where the (rewritten) subquery may be on all nodes, * but the parent query applies a condition on the from subquery. */ if (list_length(query->jointree->fromlist) == from_subquery_count @@ -1674,9 +1674,16 @@ reconstruct_step_query(List *rtable, bool has_order_by, List *extra_sort, { /* the same offset in the original string */ int offset = sql_from - sql; - /* remove terminating semicolon */ - char *end = strrchr(step->sql_statement, ';'); - *end = '\0'; + /* + * Remove terminating semicolon to be able to append extra + * order by entries. If query is submitted from client other than psql + * the terminator may not present. + */ + char *end = step->sql_statement + strlen(step->sql_statement); + while(isspace((unsigned char) *end) && end > step->sql_statement) + end--; + if (*end == ';') + *end = '\0'; appendStringInfoString(buf, step->sql_statement + offset); } @@ -2069,7 +2076,9 @@ GetQueryPlan(Node *parsetree, const char *sql_statement, List *querytree_list) /* * Add sortring to the step */ - if (query->sortClause || query->distinctClause) + if (query_plan->exec_loc_type == EXEC_ON_DATA_NODES && + list_length(query_step->exec_nodes->nodelist) > 1 && + (query->sortClause || query->distinctClause)) make_simple_sort_from_sortclauses(query, query_step); /* ----------------------------------------------------------------------- Summary of changes: src/backend/pgxc/plan/planner.c | 75 ++++++++++++++++++++++----------------- 1 files changed, 42 insertions(+), 33 deletions(-) hooks/post-receive -- Postgres-XC |
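The "two or more nodes" condition added to GetQueryPlan above is worth spelling out: rows coming back from a single data node are already in the pushed-down ORDER BY order, so a coordinator-side merge sort only pays off when several sorted streams must be combined. A compact sketch of the predicate, with a simplified step struct invented in place of the real plan node:

    #include <stdbool.h>
    #include <stdio.h>

    typedef struct
    {
        int  node_count;        /* data nodes the step executes on */
        bool has_sort_clause;   /* query has ORDER BY or DISTINCT */
    } ExampleStep;

    /* Coordinator sort is needed only to merge multiple sorted streams */
    static bool
    needs_coordinator_sort(const ExampleStep *step)
    {
        return step->node_count > 1 && step->has_sort_clause;
    }

    int
    main(void)
    {
        ExampleStep one_node  = {1, true};
        ExampleStep two_nodes = {2, true};

        printf("one node:  %s\n",
               needs_coordinator_sort(&one_node) ? "sort" : "no sort");
        printf("two nodes: %s\n",
               needs_coordinator_sort(&two_nodes) ? "sort" : "no sort");
        return 0;
    }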
From: andrei_mart <and...@us...> - 2010-06-30 19:55:24
|
Project "Postgres-XC". The branch, master has been updated via 5d83e22e3cabc3d1e5dc425f492e4459b30a67a0 (commit) via a1b6404cb6c214e9df075e3d827e8384555c7b44 (commit) from 49e836ebf1c86211c342f320838611fc48e6fa1f (commit) - Log ----------------------------------------------------------------- commit 5d83e22e3cabc3d1e5dc425f492e4459b30a67a0 Author: Andrei Martsinchyk <And...@en...> Date: Wed Jun 30 13:21:11 2010 +0300 Use ereport instead of Assert if sort operation is not defined This error is likely to happen if expression of non-sortable data type is in distinct clause diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index 1bbbb75..2cf488c 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -1556,7 +1556,11 @@ add_sort_column(AttrNumber colIdx, Oid sortOp, bool nulls_first, { int i; - Assert(OidIsValid(sortOp)); + if (!OidIsValid(sortOp)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("could not identify an ordering operator"))); + for (i = 0; i < numCols; i++) { commit a1b6404cb6c214e9df075e3d827e8384555c7b44 Author: Andrei Martsinchyk <And...@en...> Date: Wed Jun 30 13:17:03 2010 +0300 Reverted PANIC ereports back to ERROR They were changed for debugging purposes and accidently committed diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c index 82a7cf8..ae4ed73 100644 --- a/src/backend/utils/mmgr/mcxt.c +++ b/src/backend/utils/mmgr/mcxt.c @@ -507,7 +507,7 @@ MemoryContextAlloc(MemoryContext context, Size size) AssertArg(MemoryContextIsValid(context)); if (!AllocSizeIsValid(size)) - elog(PANIC, "invalid memory alloc request size %lu", + elog(ERROR, "invalid memory alloc request size %lu", (unsigned long) size); return (*context->methods->alloc) (context, size); @@ -528,7 +528,7 @@ MemoryContextAllocZero(MemoryContext context, Size size) AssertArg(MemoryContextIsValid(context)); if (!AllocSizeIsValid(size)) - elog(PANIC, "invalid memory alloc request size %lu", + elog(ERROR, "invalid memory alloc request size %lu", (unsigned long) size); ret = (*context->methods->alloc) (context, size); @@ -553,7 +553,7 @@ MemoryContextAllocZeroAligned(MemoryContext context, Size size) AssertArg(MemoryContextIsValid(context)); if (!AllocSizeIsValid(size)) - elog(PANIC, "invalid memory alloc request size %lu", + elog(ERROR, "invalid memory alloc request size %lu", (unsigned long) size); ret = (*context->methods->alloc) (context, size); @@ -617,7 +617,7 @@ repalloc(void *pointer, Size size) AssertArg(MemoryContextIsValid(header->context)); if (!AllocSizeIsValid(size)) - elog(PANIC, "invalid memory alloc request size %lu", + elog(ERROR, "invalid memory alloc request size %lu", (unsigned long) size); return (*header->context->methods->realloc) (header->context, ----------------------------------------------------------------------- Summary of changes: src/backend/pgxc/plan/planner.c | 6 +++++- src/backend/utils/mmgr/mcxt.c | 8 ++++---- 2 files changed, 9 insertions(+), 5 deletions(-) hooks/post-receive -- Postgres-XC |
From: mason_s <ma...@us...> - 2010-06-29 19:39:21
|
Project "Postgres-XC". The branch, master has been updated via 49e836ebf1c86211c342f320838611fc48e6fa1f (commit) via 6899314e5a0aad2ded36959ca8bc6e3d7243a586 (commit) from 592295640039744c89a1f319d87fb34072a10efa (commit) - Log ----------------------------------------------------------------- commit 49e836ebf1c86211c342f320838611fc48e6fa1f Author: Mason S <masonsharp@mason-sharps-macbook.local> Date: Tue Jun 29 21:32:26 2010 +0200 Add support for ORDER BY adn DISTINCT. This is handled on the Coordinator. It will push down the ORDER BY and merge-sort the sorted input streams from the nodes. It converts from DataRow to tuple format as needed. If one of the SELECT clause expressions is not in the ORDER BY, it appends it to the ORDER BY when pushing it down to the data nodes and leaves it off when returning to the client. With DISTINCT, an ORDER BY will be used and pushed down to the data nodes such that a merge-sort can be done and de-duplication can occur. By Andrei Martsinchyk diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c index a86716e..eab1bd0 100644 --- a/src/backend/access/common/heaptuple.c +++ b/src/backend/access/common/heaptuple.c @@ -57,6 +57,9 @@ #include "postgres.h" +#ifdef PGXC +#include "funcapi.h" +#endif #include "access/heapam.h" #include "access/sysattr.h" #include "access/tuptoaster.h" @@ -1157,6 +1160,80 @@ slot_deform_tuple(TupleTableSlot *slot, int natts) slot->tts_slow = slow; } +#ifdef PGXC +/* + * slot_deform_datarow + * Extract data from the DataRow message into Datum/isnull arrays. + * We always extract all atributes, as specified in tts_tupleDescriptor, + * because there is no easy way to find random attribute in the DataRow. + */ +static void +slot_deform_datarow(TupleTableSlot *slot) +{ + int attnum = slot->tts_tupleDescriptor->natts; + int i; + int col_count; + char *cur = slot->tts_dataRow; + StringInfo buffer; + uint16 n16; + uint32 n32; + + /* fastpath: exit if values already extracted */ + if (slot->tts_nvalid == attnum) + return; + + Assert(slot->tts_dataRow); + + memcpy(&n16, cur, 2); + cur += 2; + col_count = ntohs(n16); + + if (col_count != attnum) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("Tuple does not match the descriptor"))); + + if (slot->tts_attinmeta == NULL) + slot->tts_attinmeta = TupleDescGetAttInMetadata(slot->tts_tupleDescriptor); + + buffer = makeStringInfo(); + for (i = 0; i < attnum; i++) + { + Form_pg_attribute attr = slot->tts_tupleDescriptor->attrs[i]; + int len; + + /* get size */ + memcpy(&n32, cur, 4); + cur += 4; + len = ntohl(n32); + + /* get data */ + if (len == -1) + { + slot->tts_values[i] = (Datum) 0; + slot->tts_isnull[i] = true; + } + else + { + appendBinaryStringInfo(buffer, cur, len); + cur += len; + + slot->tts_values[i] = InputFunctionCall(slot->tts_attinmeta->attinfuncs + i, + buffer->data, + slot->tts_attinmeta->attioparams[i], + slot->tts_attinmeta->atttypmods[i]); + slot->tts_isnull[i] = false; + + resetStringInfo(buffer); + } + } + pfree(buffer->data); + pfree(buffer); + + slot->tts_nvalid = attnum; +} +#endif + /* * slot_getattr * This function fetches an attribute of the slot's current tuple. @@ -1250,6 +1327,11 @@ slot_getattr(TupleTableSlot *slot, int attnum, bool *isnull) /* * Extract the attribute, along with any preceding attributes. 
*/ +#ifdef PGXC + if (slot->tts_dataRow) + slot_deform_datarow(slot); + else +#endif slot_deform_tuple(slot, attnum); /* @@ -1276,6 +1358,15 @@ slot_getallattrs(TupleTableSlot *slot) if (slot->tts_nvalid == tdesc_natts) return; +#ifdef PGXC + /* Handle the DataRow tuple case */ + if (slot->tts_dataRow) + { + slot_deform_datarow(slot); + return; + } +#endif + /* * otherwise we had better have a physical tuple (tts_nvalid should equal * natts in all virtual-tuple cases) @@ -1319,6 +1410,15 @@ slot_getsomeattrs(TupleTableSlot *slot, int attnum) if (slot->tts_nvalid >= attnum) return; +#ifdef PGXC + /* Handle the DataRow tuple case */ + if (slot->tts_dataRow) + { + slot_deform_datarow(slot); + return; + } +#endif + /* Check for caller error */ if (attnum <= 0 || attnum > slot->tts_tupleDescriptor->natts) elog(ERROR, "invalid attribute number %d", attnum); diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c index ee9c04a..99cd92e 100644 --- a/src/backend/access/common/printtup.c +++ b/src/backend/access/common/printtup.c @@ -292,6 +292,19 @@ printtup(TupleTableSlot *slot, DestReceiver *self) int natts = typeinfo->natts; int i; +#ifdef PGXC + /* + * If we are having DataRow-based tuple we do not have to encode attribute + * values, just send over the DataRow message as we received it from the + * data node + */ + if (slot->tts_dataRow) + { + pq_putmessage('D', slot->tts_dataRow, slot->tts_dataLen); + return; + } +#endif + /* Set or update my derived attribute info, if needed */ if (myState->attrinfo != typeinfo || myState->nattrs != natts) printtup_prepare_info(myState, typeinfo, natts); diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index d641df8..08e35ae 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -35,7 +35,7 @@ #include "parser/parse_relation.h" #ifdef PGXC #include "pgxc/pgxc.h" -#include "pgxc/datanode.h" +#include "pgxc/execRemote.h" #include "pgxc/locator.h" #include "pgxc/poolmgr.h" #endif @@ -1511,8 +1511,7 @@ DoCopy(const CopyStmt *stmt, const char *queryString) DataNodeCopyFinish( cstate->connections, primary_data_node, - COMBINE_TYPE_NONE, - whereToSendOutput); + COMBINE_TYPE_NONE); pfree(cstate->connections); pfree(cstate->query_buf.data); FreeRelationLocInfo(cstate->rel_loc); @@ -1526,14 +1525,12 @@ DoCopy(const CopyStmt *stmt, const char *queryString) cstate->processed = DataNodeCopyFinish( cstate->connections, primary_data_node, - COMBINE_TYPE_SAME, - whereToSendOutput); + COMBINE_TYPE_SAME); else cstate->processed = DataNodeCopyFinish( cstate->connections, 0, - COMBINE_TYPE_SUM, - whereToSendOutput); + COMBINE_TYPE_SUM); pfree(cstate->connections); pfree(cstate->query_buf.data); FreeRelationLocInfo(cstate->rel_loc); @@ -1775,10 +1772,10 @@ CopyTo(CopyState cstate) #ifdef PGXC if (IS_PGXC_COORDINATOR && !cstate->on_coord) { - DataNodeCopyOut(GetRelationNodes(cstate->rel_loc, NULL, true), - cstate->connections, - whereToSendOutput, - cstate->copy_file); + cstate->processed = DataNodeCopyOut( + GetRelationNodes(cstate->rel_loc, NULL, true), + cstate->connections, + cstate->copy_file); } else { diff --git a/src/backend/executor/execTuples.c b/src/backend/executor/execTuples.c index 06142c9..53e424b 100644 --- a/src/backend/executor/execTuples.c +++ b/src/backend/executor/execTuples.c @@ -149,6 +149,12 @@ ExecCreateTupleTable(int tableSize) slot->tts_shouldFreeMin = false; slot->tts_tuple = NULL; slot->tts_tupleDescriptor = NULL; +#ifdef PGXC + slot->tts_shouldFreeRow = false; + 
slot->tts_dataRow = NULL; + slot->tts_dataLen = -1; + slot->tts_attinmeta = NULL; +#endif slot->tts_mcxt = CurrentMemoryContext; slot->tts_buffer = InvalidBuffer; slot->tts_nvalid = 0; @@ -228,6 +234,12 @@ MakeSingleTupleTableSlot(TupleDesc tupdesc) slot->tts_shouldFreeMin = false; slot->tts_tuple = NULL; slot->tts_tupleDescriptor = NULL; +#ifdef PGXC + slot->tts_shouldFreeRow = false; + slot->tts_dataRow = NULL; + slot->tts_dataLen = -1; + slot->tts_attinmeta = NULL; +#endif slot->tts_mcxt = CurrentMemoryContext; slot->tts_buffer = InvalidBuffer; slot->tts_nvalid = 0; @@ -334,6 +346,12 @@ ExecSetSlotDescriptor(TupleTableSlot *slot, /* slot to change */ if (slot->tts_tupleDescriptor) ReleaseTupleDesc(slot->tts_tupleDescriptor); +#ifdef PGXC + /* XXX there in no routine to release AttInMetadata instance */ + if (slot->tts_attinmeta) + slot->tts_attinmeta = NULL; +#endif + if (slot->tts_values) pfree(slot->tts_values); if (slot->tts_isnull) @@ -415,6 +433,14 @@ ExecStoreTuple(HeapTuple tuple, heap_freetuple(slot->tts_tuple); if (slot->tts_shouldFreeMin) heap_free_minimal_tuple(slot->tts_mintuple); +#ifdef PGXC + if (slot->tts_shouldFreeRow) + pfree(slot->tts_dataRow); + + slot->tts_shouldFreeRow = false; + slot->tts_dataRow = NULL; + slot->tts_dataLen = -1; +#endif /* * Store the new tuple into the specified slot. @@ -476,6 +502,14 @@ ExecStoreMinimalTuple(MinimalTuple mtup, heap_freetuple(slot->tts_tuple); if (slot->tts_shouldFreeMin) heap_free_minimal_tuple(slot->tts_mintuple); +#ifdef PGXC + if (slot->tts_shouldFreeRow) + pfree(slot->tts_dataRow); + + slot->tts_shouldFreeRow = false; + slot->tts_dataRow = NULL; + slot->tts_dataLen = -1; +#endif /* * Drop the pin on the referenced buffer, if there is one. @@ -504,6 +538,62 @@ ExecStoreMinimalTuple(MinimalTuple mtup, return slot; } +#ifdef PGXC +/* -------------------------------- + * ExecStoreDataRowTuple + * + * Store a buffer in DataRow message format into the slot. + * + * -------------------------------- + */ +TupleTableSlot * +ExecStoreDataRowTuple(char *msg, size_t len, TupleTableSlot *slot, bool shouldFree) +{ + /* + * sanity checks + */ + Assert(msg != NULL); + Assert(len > 0); + Assert(slot != NULL); + Assert(slot->tts_tupleDescriptor != NULL); + + /* + * Free any old physical tuple belonging to the slot. + */ + if (slot->tts_shouldFree) + heap_freetuple(slot->tts_tuple); + if (slot->tts_shouldFreeMin) + heap_free_minimal_tuple(slot->tts_mintuple); + if (slot->tts_shouldFreeRow) + pfree(slot->tts_dataRow); + + /* + * Drop the pin on the referenced buffer, if there is one. + */ + if (BufferIsValid(slot->tts_buffer)) + ReleaseBuffer(slot->tts_buffer); + + slot->tts_buffer = InvalidBuffer; + + /* + * Store the new tuple into the specified slot. 
+ */ + slot->tts_isempty = false; + slot->tts_shouldFree = false; + slot->tts_shouldFreeMin = false; + slot->tts_shouldFreeRow = shouldFree; + slot->tts_tuple = NULL; + slot->tts_mintuple = NULL; + slot->tts_dataRow = msg; + slot->tts_dataLen = len; + + /* Mark extracted state invalid */ + slot->tts_nvalid = 0; + + return slot; +} +#endif + /* -------------------------------- * ExecClearTuple * @@ -527,6 +617,14 @@ ExecClearTuple(TupleTableSlot *slot) /* slot in which to store tuple */ heap_freetuple(slot->tts_tuple); if (slot->tts_shouldFreeMin) heap_free_minimal_tuple(slot->tts_mintuple); +#ifdef PGXC + if (slot->tts_shouldFreeRow) + pfree(slot->tts_dataRow); + + slot->tts_shouldFreeRow = false; + slot->tts_dataRow = NULL; + slot->tts_dataLen = -1; +#endif slot->tts_tuple = NULL; slot->tts_mintuple = NULL; @@ -634,7 +732,13 @@ ExecCopySlotTuple(TupleTableSlot *slot) return heap_copytuple(slot->tts_tuple); if (slot->tts_mintuple) return heap_tuple_from_minimal_tuple(slot->tts_mintuple); - +#ifdef PGXC + /* + * Ensure values are extracted from data row to the Datum array + */ + if (slot->tts_dataRow) + slot_getallattrs(slot); +#endif /* * Otherwise we need to build a tuple from the Datum array. */ @@ -667,7 +771,13 @@ ExecCopySlotMinimalTuple(TupleTableSlot *slot) return heap_copy_minimal_tuple(slot->tts_mintuple); if (slot->tts_tuple) return minimal_tuple_from_heap_tuple(slot->tts_tuple); - +#ifdef PGXC + /* + * Ensure values are extracted from data row to the Datum array + */ + if (slot->tts_dataRow) + slot_getallattrs(slot); +#endif /* * Otherwise we need to build a tuple from the Datum array. */ @@ -861,6 +971,14 @@ ExecMaterializeSlot(TupleTableSlot *slot) if (!slot->tts_shouldFreeMin) slot->tts_mintuple = NULL; +#ifdef PGXC + if (!slot->tts_shouldFreeRow) + { + slot->tts_dataRow = NULL; + slot->tts_dataLen = -1; + } +#endif + return slot->tts_tuple; } diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index ae537e5..1bbbb75 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -22,8 +22,10 @@ #include "catalog/pg_type.h" #include "lib/stringinfo.h" #include "nodes/nodeFuncs.h" +#include "nodes/nodes.h" #include "nodes/parsenodes.h" #include "optimizer/clauses.h" +#include "optimizer/tlist.h" #include "parser/parse_agg.h" #include "parser/parse_coerce.h" #include "pgxc/locator.h" @@ -123,12 +125,10 @@ typedef struct XCWalkerContext int varno; bool within_or; bool within_not; + List *join_list; /* A list of List*'s, one for each relation. */ } XCWalkerContext; -/* A list of List*'s, one for each relation. */ -List *join_list = NULL; - /* Forbid unsafe SQL statements */ bool StrictStatementChecking = true; @@ -185,12 +185,12 @@ new_pgxc_join(int relid1, char *aliasname1, int relid2, char *aliasname2) * Look up the join struct for a particular join */ static PGXC_Join * -find_pgxc_join(int relid1, char *aliasname1, int relid2, char *aliasname2) +find_pgxc_join(int relid1, char *aliasname1, int relid2, char *aliasname2, XCWalkerContext *context) { ListCell *lc; /* return if list is still empty */ - if (join_list == NULL) + if (context->join_list == NULL) return NULL; /* in the PGXC_Join struct, we always sort with relid1 < relid2 */ @@ -209,7 +209,7 @@ find_pgxc_join(int relid1, char *aliasname1, int relid2, char *aliasname2) * there should be a small number, so we just search linearly, although * long term a hash table would be better. 
*/ - foreach(lc, join_list) + foreach(lc, context->join_list) { PGXC_Join *pgxcjoin = (PGXC_Join *) lfirst(lc); @@ -225,16 +225,16 @@ find_pgxc_join(int relid1, char *aliasname1, int relid2, char *aliasname2) * Find or create a join between 2 relations */ static PGXC_Join * -find_or_create_pgxc_join(int relid1, char *aliasname1, int relid2, char *aliasname2) +find_or_create_pgxc_join(int relid1, char *aliasname1, int relid2, char *aliasname2, XCWalkerContext *context) { PGXC_Join *pgxcjoin; - pgxcjoin = find_pgxc_join(relid1, aliasname1, relid2, aliasname2); + pgxcjoin = find_pgxc_join(relid1, aliasname1, relid2, aliasname2, context); if (pgxcjoin == NULL) { pgxcjoin = new_pgxc_join(relid1, aliasname1, relid2, aliasname2); - join_list = lappend(join_list, pgxcjoin); + context->join_list = lappend(context->join_list, pgxcjoin); } return pgxcjoin; @@ -277,7 +277,7 @@ free_special_relations(Special_Conditions *special_conditions) * frees join_list */ static void -free_join_list(void) +free_join_list(List *join_list) { if (join_list == NULL) return; @@ -368,13 +368,13 @@ get_base_var(Var *var, XCWalkerContext *context) } else if (rte->rtekind == RTE_SUBQUERY) { - /* + /* * Handle views like select * from v1 where col1 = 1 * where col1 is partition column of base relation */ /* the varattno corresponds with the subquery's target list (projections) */ TargetEntry *tle = list_nth(rte->subquery->targetList, var->varattno - 1); /* or varno? */ - + if (!IsA(tle->expr, Var)) return NULL; /* not column based expressoin, return */ else @@ -684,7 +684,7 @@ examine_conditions_walker(Node *expr_node, XCWalkerContext *context) /* get data struct about these two relations joining */ pgxc_join = find_or_create_pgxc_join(column_base->relid, column_base->relalias, - column_base2->relid, column_base2->relalias); + column_base2->relid, column_base2->relalias, context); if (rel_loc_info1->locatorType == LOCATOR_TYPE_REPLICATED) { @@ -914,7 +914,7 @@ contains_only_pg_catalog (List *rtable) { if (get_rel_namespace(rte->relid) != PG_CATALOG_NAMESPACE) return false; - } else if (rte->rtekind == RTE_SUBQUERY && + } else if (rte->rtekind == RTE_SUBQUERY && !contains_only_pg_catalog (rte->subquery->rtable)) return false; } @@ -967,7 +967,7 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context) { /* May be complicated. Before giving up, just check for pg_catalog usage */ if (contains_only_pg_catalog (query->rtable)) - { + { /* just pg_catalog tables */ context->exec_nodes = (Exec_Nodes *) palloc0(sizeof(Exec_Nodes)); context->exec_nodes->tableusagetype = TABLE_USAGE_TYPE_PGCATALOG; @@ -1018,7 +1018,7 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context) /* We compare to make sure that the subquery is safe to execute with previous- * we may have multiple ones in the FROM clause. 
- * We handle the simple case of allowing multiple subqueries in the from clause, + * We handle the simple case of allowing multiple subqueries in the from clause, * but only allow one of them to not contain replicated tables */ if (!from_query_nodes) @@ -1028,20 +1028,20 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context) /* ok, safe */ if (!from_query_nodes) from_query_nodes = current_nodes; - } + } else { if (from_query_nodes->tableusagetype == TABLE_USAGE_TYPE_USER_REPLICATED) from_query_nodes = current_nodes; else { - /* Allow if they are both using one node, and the same one */ + /* Allow if they are both using one node, and the same one */ if (!same_single_node (from_query_nodes->nodelist, current_nodes->nodelist)) /* Complicated */ return true; } } - } + } else if (rte->rtekind == RTE_RELATION) { /* Look for pg_catalog tables */ @@ -1049,7 +1049,7 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context) current_usage_type = TABLE_USAGE_TYPE_PGCATALOG; else current_usage_type = TABLE_USAGE_TYPE_USER; - } + } else if (rte->rtekind == RTE_FUNCTION) { /* See if it is a catalog function */ @@ -1095,9 +1095,9 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context) return true; /* Examine join conditions, see if each join is single-node safe */ - if (join_list != NULL) + if (context->join_list != NULL) { - foreach(lc, join_list) + foreach(lc, context->join_list) { PGXC_Join *pgxcjoin = (PGXC_Join *) lfirst(lc); @@ -1254,22 +1254,28 @@ static Exec_Nodes * get_plan_nodes(Query *query, bool isRead) { Exec_Nodes *result_nodes; - XCWalkerContext *context = palloc0(sizeof(XCWalkerContext)); - - context->query = query; - context->isRead = isRead; - - context->conditions = (Special_Conditions *) palloc0(sizeof(Special_Conditions)); - context->rtables = lappend(context->rtables, query->rtable); - - join_list = NULL; - - if (get_plan_nodes_walker((Node *) query, context)) + XCWalkerContext context; + + + context.query = query; + context.isRead = isRead; + context.exec_nodes = NULL; + context.conditions = (Special_Conditions *) palloc0(sizeof(Special_Conditions)); + context.rtables = NIL; + context.rtables = lappend(context.rtables, query->rtable); + context.multilevel_join = false; + context.varno = 0; + context.within_or = false; + context.within_not = false; + context.join_list = NIL; + + if (get_plan_nodes_walker((Node *) query, &context)) result_nodes = NULL; else - result_nodes = context->exec_nodes; + result_nodes = context.exec_nodes; - free_special_relations(context->conditions); + free_special_relations(context.conditions); + free_join_list(context.join_list); return result_nodes; } @@ -1304,7 +1310,6 @@ get_plan_nodes_command(Query *query) return NULL; } - free_join_list(); return exec_nodes; } @@ -1345,17 +1350,17 @@ static List * get_simple_aggregates(Query * query) { List *simple_agg_list = NIL; - + /* Check for simple multi-node aggregate */ if (query->hasAggs) { ListCell *lc; int column_pos = 0; - + foreach (lc, query->targetList) { TargetEntry *tle = (TargetEntry *) lfirst(lc); - + if (IsA(tle->expr, Aggref)) { /*PGXC borrowed this code from nodeAgg.c, see ExecInitAgg()*/ @@ -1422,7 +1427,7 @@ get_simple_aggregates(Query * query) get_func_name(finalfn_oid)); } } - + /* resolve actual type of transition state, if polymorphic */ aggcollecttype = aggform->aggcollecttype; @@ -1468,7 +1473,7 @@ get_simple_aggregates(Query * query) get_typlenbyval(aggcollecttype, &simple_agg->transtypeLen, &simple_agg->transtypeByVal); - + /* * initval is 
potentially null, so don't try to access it as a struct * field. Must do it the hard way with SysCacheGetAttr. @@ -1534,6 +1539,427 @@ get_simple_aggregates(Query * query) /* + * add_sort_column --- utility subroutine for building sort info arrays + * + * We need this routine because the same column might be selected more than + * once as a sort key column; if so, the extra mentions are redundant. + * + * Caller is assumed to have allocated the arrays large enough for the + * max possible number of columns. Return value is the new column count. + * + * PGXC: copied from optimizer/plan/planner.c + */ +static int +add_sort_column(AttrNumber colIdx, Oid sortOp, bool nulls_first, + int numCols, AttrNumber *sortColIdx, + Oid *sortOperators, bool *nullsFirst) +{ + int i; + + Assert(OidIsValid(sortOp)); + + for (i = 0; i < numCols; i++) + { + /* + * Note: we check sortOp because it's conceivable that "ORDER BY foo + * USING <, foo USING <<<" is not redundant, if <<< distinguishes + * values that < considers equal. We need not check nulls_first + * however because a lower-order column with the same sortop but + * opposite nulls direction is redundant. + */ + if (sortColIdx[i] == colIdx && sortOperators[i] == sortOp) + { + /* Already sorting by this col, so extra sort key is useless */ + return numCols; + } + } + + /* Add the column */ + sortColIdx[numCols] = colIdx; + sortOperators[numCols] = sortOp; + nullsFirst[numCols] = nulls_first; + return numCols + 1; +} + +/* + * add_distinct_column - utility subroutine to remove redundant columns, just + * like add_sort_column + */ +static int +add_distinct_column(AttrNumber colIdx, Oid eqOp, int numCols, + AttrNumber *sortColIdx, Oid *eqOperators) +{ + int i; + + Assert(OidIsValid(eqOp)); + + for (i = 0; i < numCols; i++) + { + if (sortColIdx[i] == colIdx && eqOperators[i] == eqOp) + { + /* Already sorting by this col, so extra sort key is useless */ + return numCols; + } + } + + /* Add the column */ + sortColIdx[numCols] = colIdx; + eqOperators[numCols] = eqOp; + return numCols + 1; +} + + +/* + * Reconstruct the step query + */ +static void +reconstruct_step_query(List *rtable, bool has_order_by, List *extra_sort, + RemoteQuery *step) +{ + List *context; + bool useprefix; + List *sub_tlist = step->plan.targetlist; + ListCell *l; + StringInfo buf = makeStringInfo(); + char *sql; + char *cur; + char *sql_from; + + context = deparse_context_for_plan((Node *) step, NULL, rtable, NIL); + useprefix = list_length(rtable) > 1; + + foreach(l, sub_tlist) + { + TargetEntry *tle = (TargetEntry *) lfirst(l); + char *exprstr = deparse_expression((Node *) tle->expr, context, + useprefix, false); + + if (buf->len == 0) + { + appendStringInfo(buf, "SELECT "); + if (step->distinct) + appendStringInfo(buf, "DISTINCT "); + } + else + appendStringInfo(buf, ", "); + + appendStringInfoString(buf, exprstr); + } + + /* + * A kind of dummy + * Do not reconstruct remaining query, just search original statement + * for " FROM " and append remainder to the target list we just generated. + * Do not handle the case if " FROM " we found is not a "FROM" keyword, but, + * for example, a part of string constant. 
+ */ + sql = pstrdup(step->sql_statement); /* mutable copy */ + /* string to upper case, for comparing */ + cur = sql; + while (*cur) + { + /* replace whitespace with a space */ + if (isspace((unsigned char) *cur)) + *cur = ' '; + *cur++ = toupper(*cur); + } + + /* find the keyword */ + sql_from = strstr(sql, " FROM "); + if (sql_from) + { + /* the same offset in the original string */ + int offset = sql_from - sql; + /* remove terminating semicolon */ + char *end = strrchr(step->sql_statement, ';'); + *end = '\0'; + + appendStringInfoString(buf, step->sql_statement + offset); + } + + if (extra_sort) + { + foreach(l, extra_sort) + { + TargetEntry *tle = (TargetEntry *) lfirst(l); + char *exprstr = deparse_expression((Node *) tle->expr, context, + useprefix, false); + + if (has_order_by) + appendStringInfo(buf, ", "); + else + { + appendStringInfo(buf, " ORDER BY "); + has_order_by = true; + } + + appendStringInfoString(buf, exprstr); + } + } + + /* do not need the copy */ + pfree(sql); + + /* free previous query */ + pfree(step->sql_statement); + /* get a copy of new query */ + step->sql_statement = pstrdup(buf->data); + /* free the query buffer */ + pfree(buf->data); + pfree(buf); +} + + +/* + * Plan to sort step tuples + * PGXC: copied and adopted from optimizer/plan/planner.c + */ +static void +make_simple_sort_from_sortclauses(Query *query, RemoteQuery *step) +{ + List *sortcls = query->sortClause; + List *distinctcls = query->distinctClause; + List *sub_tlist = step->plan.targetlist; + SimpleSort *sort; + SimpleDistinct *distinct; + ListCell *l; + int numsortkeys; + int numdistkeys; + AttrNumber *sortColIdx; + AttrNumber *distColIdx; + Oid *sortOperators; + Oid *eqOperators; + bool *nullsFirst; + bool need_reconstruct = false; + /* + * List of target list entries from DISTINCT which are not in the ORDER BY. + * The exressions should be appended to the ORDER BY clause of remote query + */ + List *extra_distincts = NIL; + + Assert(step->sort == NULL); + Assert(step->distinct == NULL); + + /* + * We will need at most list_length(sortcls) sort columns; possibly less + * Also need room for extra distinct expressions if we need to append them + */ + numsortkeys = list_length(sortcls) + list_length(distinctcls); + sortColIdx = (AttrNumber *) palloc(numsortkeys * sizeof(AttrNumber)); + sortOperators = (Oid *) palloc(numsortkeys * sizeof(Oid)); + nullsFirst = (bool *) palloc(numsortkeys * sizeof(bool)); + + numsortkeys = 0; + sort = (SimpleSort *) palloc(sizeof(SimpleSort)); + + if (sortcls) + { + foreach(l, sortcls) + { + SortGroupClause *sortcl = (SortGroupClause *) lfirst(l); + TargetEntry *tle = get_sortgroupclause_tle(sortcl, sub_tlist); + + if (tle->resjunk) + need_reconstruct = true; + + /* + * Check for the possibility of duplicate order-by clauses --- the + * parser should have removed 'em, but no point in sorting + * redundantly. + */ + numsortkeys = add_sort_column(tle->resno, sortcl->sortop, + sortcl->nulls_first, + numsortkeys, + sortColIdx, sortOperators, nullsFirst); + } + } + + if (distinctcls) + { + /* + * Validate distinct clause + * We have to sort tuples to filter duplicates, and if ORDER BY clause + * is already present the sort order specified here may be incompatible + * with order needed for distinct. + * + * To be compatible, all expressions from DISTINCT must appear at the + * beginning of ORDER BY list. If list of DISTINCT expressions is longer + * then ORDER BY we can make ORDER BY compatible we can append remaining + * expressions from DISTINCT to ORDER BY. 
Obviously ORDER BY must not + * contain expressions not from the DISTINCT list in this case. + * + * For validation purposes we use column indexes (AttrNumber) to + * identify expressions. May be this is not enough and we should revisit + * the algorithm. + * + * We validate compatibility as follow: + * 1. Make working copy of DISTINCT + * 1a. Remove possible duplicates when copying: do not add expression + * 2. If order by is empty they are already compatible, skip 3 + * 3. Iterate over ORDER BY items + * 3a. If the item is in the working copy delete it from the working + * list. If working list is empty after deletion DISTINCT and + * ORDER BY are compatible, so break the loop. If working list is + * not empty continue iterating + * 3b. ORDER BY clause may contain duplicates. So if we can not found + * expression in the remainder of DISTINCT, probably it has already + * been removed because of duplicate ORDER BY entry. Check original + * DISTINCT clause, if expression is there continue iterating. + * 3c. DISTINCT and ORDER BY are not compatible, emit error + * 4. DISTINCT and ORDER BY are compatible, if we have remaining items + * in the working copy we should append it to the order by list + */ + /* + * Create the list of unique DISTINCT clause expressions + */ + foreach(l, distinctcls) + { + SortGroupClause *distinctcl = (SortGroupClause *) lfirst(l); + TargetEntry *tle = get_sortgroupclause_tle(distinctcl, sub_tlist); + bool found = false; + + if (extra_distincts) + { + ListCell *xl; + + foreach(xl, extra_distincts) + { + TargetEntry *xtle = (TargetEntry *) lfirst(xl); + if (xtle->resno == tle->resno) + { + found = true; + break; + } + } + } + + if (!found) + extra_distincts = lappend(extra_distincts, tle); + } + + if (sortcls) + { + foreach(l, sortcls) + { + SortGroupClause *sortcl = (SortGroupClause *) lfirst(l); + TargetEntry *tle = get_sortgroupclause_tle(sortcl, sub_tlist); + bool found = false; + ListCell *xl; + ListCell *prev = NULL; + + /* Search for the expression in the DISTINCT clause */ + foreach(xl, extra_distincts) + { + TargetEntry *xtle = (TargetEntry *) lfirst(xl); + if (xtle->resno == tle->resno) + { + extra_distincts = list_delete_cell(extra_distincts, xl, + prev); + found = true; + break; + } + prev = xl; + } + + /* Probably we've done */ + if (found && list_length(extra_distincts) == 0) + break; + + /* Ensure sort expression is not a duplicate */ + if (!found) + { + foreach(xl, distinctcls) + { + SortGroupClause *xcl = (SortGroupClause *) lfirst(xl); + TargetEntry *xtle = get_sortgroupclause_tle(xcl, sub_tlist); + if (xtle->resno == tle->resno) + { + /* it is a duplicate then */ + found = true; + break; + } + } + } + + /* Give up, we do not support it */ + if (!found) + { + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + (errmsg("Such combination of ORDER BY and DISTINCT is not yet supported")))); + } + } + } + /* need to append to the ORDER BY */ + if (list_length(extra_distincts) > 0) + need_reconstruct = true; + + /* + * End of validation, expression to append to ORDER BY are in the + * extra_distincts list + */ + + distinct = (SimpleDistinct *) palloc(sizeof(SimpleDistinct)); + + /* + * We will need at most list_length(distinctcls) sort columns + */ + numdistkeys = list_length(distinctcls); + distColIdx = (AttrNumber *) palloc(numdistkeys * sizeof(AttrNumber)); + eqOperators = (Oid *) palloc(numdistkeys * sizeof(Oid)); + + numdistkeys = 0; + + foreach(l, distinctcls) + { + SortGroupClause *distinctcl = (SortGroupClause *) lfirst(l); + TargetEntry 
*tle = get_sortgroupclause_tle(distinctcl, sub_tlist); + + /* + * Check for the possibility of duplicate order-by clauses --- the + * parser should have removed 'em, but no point in sorting + * redundantly. + */ + numdistkeys = add_distinct_column(tle->resno, + distinctcl->eqop, + numdistkeys, + distColIdx, + eqOperators); + /* append also extra sort operator, if not already there */ + numsortkeys = add_sort_column(tle->resno, + distinctcl->sortop, + distinctcl->nulls_first, + numsortkeys, + sortColIdx, + sortOperators, + nullsFirst); + } + + Assert(numdistkeys > 0); + + distinct->numCols = numdistkeys; + distinct->uniqColIdx = distColIdx; + distinct->eqOperators = eqOperators; + + step->distinct = distinct; + } + + + Assert(numsortkeys > 0); + + sort->numCols = numsortkeys; + sort->sortColIdx = sortColIdx; + sort->sortOperators = sortOperators; + sort->nullsFirst = nullsFirst; + + step->sort = sort; + + if (need_reconstruct) + reconstruct_step_query(query->rtable, sortcls != NULL, extra_distincts, + step); +} + +/* * Build up a QueryPlan to execute on. * * For the prototype, there will only be one step, @@ -1543,17 +1969,16 @@ Query_Plan * GetQueryPlan(Node *parsetree, const char *sql_statement, List *querytree_list) { Query_Plan *query_plan = palloc(sizeof(Query_Plan)); - Query_Step *query_step = palloc(sizeof(Query_Step)); + RemoteQuery *query_step = makeNode(RemoteQuery); Query *query; - - query_plan->force_autocommit = false; - query_step->sql_statement = (char *) palloc(strlen(sql_statement) + 1); strcpy(query_step->sql_statement, sql_statement); query_step->exec_nodes = NULL; query_step->combine_type = COMBINE_TYPE_NONE; query_step->simple_aggregates = NULL; + query_step->read_only = false; + query_step->force_autocommit = false; query_plan->query_step_list = lappend(NULL, query_step); @@ -1565,11 +1990,16 @@ GetQueryPlan(Node *parsetree, const char *sql_statement, List *querytree_list) switch (nodeTag(parsetree)) { case T_SelectStmt: + /* Optimize multi-node handling */ + query_step->read_only = true; + /* fallthru */ case T_InsertStmt: case T_UpdateStmt: case T_DeleteStmt: /* just use first one in querytree_list */ query = (Query *) linitial(querytree_list); + /* should copy instead ? */ + query_step->plan.targetlist = query->targetList; /* Perform some checks to make sure we can support the statement */ if (nodeTag(parsetree) == T_SelectStmt) @@ -1633,6 +2063,12 @@ GetQueryPlan(Node *parsetree, const char *sql_statement, List *querytree_list) } /* + * Add sortring to the step + */ + if (query->sortClause || query->distinctClause) + make_simple_sort_from_sortclauses(query, query_step); + + /* * PG-XC cannot yet support some variations of SQL statements. 
* We perform some checks to at least catch common cases */ @@ -1658,15 +2094,6 @@ GetQueryPlan(Node *parsetree, const char *sql_statement, List *querytree_list) ereport(ERROR, (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), (errmsg("Multi-node LIMIT not yet supported")))); - if (query->sortClause && StrictSelectChecking) - ereport(ERROR, - (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), - (errmsg("Multi-node ORDER BY not yet supported")))); - /* PGXCTODO - check if first column partitioning column */ - if (query->distinctClause) - ereport(ERROR, - (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), - (errmsg("Multi-node DISTINCT`not yet supported")))); } } break; @@ -1686,7 +2113,7 @@ GetQueryPlan(Node *parsetree, const char *sql_statement, List *querytree_list) case T_DropdbStmt: case T_VacuumStmt: query_plan->exec_loc_type = EXEC_ON_COORD | EXEC_ON_DATA_NODES; - query_plan->force_autocommit = true; + query_step->force_autocommit = true; break; case T_DropPropertyStmt: @@ -1864,7 +2291,7 @@ GetQueryPlan(Node *parsetree, const char *sql_statement, List *querytree_list) * Free Query_Step struct */ static void -free_query_step(Query_Step *query_step) +free_query_step(RemoteQuery *query_step) { if (query_step == NULL) return; @@ -1894,7 +2321,7 @@ FreeQueryPlan(Query_Plan *query_plan) return; foreach(item, query_plan->query_step_list) - free_query_step((Query_Step *) lfirst(item)); + free_query_step((RemoteQuery *) lfirst(item)); pfree(query_plan->query_step_list); pfree(query_plan); diff --git a/src/backend/pgxc/pool/Makefile b/src/backend/pgxc/pool/Makefile index 7143af5..e875303 100644 --- a/src/backend/pgxc/pool/Makefile +++ b/src/backend/pgxc/pool/Makefile @@ -14,6 +14,6 @@ subdir = src/backend/pgxc/pool top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = combiner.o datanode.o poolmgr.o poolcomm.o +OBJS = datanode.o execRemote.o poolmgr.o poolcomm.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/pgxc/pool/combiner.c b/src/backend/pgxc/pool/combiner.c deleted file mode 100644 index 53e5dfb..0000000 --- a/src/backend/pgxc/pool/combiner.c +++ /dev/null @@ -1,652 +0,0 @@ -/*------------------------------------------------------------------------- - * - * combiner.c - * - * Combine responses from multiple Data Nodes - * - * - * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group - * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation - * - * IDENTIFICATION - * $$ - * - *------------------------------------------------------------------------- - */ - -#include "postgres.h" -#include "pgxc/combiner.h" -#include "pgxc/planner.h" -#include "catalog/pg_type.h" -#include "libpq/libpq.h" -#include "libpq/pqformat.h" -#include "utils/builtins.h" -#include "utils/datum.h" - - -/* - * Create a structure to store parameters needed to combine responses from - * multiple connections as well as state information - */ -ResponseCombiner -CreateResponseCombiner(int node_count, CombineType combine_type, - CommandDest dest) -{ - ResponseCombiner combiner; - - /* ResponseComber is a typedef for pointer to ResponseCombinerData */ - combiner = (ResponseCombiner) palloc(sizeof(ResponseCombinerData)); - if (combiner == NULL) - { - /* Out of memory */ - return combiner; - } - - combiner->node_count = node_count; - combiner->combine_type = combine_type; - combiner->dest = dest; - combiner->command_complete_count = 0; - combiner->row_count = 0; - combiner->request_type = REQUEST_TYPE_NOT_DEFINED; - combiner->description_count = 0; - combiner->copy_in_count = 
0; - combiner->copy_out_count = 0; - combiner->inErrorState = false; - combiner->initAggregates = true; - combiner->simple_aggregates = NULL; - combiner->copy_file = NULL; - - return combiner; -} - -/* - * Parse out row count from the command status response and convert it to integer - */ -static int -parse_row_count(const char *message, size_t len, int *rowcount) -{ - int digits = 0; - int pos; - - *rowcount = 0; - /* skip \0 string terminator */ - for (pos = 0; pos < len - 1; pos++) - { - if (message[pos] >= '0' && message[pos] <= '9') - { - *rowcount = *rowcount * 10 + message[pos] - '0'; - digits++; - } - else - { - *rowcount = 0; - digits = 0; - } - } - return digits; -} - -/* - * Extract a transition value from data row. Invoke the Input Function - * associated with the transition data type to represent value as a Datum. - * Output parameters value and val_null, receive extracted value and indicate - * whether it is null. - */ -static void -parse_aggregate_value(SimpleAgg *simple_agg, char *col_data, size_t datalen, Datum *value, bool *val_null) -{ - /* Check NULL */ - if (datalen == -1) - { - *value = (Datum) 0; - *val_null = true; - } - else - { - resetStringInfo(&simple_agg->valuebuf); - appendBinaryStringInfo(&simple_agg->valuebuf, col_data, datalen); - *value = InputFunctionCall(&simple_agg->arginputfn, simple_agg->valuebuf.data, simple_agg->argioparam, -1); - *val_null = false; - } -} - -/* - * Initialize the collection value, when agregation is first set up, or for a - * new group (grouping support is not implemented yet) - */ -static void -initialize_collect_aggregates(SimpleAgg *simple_agg) -{ - if (simple_agg->initValueIsNull) - simple_agg->collectValue = simple_agg->initValue; - else - simple_agg->collectValue = datumCopy(simple_agg->initValue, - simple_agg->transtypeByVal, - simple_agg->transtypeLen); - simple_agg->noCollectValue = simple_agg->initValueIsNull; - simple_agg->collectValueNull = simple_agg->initValueIsNull; -} - -/* - * Finalize the aggregate after current group or entire relation is processed - * (grouping support is not implemented yet) - */ -static void -finalize_collect_aggregates(SimpleAgg *simple_agg, Datum *resultVal, bool *resultIsNull) -{ - /* - * Apply the agg's finalfn if one is provided, else return collectValue. - */ - if (OidIsValid(simple_agg->finalfn_oid)) - { - FunctionCallInfoData fcinfo; - - InitFunctionCallInfoData(fcinfo, &(simple_agg->finalfn), 1, - (void *) simple_agg, NULL); - fcinfo.arg[0] = simple_agg->collectValue; - fcinfo.argnull[0] = simple_agg->collectValueNull; - if (fcinfo.flinfo->fn_strict && simple_agg->collectValueNull) - { - /* don't call a strict function with NULL inputs */ - *resultVal = (Datum) 0; - *resultIsNull = true; - } - else - { - *resultVal = FunctionCallInvoke(&fcinfo); - *resultIsNull = fcinfo.isnull; - } - } - else - { - *resultVal = simple_agg->collectValue; - *resultIsNull = simple_agg->collectValueNull; - } -} - -/* - * Given new input value(s), advance the transition function of an aggregate. - * - * The new values (and null flags) have been preloaded into argument positions - * 1 and up in fcinfo, so that we needn't copy them again to pass to the - * collection function. No other fields of fcinfo are assumed valid. - * - * It doesn't matter which memory context this is called in. 
- */ -static void -advance_collect_function(SimpleAgg *simple_agg, FunctionCallInfoData *fcinfo) -{ - Datum newVal; - - if (simple_agg->transfn.fn_strict) - { - /* - * For a strict transfn, nothing happens when there's a NULL input; we - * just keep the prior transValue. - */ - if (fcinfo->argnull[1]) - return; - if (simple_agg->noCollectValue) - { - /* - * result has not been initialized - * We must copy the datum into result if it is pass-by-ref. We - * do not need to pfree the old result, since it's NULL. - */ - simple_agg->collectValue = datumCopy(fcinfo->arg[1], - simple_agg->transtypeByVal, - simple_agg->transtypeLen); - simple_agg->collectValueNull = false; - simple_agg->noCollectValue = false; - return; - } - if (simple_agg->collectValueNull) - { - /* - * Don't call a strict function with NULL inputs. Note it is - * possible to get here despite the above tests, if the transfn is - * strict *and* returned a NULL on a prior cycle. If that happens - * we will propagate the NULL all the way to the end. - */ - return; - } - } - - /* - * OK to call the transition function - */ - InitFunctionCallInfoData(*fcinfo, &(simple_agg->transfn), 2, (void *) simple_agg, NULL); - fcinfo->arg[0] = simple_agg->collectValue; - fcinfo->argnull[0] = simple_agg->collectValueNull; - newVal = FunctionCallInvoke(fcinfo); - - /* - * If pass-by-ref datatype, must copy the new value into aggcontext and - * pfree the prior transValue. But if transfn returned a pointer to its - * first input, we don't need to do anything. - */ - if (!simple_agg->transtypeByVal && - DatumGetPointer(newVal) != DatumGetPointer(simple_agg->collectValue)) - { - if (!fcinfo->isnull) - { - newVal = datumCopy(newVal, - simple_agg->transtypeByVal, - simple_agg->transtypeLen); - } - if (!simple_agg->collectValueNull) - pfree(DatumGetPointer(simple_agg->collectValue)); - } - - simple_agg->collectValue = newVal; - simple_agg->collectValueNull = fcinfo->isnull; -} - -/* - * Handle response message and update combiner's state. 
- * This function contains main combiner logic - */ -int -CombineResponse(ResponseCombiner combiner, char msg_type, char *msg_body, size_t len) -{ - int digits = 0; - - /* Ignore anything if we have encountered error */ - if (combiner->inErrorState) - return EOF; - - switch (msg_type) - { - case 'c': /* CopyOutCommandComplete */ - if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED) - combiner->request_type = REQUEST_TYPE_COPY_OUT; - if (combiner->request_type != REQUEST_TYPE_COPY_OUT) - /* Inconsistent responses */ - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("Unexpected response from the data nodes"))); - /* Just do nothing, close message is managed by the coordinator */ - combiner->copy_out_count++; - break; - case 'C': /* CommandComplete */ - /* - * If we did not receive description we are having rowcount or OK - * response - */ - if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED) - combiner->request_type = REQUEST_TYPE_COMMAND; - /* Extract rowcount */ - if (combiner->combine_type != COMBINE_TYPE_NONE) - { - int rowcount; - digits = parse_row_count(msg_body, len, &rowcount); - if (digits > 0) - { - /* Replicated write, make sure they are the same */ - if (combiner->combine_type == COMBINE_TYPE_SAME) - { - if (combiner->command_complete_count) - { - if (rowcount != combiner->row_count) - /* There is a consistency issue in the database with the replicated table */ - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("Write to replicated table returned different results from the data nodes"))); - } - else - /* first result */ - combiner->row_count = rowcount; - } - else - combiner->row_count += rowcount; - } - else - combiner->combine_type = COMBINE_TYPE_NONE; - } - if (++combiner->command_complete_count == combiner->node_count) - { - - if (combiner->simple_aggregates - /* - * Aggregates has not been initialized - that means - * no rows received from data nodes, nothing to send - * It is possible if HAVING clause is present - */ - && !combiner->initAggregates) - { - /* Build up and send a datarow with aggregates */ - StringInfo dataRowBuffer = makeStringInfo(); - ListCell *lc; - - /* Number of fields */ - pq_sendint(dataRowBuffer, list_length(combiner->simple_aggregates), 2); - - foreach (lc, combiner->simple_aggregates) - { - SimpleAgg *simple_agg = (SimpleAgg *) lfirst(lc); - Datum resultVal; - bool resultIsNull; - - finalize_collect_aggregates(simple_agg, &resultVal, &resultIsNull); - /* Aggregation result */ - if (resultIsNull) - { - pq_sendint(dataRowBuffer, -1, 4); - } - else - { - char *text = OutputFunctionCall(&simple_agg->resoutputfn, resultVal); - size_t len = strlen(text); - pq_sendint(dataRowBuffer, len, 4); - pq_sendtext(dataRowBuffer, text, len); - } - } - pq_putmessage('D', dataRowBuffer->data, dataRowBuffer->len); - pfree(dataRowBuffer->data); - pfree(dataRowBuffer); - } - if (combiner->dest == DestRemote - || combiner->dest == DestRemoteExecute) - { - if (combiner->combine_type == COMBINE_TYPE_NONE) - { - pq_putmessage(msg_type, msg_body, len); - } - else - { - char command_complete_buffer[256]; - - /* Truncate msg_body to get base string */ - msg_body[len - digits - 1] = '\0'; - len = sprintf(command_complete_buffer, "%s%d", msg_body, combiner->row_count) + 1; - pq_putmessage(msg_type, command_complete_buffer, len); - } - } - } - break; - case 'T': /* RowDescription */ - if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED) - combiner->request_type = REQUEST_TYPE_QUERY; - if (combiner->request_type != REQUEST_TYPE_QUERY) - { - /* 
Inconsistent responses */ - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("Unexpected response from the data nodes"))); - } - /* Proxy first */ - if (combiner->description_count++ == 0) - { - if (combiner->dest == DestRemote - || combiner->dest == DestRemoteExecute) - pq_putmessage(msg_type, msg_body, len); - } - break; - case 'S': /* ParameterStatus (SET command) */ - if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED) - combiner->request_type = REQUEST_TYPE_QUERY; - if (combiner->request_type != REQUEST_TYPE_QUERY) - { - /* Inconsistent responses */ - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("Unexpected response from the data nodes"))); - } - /* Proxy last */ - if (++combiner->description_count == combiner->node_count) - { - if (combiner->dest == DestRemote - || combiner->dest == DestRemoteExecute) - pq_putmessage(msg_type, msg_body, len); - } - break; - case 'G': /* CopyInResponse */ - if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED) - combiner->request_type = REQUEST_TYPE_COPY_IN; - if (combiner->request_type != REQUEST_TYPE_COPY_IN) - { - /* Inconsistent responses */ - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("Unexpected response from the data nodes"))); - } - /* Proxy first */ - if (combiner->copy_in_count++ == 0) - { - if (combiner->dest == DestRemote - || combiner->dest == DestRemoteExecute) - pq_putmessage(msg_type, msg_body, len); - } - break; - case 'H': /* CopyOutResponse */ - if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED) - combiner->request_type = REQUEST_TYPE_COPY_OUT; - if (combiner->request_type != REQUEST_TYPE_COPY_OUT) - { - /* Inconsistent responses */ - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("Unexpected response from the data nodes"))); - } - /* - * The normal PG code will output an H message when it runs in the - * coordinator, so do not proxy message here, just count it. 
- */ - combiner->copy_out_count++; - break; - case 'd': /* CopyOutDataRow */ - if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED) - combiner->request_type = REQUEST_TYPE_COPY_OUT; - - /* Inconsistent responses */ - if (combiner->request_type != REQUEST_TYPE_COPY_OUT) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("Unexpected response from the data nodes"))); - - /* If there is a copy file, data has to be sent to the local file */ - if (combiner->copy_file) - { - /* write data to the copy file */ - char *data_row; - data_row = (char *) palloc0(len); - memcpy(data_row, msg_body, len); - - fwrite(data_row, 1, len, combiner->copy_file); - break; - } - /* - * In this case data is sent back to the client - */ - if (combiner->dest == DestRemote - || combiner->dest == DestRemoteExecute) - { - StringInfo data_buffer; - - data_buffer = makeStringInfo(); - - pq_sendtext(data_buffer, msg_body, len); - pq_putmessage(msg_type, - data_buffer->data, - data_buffer->len); - - pfree(data_buffer->data); - pfree(data_buffer); - } - break; - case 'D': /* DataRow */ - if (!combiner->simple_aggregates) - { - if (combiner->dest == DestRemote - || combiner->dest == DestRemoteExecute) - pq_putmessage(msg_type, msg_body, len); - } - else - { - ListCell *lc; - char **col_values; - int *col_value_len; - uint16 col_count; - int i, cur = 0; - - /* Get values from the data row into array to speed up access */ - memcpy(&col_count, msg_body, 2); - col_count = ntohs(col_count); - cur += 2; - - col_values = (char **) palloc0(col_count * sizeof(char *)); - col_value_len = (int *) palloc0(col_count * sizeof(int)); - for (i = 0; i < col_count; i++) - { - int n32; - - memcpy(&n32, msg_body + cur, 4); - col_value_len[i] = ntohl(n32); - cur += 4; - - if (col_value_len[i] != -1) - { - col_values[i] = msg_body + cur; - cur += col_value_len[i]; - } - } - - if (combiner->initAggregates) - { - foreach (lc, combiner->simple_aggregates) - initialize_collect_aggregates((SimpleAgg *) lfirst(lc)); - - combiner->initAggregates = false; - } - - foreach (lc, combiner->simple_aggregates) - { - SimpleAgg *simple_agg = (SimpleAgg *) lfirst(lc); - FunctionCallInfoData fcinfo; - - parse_aggregate_value(simple_agg, - col_values[simple_agg->column_pos], - col_value_len[simple_agg->column_pos], - fcinfo.arg + 1, - fcinfo.argnull + 1); - - advance_collect_function(simple_agg, &fcinfo); - } - pfree(col_values); - pfree(col_value_len); - } - break; - case 'E': /* ErrorResponse */ - combiner->inErrorState = true; - /* fallthru */ - case 'A': /* NotificationResponse */ - case 'N': /* NoticeResponse */ - /* Proxy error message back if specified, - * or if doing internal primary copy - */ - if (combiner->dest == DestRemote - || combiner->dest == DestRemoteExecute) - pq_putmessage(msg_type, msg_body, len); - break; - case 'I': /* EmptyQuery */ - default: - /* Unexpected message */ - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("Unexpected response from the data nodes"))); - } - return 0; -} - -/* - * Examine the specified combiner state and determine if command was completed - * successfully - */ -static bool -validate_combiner(ResponseCombiner combiner) -{ - /* There was error message while combining */ - if (combiner->inErrorState) - return false; - /* Check if state is defined */ - if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED) - return false; - /* Check all nodes completed */ - if ((combiner->request_type == REQUEST_TYPE_COMMAND - || combiner->request_type == REQUEST_TYPE_QUERY) - && 
combiner->command_complete_count != combiner->node_count) - return false; - - /* Check count of description responses */ - if (combiner->request_type == REQUEST_TYPE_QUERY - && combiner->description_count != combiner->node_count) - return false; - - /* Check count of copy-in responses */ - if (combiner->request_type == REQUEST_TYPE_COPY_IN - && combiner->copy_in_count != combiner->node_count) - return false; - - /* Check count of copy-out responses */ - if (combiner->request_type == REQUEST_TYPE_COPY_OUT - && combiner->copy_out_count != combiner->node_count) - return false; - - /* Add other checks here as needed */ - - /* All is good if we are here */ - return true; -} - -/* - * Validate combiner and release storage freeing allocated memory - */ -bool -ValidateAndCloseCombiner(ResponseCombiner combiner) -{ - bool valid = validate_combiner(combiner); - - pfree(combiner); - - return valid; -} - -/* - * Validate combiner and reset storage - */ -bool -ValidateAndResetCombiner(ResponseCombiner combiner) -{ - bool valid = validate_combiner(combiner); - - combiner->command_complete_count = 0; - combiner->row_count = 0; - combiner->request_type = REQUEST_TYPE_NOT_DEFINED; - combiner->description_count = 0; - combiner->copy_in_count = 0; - combiner->copy_out_count = 0; - combiner->inErrorState = false; - combiner->simple_aggregates = NULL; - combiner->copy_file = NULL; - - return valid; -} - -/* - * Close combiner and free allocated memory, if it is not needed - */ -void -CloseCombiner(ResponseCombiner combiner) -{ - if (combiner) - pfree(combiner); -} - -/* - * Assign combiner aggregates - */ -void -AssignCombinerAggregates(ResponseCombiner combiner, List *simple_aggregates) -{ - combiner->simple_aggregates = simple_aggregates; -} diff --git a/src/backend/pgxc/pool/datanode.c b/src/backend/pgxc/pool/datanode.c index 6a1aba8..517b1e4 100644 --- a/src/backend/pgxc/pool/datanode.c +++ b/src/backend/pgxc/pool/datanode.c @@ -15,6 +15,7 @@ *------------------------------------------------------------------------- */ +#include "postgres.h" #include <sys/select.h> #include <sys/time.h> #include <sys/types.h> @@ -22,166 +23,33 @@ #include <string.h> #include <unistd.h> #include <errno.h> -#include "pgxc/poolmgr.h" #include "access/gtm.h" #include "access/transam.h" #include "access/xact.h" -#include "postgres.h" -#include "utils/snapmgr.h" -#include "pgxc/pgxc.h" #include "gtm/gtm_c.h" #include "pgxc/datanode.h" #include "pgxc/locator.h" -#include "../interfaces/libpq/libpq-fe.h" +#include "pgxc/pgxc.h" +#include "pgxc/poolmgr.h" +#include "tcop/dest.h" #include "utils/elog.h" #include "utils/memutils.h" - +#include "utils/snapmgr.h" +#include "../interfaces/libpq/libpq-fe.h" #define NO_SOCKET -1 -/* - * Buffer size does not affect performance significantly, just do not allow - * connection buffer grows infinitely - */ -#define COPY_BUFFER_SIZE 8192 -#define PRIMARY_NODE_WRITEAHEAD 1024 * 1024 - static int node_count = 0; static DataNodeHandle *handles = NULL; -static bool autocommit = true; -static DataNodeHandle **write_node_list = NULL; -static int write_node_count = 0; -static DataNodeHandle **get_handles(List *nodelist); -static int get_transaction_nodes(DataNodeHandle **connections); -static void release_handles(void); - -static void data_node_init(DataNodeHandle *handle, int sock); +static void data_node_init(DataNodeHandle *handle, int sock, int nodenum); static void data_node_free(DataNodeHandle *handle); -static int data_node_begin(int conn_count, DataNodeHandle **connections, CommandDest dest, 
GlobalTransactionId gxid); -static int data_node_commit(int conn_count, DataNodeHandle **connections, CommandDest dest); -static int data_node_rollback(int conn_count, DataNodeHandle **connections, CommandDest dest); - -static int ensure_in_buffer_capacity(size_t bytes_needed, DataNodeHandle *handle); -static int ensure_out_buffer_capacity(size_t bytes_needed, DataNodeHandle *handle); - -static int data_node_send_query(DataNodeHandle *handle, const char *query); -static int data_node_send_gxid(DataNodeHandle *handle, GlobalTransactionId gxid); -static int data_node_send_snapshot(DataNodeHandle *handle, Snapshot snapshot); - -static void add_error_message(DataNodeHandle *handle, const char *message); - -static int data_node_read_data(DataNodeHandle *conn); -static int handle_response(DataNodeHandle *conn, ResponseCombiner combiner); - -static int get_int(DataNodeHandle *conn, size_t len, int *out); -static int get_char(DataNodeHandle *conn, char *out); - -static void clear_write_node_list(); - -#define MAX_STATEMENTS_PER_TRAN 10 - -/* Variables to collect statistics */ -static int total_transactions = 0; -static int total_statements = 0; -static int total_autocommit = 0; -static int nonautocommit_2pc = 0; -static int autocommit_2pc = 0; -static int current_tran_statements = 0; -static int *statements_per_transaction = NULL; -static int *nodes_per_transaction = NULL; - -/* - * statistics collection: count a statement - */ -static void -stat_statement() -{ - total_statements++; - current_tran_statements++; -} - -/* - * To collect statistics: count a transaction - */ -static void -stat_transaction(int node_count) -{ - total_transactions++; - if (autocommit) - total_autocommit++; - - if (!statements_per_transaction) - { - statements_per_transaction = (int *) malloc((MAX_STATEMENTS_PER_TRAN + 1) * sizeof(int)); - memset(statements_per_transaction, 0, (MAX_STATEMENTS_PER_TRAN + 1) * sizeof(int)); - } - - if (current_tran_statements > MAX_STATEMENTS_PER_TRAN) - statements_per_transaction[MAX_STATEMENTS_PER_TRAN]++; - else - statements_per_transaction[current_tran_statements]++; - - current_tran_statements = 0; - if (node_count > 0 && node_count <= NumDataNodes) - { - if (!nodes_per_transaction) - { - nodes_per_transaction = (int *) malloc(NumDataNodes * sizeof(int)); - memset(nodes_per_transaction, 0, NumDataNodes * sizeof(int)); - } - nodes_per_transaction[node_count - 1]++; - } -} - - -/* - * To collect statistics: count a two-phase commit on nodes - */ -static void -stat_2pc(void) -{ - if (autocommit) - autocommit_2pc++; - else - nonautocommit_2pc++; -} +static int get_int(DataNodeHandle * conn, size_t len, int *out); +static int get_char(DataNodeHandle * conn, char *out); /* - * Output collected statistics to the log - */ -static void -stat_log(void) -{ - elog(DEBUG1, "Total Transactions: %d Tota... [truncated message content] |
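The patch above folds combiner.c into the new execRemote.c, and the row-count merging rule it carries over is easy to lose in the diff: when a statement runs on several data nodes, each node returns its own CommandComplete tag, and the coordinator either requires the counts to agree (replicated writes, COMBINE_TYPE_SAME) or adds them up (other combine types). The fragment below is a self-contained sketch of that rule, not the actual Postgres-XC code: parse_row_count mimics the helper of the same name from the deleted file, while the standalone harness and the COMBINE_TYPE_SUM name are stand-ins invented for this illustration.

#include <stdio.h>

/* Illustrative stand-ins; only COMBINE_TYPE_NONE and COMBINE_TYPE_SAME
 * appear in the diff above, COMBINE_TYPE_SUM is assumed here. */
typedef enum { COMBINE_TYPE_NONE, COMBINE_TYPE_SAME, COMBINE_TYPE_SUM } CombineType;

/* Pull the trailing integer out of a command tag such as "UPDATE 42",
 * as parse_row_count() in the removed combiner.c does: the running count
 * resets on every non-digit, so only the final digit run survives. */
static int parse_row_count(const char *tag, int *rowcount)
{
    int digits = 0;
    const char *p;

    *rowcount = 0;
    for (p = tag; *p; p++)
    {
        if (*p >= '0' && *p <= '9')
        {
            *rowcount = *rowcount * 10 + (*p - '0');
            digits++;
        }
        else
        {
            *rowcount = 0;
            digits = 0;
        }
    }
    return digits;
}

/* Combine per-node counts. Replicated writes must report the same count on
 * every node; -1 stands in for the "Write to replicated table returned
 * different results from the data nodes" error in the diff above. */
static int combine_row_counts(CombineType type, const char **tags, int n)
{
    int total = 0;
    int i;

    for (i = 0; i < n; i++)
    {
        int rc;

        if (parse_row_count(tags[i], &rc) == 0)
            continue;               /* tag carried no row count */
        if (type == COMBINE_TYPE_SAME)
        {
            if (i > 0 && rc != total)
                return -1;          /* replicas disagree */
            total = rc;
        }
        else
            total += rc;
    }
    return total;
}

int main(void)
{
    const char *tags[2] = { "UPDATE 3", "UPDATE 3" };

    printf("replicated: %d\n", combine_row_counts(COMBINE_TYPE_SAME, tags, 2));
    printf("partitioned: %d\n", combine_row_counts(COMBINE_TYPE_SUM, tags, 2));
    return 0;
}

Compiled standalone this prints 3 for the replicated case and 6 for the summed one, which is the whole point of tracking combine_type per step rather than per plan.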
From: mason_s <ma...@us...> - 2010-06-28 15:18:13
|
Project "Postgres-XC". The branch, master has been updated via 592295640039744c89a1f319d87fb34072a10efa (commit) from a32e437055fe9f9162651fd6edd811b77f443881 (commit) - Log ----------------------------------------------------------------- commit 592295640039744c89a1f319d87fb34072a10efa Author: Mason S <masonsharp@mason-sharps-macbook.local> Date: Mon Jun 28 17:08:20 2010 +0200 Allow rules to be created, provided that they do not use NOTIFY, which is not yet supported. Note that using rules is a bit unsafe. We currently end up passing down the original statement (or something close to it) to the data nodes, but based on the modifications to the rewritten tree from the rules. It is possible to do something that violates the distribution rules of the system. For example, on an update, one could insert into a table that is hash distributed, but populate it with a value that violates this. diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index acd9f97..2608a3f 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -1613,6 +1613,13 @@ transformRuleStmt(RuleStmt *stmt, const char *queryString, bool has_old, has_new; +#ifdef PGXC + if(IsA(action, NotifyStmt)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("Rule may not use NOTIFY, it is not yet supported"))); + +#endif /* * Since outer ParseState isn't parent of inner, have to pass down * the query text by hand. diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index 7f45fb7..a4565e7 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -1757,6 +1757,7 @@ GetQueryPlan(Node *parsetree, const char *sql_statement, List *querytree_list) case T_RemoveOpClassStmt: case T_RemoveOpFamilyStmt: case T_RenameStmt: + case T_RuleStmt: case T_TruncateStmt: case T_VariableSetStmt: case T_ViewStmt: @@ -1841,7 +1842,6 @@ GetQueryPlan(Node *parsetree, const char *sql_statement, List *querytree_list) case T_LoadStmt: case T_NotifyStmt: case T_PrepareStmt: - case T_RuleStmt: case T_UnlistenStmt: /* fall through */ default: ----------------------------------------------------------------------- Summary of changes: src/backend/parser/parse_utilcmd.c | 7 +++++++ src/backend/pgxc/plan/planner.c | 2 +- 2 files changed, 8 insertions(+), 1 deletions(-) hooks/post-receive -- Postgres-XC |
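The guard added to transformRuleStmt above is a plain node-type check over the rule's action list. The toy below imitates just that pattern with invented stand-ins for PostgreSQL's NodeTag machinery (the real test is IsA(action, NotifyStmt) followed by ereport); it is an illustration of the check, not the committed code.

#include <stdio.h>
#include <stdlib.h>

/* Invented miniature of the NodeTag dispatch used by IsA(). */
typedef enum { T_SelectStmt, T_InsertStmt, T_NotifyStmt } NodeTag;

typedef struct { NodeTag type; } Stmt;

/* Reject rule actions Postgres-XC cannot yet support, as the new
 * PGXC block in transformRuleStmt does for NOTIFY. */
static void check_rule_actions(const Stmt *actions, int n)
{
    int i;

    for (i = 0; i < n; i++)
    {
        if (actions[i].type == T_NotifyStmt)
        {
            fprintf(stderr, "ERROR:  Rule may not use NOTIFY, it is not yet supported\n");
            exit(1);
        }
    }
}

int main(void)
{
    Stmt ok[1]  = { { T_InsertStmt } };
    Stmt bad[1] = { { T_NotifyStmt } };

    check_rule_actions(ok, 1);   /* accepted: ordinary INSERT action */
    check_rule_actions(bad, 1);  /* rejected, mirroring the new error */
    return 0;
}

Note that this check is orthogonal to the caveat in the commit message: since something close to the original statement text, not the rewritten tree, is shipped to the data nodes, a rule action can still place a row on a node that does not own its hash bucket.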
From: mason_s <ma...@us...> - 2010-06-27 19:10:31
|
Project "Postgres-XC". The branch, master has been updated via a32e437055fe9f9162651fd6edd811b77f443881 (commit) from 75127cbf9ff834aabc4e4f39f2628f7a9646a6ea (commit) - Log ----------------------------------------------------------------- commit a32e437055fe9f9162651fd6edd811b77f443881 Author: Mason S <masonsharp@mason-sharps-macbook.local> Date: Sun Jun 27 21:07:04 2010 +0200 Handle more types of queries to determine whether or not they can be safely executed within the current XC architecture. Checking is more thorough now, including detection of negative cases. Also fixed a bug with OR. In particular these changes handle subqueries in conditions and detect correlated joins that can be done on nodes when the data is colocated. diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index 0fb4a2b..7f45fb7 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -49,6 +49,16 @@ typedef struct long constant; /* assume long PGXCTODO - should be Datum */ } Literal_Comparison; +/* Parent-Child joins for relations being joined on + * their respective hash distribuion columns + */ +typedef struct +{ + RelationLocInfo *rel_loc_info1; + RelationLocInfo *rel_loc_info2; + OpExpr *opexpr; +} Parent_Child_Join; + /* * This struct helps us detect special conditions to determine what nodes * to execute on. @@ -56,7 +66,7 @@ typedef struct typedef struct { List *partitioned_literal_comps; /* List of Literal_Comparison */ - List *partitioned_parent_child; + List *partitioned_parent_child; /* List of Parent_Child_Join */ List *replicated_joins; /* @@ -96,6 +106,26 @@ typedef struct ColumnBase char *colname; } ColumnBase; +/* Used for looking for XC-safe queries + * + * rtables is a pointer to List, each item of which is + * the rtable for the particular query. This way we can use + * varlevelsup to resolve Vars in nested queries + */ +typedef struct XCWalkerContext +{ + Query *query; + bool isRead; + Exec_Nodes *exec_nodes; /* resulting execution nodes */ + Special_Conditions *conditions; + bool multilevel_join; + List *rtables; /* a pointer to a list of rtables */ + int varno; + bool within_or; + bool within_not; +} XCWalkerContext; + + /* A list of List*'s, one for each relation. 
*/ List *join_list = NULL; @@ -105,6 +135,12 @@ bool StrictStatementChecking = true; /* Forbid multi-node SELECT statements with an ORDER BY clause */ bool StrictSelectChecking = false; + +static Exec_Nodes *get_plan_nodes(Query *query, bool isRead); +static bool get_plan_nodes_walker(Node *query_node, XCWalkerContext *context); +static bool examine_conditions_walker(Node *expr_node, XCWalkerContext *context); + + /* * True if both lists contain only one node and are the same */ @@ -113,7 +149,7 @@ same_single_node (List *nodelist1, List *nodelist2) { return nodelist1 && list_length(nodelist1) == 1 && nodelist2 && list_length(nodelist2) == 1 - && linitial_int(nodelist1) != linitial_int(nodelist2); + && linitial_int(nodelist1) == linitial_int(nodelist2); } /* @@ -234,6 +270,7 @@ free_special_relations(Special_Conditions *special_conditions) list_free(special_conditions->replicated_joins); pfree(special_conditions); + special_conditions = NULL; } /* @@ -246,6 +283,7 @@ free_join_list(void) return; list_free_deep(join_list); + join_list = NULL; } /* @@ -287,18 +325,28 @@ get_numeric_constant(Expr *expr) * This is required because a RangeTblEntry may actually be another * type, like a join, and we need to then look at the joinaliasvars * to determine what the base table and column really is. + * + * rtables is a List of rtable Lists. */ static ColumnBase* -get_base_var(Var *var, List *rtables) +get_base_var(Var *var, XCWalkerContext *context) { RangeTblEntry *rte; + List *col_rtable; /* Skip system attributes */ if (!AttrNumberIsForUserDefinedAttr(var->varattno)) return NULL; - /* get the RangeTableEntry */ - rte = list_nth(rtables, var->varno - 1); + /* + * Get the RangeTableEntry + * We take nested subqueries into account first, + * we may need to look further up the query tree. + * The most recent rtable is at the end of the list; top most one is first. + */ + Assert (list_length(context->rtables) - var->varlevelsup > 0); + col_rtable = list_nth(context->rtables, (list_length(context->rtables) - var->varlevelsup) - 1); + rte = list_nth(col_rtable, var->varno - 1); if (rte->rtekind == RTE_RELATION) { @@ -316,8 +364,7 @@ get_base_var(Var *var, List *rtables) Var *colvar = list_nth(rte->joinaliasvars, var->varattno - 1); /* continue resolving recursively */ - return get_base_var(colvar, rtables); - //may need to set this, toocolumn_base->relalias = rte->eref->aliasname; + return get_base_var(colvar, context); } else if (rte->rtekind == RTE_SUBQUERY) { @@ -332,10 +379,16 @@ get_base_var(Var *var, List *rtables) return NULL; /* not column based expressoin, return */ else { + ColumnBase *base; Var *colvar = (Var *) tle->expr; /* continue resolving recursively */ - return get_base_var(colvar, rte->subquery->rtable); + /* push onto rtables list */ + context->rtables = lappend(context->rtables, rte->subquery->rtable); + base = get_base_var(colvar, context); + /* pop from rtables list */ + context->rtables = list_delete_ptr(context->rtables, rte->subquery->rtable); + return base; } } @@ -403,7 +456,7 @@ get_plan_nodes_insert(Query *query) if (!IsA(tle->expr, Const)) { - eval_expr = eval_const_expressions(NULL, (Node *) tle->expr); + eval_expr = (Expr *) eval_const_expressions(NULL, (Node *) tle->expr); checkexpr = get_numeric_constant(eval_expr); } @@ -440,7 +493,7 @@ get_plan_nodes_insert(Query *query) /* - * examine_conditions + * examine_conditions_walker * * Examine conditions and find special ones to later help us determine * what tables can be joined together. 
Put findings in Special_Conditions @@ -453,66 +506,96 @@ get_plan_nodes_insert(Query *query) * If we encounter a cross-node join, we stop processing and return false, * otherwise true. * - * PGXCTODO: Recognize subqueries, and give up (long term allow safe ones). - * */ static bool -examine_conditions(Special_Conditions *conditions, List *rtables, Node *expr_node) +examine_conditions_walker(Node *expr_node, XCWalkerContext *context) { RelationLocInfo *rel_loc_info1, *rel_loc_info2; Const *constant; Expr *checkexpr; + bool result = false; + bool is_and = false; + Assert(!context); + if (expr_node == NULL) - return true; + return false; - if (rtables == NULL) + if (!context->rtables) return true; - if (conditions == NULL) - conditions = new_special_conditions(); + if (!context->conditions) + context->conditions = new_special_conditions(); - if (IsA(expr_node, BoolExpr)) + if (IsA(expr_node, Var)) + { + /* If we get here, that meant the previous call before recursing down did not + * find the condition safe yet. + * Since we pass down our context, this is the bit of code that will detect + * that we are using more than one relation in a condition which has not + * already been deemed safe. + */ + Var *var_node = (Var *) expr_node; + + if (context->varno) + { + if (var_node->varno != context->varno) + return true; + } + else + { + context->varno = var_node->varno; + return false; + } + } + + else if (IsA(expr_node, BoolExpr)) { BoolExpr *boolexpr = (BoolExpr *) expr_node; - /* Recursively handle ANDed expressions, but don't handle others */ if (boolexpr->boolop == AND_EXPR) + is_and = true; + if (boolexpr->boolop == NOT_EXPR) { - if (!examine_conditions(conditions, rtables, - linitial(boolexpr->args))) - return false; + bool save_within_not = context->within_not; + context->within_not = true; - return examine_conditions( - conditions, rtables, lsecond(boolexpr->args)); + if (examine_conditions_walker(linitial(boolexpr->args), context)) + { + context->within_not = save_within_not; + return true; + } + context->within_not = save_within_not; + return false; } else if (boolexpr->boolop == OR_EXPR) { - /* - * look at OR's as work-around for reported issue. - * NOTE: THIS IS NOT CORRECT, BUT JUST DONE FOR THE PROTOTYPE. - * More rigorous - * checking needs to be done. PGXCTODO: Add careful checking for - * OR'ed conditions... 
- */ - if (!examine_conditions(conditions, rtables, - linitial(boolexpr->args))) - return false; + bool save_within_or = context->within_or; + context->within_or = true; - return examine_conditions( - conditions, rtables, lsecond(boolexpr->args)); - } - else - /* looks complicated, give up */ - return false; + if (examine_conditions_walker(linitial(boolexpr->args), context)) + { + context->within_or = save_within_or; + return true; + } - return true; + if (examine_conditions_walker(lsecond(boolexpr->args), context)) + { + context->within_or = save_within_or; + return true; + } + context->within_or = save_within_or; + return false; + } } - - if (IsA(expr_node, OpExpr)) + /* + * Look for equality conditions on partiioned columns, but only do so + * if we are not in an OR or NOT expression + */ + if (!context->within_or && !context->within_not && IsA(expr_node, OpExpr)) { OpExpr *opexpr = (OpExpr *) expr_node; @@ -528,10 +611,10 @@ examine_conditions(Special_Conditions *conditions, List *rtables, Node *expr_nod /* get the RangeTableEntry */ Var *colvar = (Var *) arg1; - ColumnBase *column_base = get_base_var(colvar, rtables); + ColumnBase *column_base = get_base_var(colvar, context); if (!column_base) - return false; + return true; /* Look at other argument */ checkexpr = arg2; @@ -540,7 +623,7 @@ examine_conditions(Special_Conditions *conditions, List *rtables, Node *expr_nod if (!IsA(arg2, Const)) { /* this gets freed when the memory context gets freed */ - Expr *eval_expr = eval_const_expressions(NULL, (Node *) arg2); + Expr *eval_expr = (Expr *) eval_const_expressions(NULL, (Node *) arg2); checkexpr = get_numeric_constant(eval_expr); } @@ -555,7 +638,7 @@ examine_conditions(Special_Conditions *conditions, List *rtables, Node *expr_nod rel_loc_info1 = GetRelationLocInfo(column_base->relid); if (!rel_loc_info1) - return false; + return true; /* If hash partitioned, check if the part column was used */ if (IsHashColumn(rel_loc_info1, column_base->colname)) @@ -569,18 +652,17 @@ examine_conditions(Special_Conditions *conditions, List *rtables, Node *expr_nod lit_comp->col_name = column_base->colname; lit_comp->constant = constant->constvalue; - conditions->partitioned_literal_comps = lappend( - conditions->partitioned_literal_comps, + context->conditions->partitioned_literal_comps = lappend( + context->conditions->partitioned_literal_comps, lit_comp); - return true; + return false; } else { - /* unimportant comparison, just return */ + /* Continue walking below */ if (rel_loc_info1) FreeRelationLocInfo(rel_loc_info1); - return true; } } @@ -593,59 +675,56 @@ examine_conditions(Special_Conditions *conditions, List *rtables, Node *expr_nod rel_loc_info1 = GetRelationLocInfo(column_base->relid); if (!rel_loc_info1) - return false; + return true; - column_base2 = get_base_var(colvar2, rtables); + column_base2 = get_base_var(colvar2, context); if (!column_base2) - return false; + return true; rel_loc_info2 = GetRelationLocInfo(column_base2->relid); /* get data struct about these two relations joining */ pgxc_join = find_or_create_pgxc_join(column_base->relid, column_base->relalias, column_base2->relid, column_base2->relalias); - /* - * pgxc_join->condition_list = - * lappend(pgxc_join->condition_list, opexpr); - */ - if (rel_loc_info1->locatorType == LOCATOR_TYPE_REPLICATED) { /* add to replicated join conditions */ - conditions->replicated_joins = - lappend(conditions->replicated_joins, opexpr); + context->conditions->replicated_joins = + lappend(context->conditions->replicated_joins, opexpr); + 
+ if (colvar->varlevelsup != colvar2->varlevelsup) + context->multilevel_join = true; if (rel_loc_info2->locatorType != LOCATOR_TYPE_REPLICATED) { /* Note other relation, saves us work later. */ - conditions->base_rel_name = column_base2->relname; - conditions->base_rel_loc_info = rel_loc_info2; + context->conditions->base_rel_name = column_base2->relname; + context->conditions->base_rel_loc_info = rel_loc_info2; if (rel_loc_info1) FreeRelationLocInfo(rel_loc_info1); } - if (conditions->base_rel_name == NULL) + if (context->conditions->base_rel_name == NULL) { - conditions->base_rel_name = column_base->relname; - conditions->base_rel_loc_info = rel_loc_info1; + context->conditions->base_rel_name = column_base->relname; + context->conditions->base_rel_loc_info = rel_loc_info1; if (rel_loc_info2) FreeRelationLocInfo(rel_loc_info2); } /* note nature of join between the two relations */ pgxc_join->join_type = JOIN_REPLICATED; - return true; + return false; } - - if (rel_loc_info2->locatorType == LOCATOR_TYPE_REPLICATED) + else if (rel_loc_info2->locatorType == LOCATOR_TYPE_REPLICATED) { /* add to replicated join conditions */ - conditions->replicated_joins = - lappend(conditions->replicated_joins, opexpr); + context->conditions->replicated_joins = + lappend(context->conditions->replicated_joins, opexpr); /* other relation not replicated, note it for later */ - conditions->base_rel_name = column_base->relname; - conditions->base_rel_loc_info = rel_loc_info1; + context->conditions->base_rel_name = column_base->relname; + context->conditions->base_rel_loc_info = rel_loc_info1; /* note nature of join between the two relations */ pgxc_join->join_type = JOIN_REPLICATED; @@ -653,11 +732,9 @@ examine_conditions(Special_Conditions *conditions, List *rtables, Node *expr_nod if (rel_loc_info2) FreeRelationLocInfo(rel_loc_info2); - return true; + return false; } - /* Now check for a partitioned join */ - /* * PGXCTODO - for the prototype, we assume all partitioned * tables are on the same nodes. @@ -666,36 +743,113 @@ examine_conditions(Special_Conditions *conditions, List *rtables, Node *expr_nod && IsHashColumn(rel_loc_info2, column_base2->colname)) { /* We found a partitioned join */ - conditions->partitioned_parent_child = - lappend(conditions->partitioned_parent_child, - opexpr); + Parent_Child_Join *parent_child = (Parent_Child_Join *) + palloc0(sizeof(Parent_Child_Join)); + + parent_child->rel_loc_info1 = rel_loc_info1; + parent_child->rel_loc_info2 = rel_loc_info2; + parent_child->opexpr = opexpr; + + context->conditions->partitioned_parent_child = + lappend(context->conditions->partitioned_parent_child, + parent_child); pgxc_join->join_type = JOIN_COLOCATED_PARTITIONED; - return true; + if (colvar->varlevelsup != colvar2->varlevelsup) + context->multilevel_join = true; + return false; } /* * At this point, there is some other type of join that * can probably not be executed on only a single node. - * Just return. Important: We preserve previous + * Just return, as it may be updated later. + * Important: We preserve previous * pgxc_join->join_type value, there may be multiple * columns joining two tables, and we want to make sure at * least one of them make it colocated partitioned, in * which case it will update it when examining another * condition. 
*/ - return true; + return false; + } + } + } + } + + /* Handle subquery */ + if (IsA(expr_node, SubLink)) + { + List *current_rtable; + bool is_multilevel; + int save_parent_child_count = 0; + SubLink *sublink = (SubLink *) expr_node; + Exec_Nodes *save_exec_nodes = context->exec_nodes; /* Save old exec_nodes */ + + /* save parent-child count */ + if (context->exec_nodes) + save_parent_child_count = list_length(context->conditions->partitioned_parent_child); + + context->exec_nodes = NULL; + context->multilevel_join = false; + current_rtable = ((Query *) sublink->subselect)->rtable; + + /* push onto rtables list before recursing */ + context->rtables = lappend(context->rtables, current_rtable); + + if (get_plan_nodes_walker(sublink->subselect, context)) + return true; + + /* pop off (remove) rtable */ + context->rtables = list_delete_ptr(context->rtables, current_rtable); + + is_multilevel = context->multilevel_join; + context->multilevel_join = false; + + /* Allow for replicated tables */ + if (!context->exec_nodes) + context->exec_nodes = save_exec_nodes; + else + { + if (save_exec_nodes) + { + if (context->exec_nodes->tableusagetype == TABLE_USAGE_TYPE_USER_REPLICATED) + { + context->exec_nodes = save_exec_nodes; } else + { + if (save_exec_nodes->tableusagetype != TABLE_USAGE_TYPE_USER_REPLICATED) + { + /* See if they run on the same node */ + if (same_single_node (context->exec_nodes->nodelist, save_exec_nodes->nodelist)) + return false; + } + else + /* use old value */ + context->exec_nodes = save_exec_nodes; + } + } else + { + if (context->exec_nodes->tableusagetype == TABLE_USAGE_TYPE_USER_REPLICATED) + return false; + /* See if subquery safely joins with parent */ + if (!is_multilevel) return true; - } } - /* PGXCTODO - need to more finely examine other operators */ } - return true; + /* Keep on walking */ + result = expression_tree_walker(expr_node, examine_conditions_walker, (void *) context); + + /* Reset context->varno if is_and to detect cross-node operations */ + if (is_and) + context->varno = 0; + + return result; } + /* * examine_conditions_fromlist - Examine FROM clause for joins * @@ -703,46 +857,42 @@ examine_conditions(Special_Conditions *conditions, List *rtables, Node *expr_nod * to help us decide which nodes to execute on. 
*/ static bool -examine_conditions_fromlist(Special_Conditions *conditions, List *rtables, - Node *treenode) +examine_conditions_fromlist(Node *treenode, XCWalkerContext *context) { - if (treenode == NULL) - return true; - - if (rtables == NULL) - return true; + return false; - if (conditions == NULL) - conditions = new_special_conditions(); + if (context->rtables == NULL) + return false; if (IsA(treenode, JoinExpr)) { JoinExpr *joinexpr = (JoinExpr *) treenode; /* recursively examine FROM join tree */ - if (!examine_conditions_fromlist(conditions, rtables, joinexpr->larg)) - return false; + if (examine_conditions_fromlist(joinexpr->larg, context)) + return true; - if (!examine_conditions_fromlist(conditions, rtables, joinexpr->rarg)) - return false; + if (examine_conditions_fromlist(joinexpr->rarg, context)) + return true; /* Now look at join condition */ - if (!examine_conditions(conditions, rtables, joinexpr->quals)) - return false; - return true; + if (examine_conditions_walker(joinexpr->quals, context)) + return true; + + return false; } else if (IsA(treenode, RangeTblRef)) - return true; + return false; else if (IsA(treenode, BoolExpr) ||IsA(treenode, OpExpr)) { /* check base condition, if possible */ - if (!examine_conditions(conditions, rtables, treenode)) - return false; + if (examine_conditions_walker(treenode, context)); + return true; } /* Some other more complicated beast */ - return false; + return true; } @@ -779,18 +929,15 @@ contains_only_pg_catalog (List *rtable) * * returns NULL if it appears to be a mutli-step query. */ -static Exec_Nodes * -get_plan_nodes(Query_Plan *query_plan, Query *query, bool isRead) +static bool +get_plan_nodes_walker(Node *query_node, XCWalkerContext *context) { + Query *query; RangeTblEntry *rte; ListCell *lc, *item; - Special_Conditions *special_conditions; - OpExpr *opexpr; - Var *colvar; RelationLocInfo *rel_loc_info; Exec_Nodes *test_exec_nodes = NULL; - Exec_Nodes *exec_nodes = NULL; Exec_Nodes *current_nodes = NULL; Exec_Nodes *from_query_nodes = NULL; TableUsageType table_usage_type = TABLE_USAGE_TYPE_NO_TABLE; @@ -798,15 +945,14 @@ get_plan_nodes(Query_Plan *query_plan, Query *query, bool isRead) int from_subquery_count = 0; - exec_nodes = NULL; - join_list = NULL; + if (!query_node && !IsA(query_node,Query)) + return true; + + query = (Query *) query_node; /* If no tables, just return */ if (query->rtable == NULL && query->jointree == NULL) - return NULL; - - /* Alloc and init struct */ - special_conditions = new_special_conditions(); + return false; /* Look for special conditions */ @@ -817,22 +963,19 @@ get_plan_nodes(Query_Plan *query_plan, Query *query, bool isRead) if (IsA(treenode, JoinExpr)) { - if (!examine_conditions_fromlist(special_conditions, query->rtable, - treenode)) + if (examine_conditions_fromlist(treenode, context)) { /* May be complicated. 
Before giving up, just check for pg_catalog usage */ if (contains_only_pg_catalog (query->rtable)) { /* just pg_catalog tables */ - exec_nodes = (Exec_Nodes *) palloc0(sizeof(Exec_Nodes)); - exec_nodes->tableusagetype = TABLE_USAGE_TYPE_PGCATALOG; - free_special_relations(special_conditions); - return exec_nodes; + context->exec_nodes = (Exec_Nodes *) palloc0(sizeof(Exec_Nodes)); + context->exec_nodes->tableusagetype = TABLE_USAGE_TYPE_PGCATALOG; + return false; } /* complicated */ - free_special_relations(special_conditions); - return NULL; + return true; } } else if (IsA(treenode, RangeTblRef)) @@ -844,20 +987,34 @@ get_plan_nodes(Query_Plan *query_plan, Query *query, bool isRead) if (rte->rtekind == RTE_SUBQUERY) { + Exec_Nodes *save_exec_nodes = context->exec_nodes; + Special_Conditions *save_conditions = context->conditions; /* Save old conditions */ + List *current_rtable = rte->subquery->rtable; + from_subquery_count++; + /* * Recursively call for subqueries. * Note this also works for views, which are rewritten as subqueries. */ - current_nodes = get_plan_nodes(query_plan, rte->subquery, isRead); + context->rtables = lappend(context->rtables, current_rtable); + context->conditions = (Special_Conditions *) palloc0(sizeof(Special_Conditions)); + + if (get_plan_nodes_walker((Node *) rte->subquery, context)) + return true; + + /* restore rtables and conditions */ + context->rtables = list_delete_ptr(context->rtables, current_rtable); + context->conditions = save_conditions; + + current_nodes = context->exec_nodes; + context->exec_nodes = save_exec_nodes; + if (current_nodes) current_usage_type = current_nodes->tableusagetype; else - { /* could be complicated */ - free_special_relations(special_conditions); - return NULL; - } + return true; /* We compare to make sure that the subquery is safe to execute with previous- * we may have multiple ones in the FROM clause. 
@@ -880,11 +1037,8 @@ get_plan_nodes(Query_Plan *query_plan, Query *query, bool isRead) { /* Allow if they are both using one node, and the same one */ if (!same_single_node (from_query_nodes->nodelist, current_nodes->nodelist)) - { /* Complicated */ - free_special_relations(special_conditions); - return NULL; - } + return true; } } } @@ -904,18 +1058,13 @@ get_plan_nodes(Query_Plan *query_plan, Query *query, bool isRead) current_usage_type = TABLE_USAGE_TYPE_PGCATALOG; else { - //current_usage_type = TABLE_USAGE_TYPE_USER; /* Complicated */ - free_special_relations(special_conditions); - return NULL; + return true; } } else - { /* could be complicated */ - free_special_relations(special_conditions); - return NULL; - } + return true; /* See if we have pg_catalog mixed with other tables */ if (table_usage_type == TABLE_USAGE_TYPE_NO_TABLE) @@ -923,34 +1072,27 @@ get_plan_nodes(Query_Plan *query_plan, Query *query, bool isRead) else if (current_usage_type != table_usage_type) { /* mixed- too complicated for us for now */ - free_special_relations(special_conditions); - return NULL; + return true; } } else { /* could be complicated */ - free_special_relations(special_conditions); - return NULL; + return true; } } /* If we are just dealing with pg_catalog, just return */ if (table_usage_type == TABLE_USAGE_TYPE_PGCATALOG) { - exec_nodes = (Exec_Nodes *) palloc0(sizeof(Exec_Nodes)); - exec_nodes->tableusagetype = TABLE_USAGE_TYPE_PGCATALOG; - return exec_nodes; + context->exec_nodes = (Exec_Nodes *) palloc0(sizeof(Exec_Nodes)); + context->exec_nodes->tableusagetype = TABLE_USAGE_TYPE_PGCATALOG; + return false; } /* Examine the WHERE clause, too */ - if (!examine_conditions(special_conditions, query->rtable, - query->jointree->quals)) - { - /* if cross joins may exist, just return NULL */ - free_special_relations(special_conditions); - return NULL; - } + if (examine_conditions_walker(query->jointree->quals, context)) + return true; /* Examine join conditions, see if each join is single-node safe */ if (join_list != NULL) @@ -961,17 +1103,14 @@ get_plan_nodes(Query_Plan *query_plan, Query *query, bool isRead) /* If it is not replicated or parent-child, not single-node safe */ if (pgxcjoin->join_type == JOIN_OTHER) - { - free_special_relations(special_conditions); - return NULL; - } + return true; } } /* check for non-partitioned cases */ - if (special_conditions->partitioned_parent_child == NULL && - special_conditions->partitioned_literal_comps == NULL) + if (context->conditions->partitioned_parent_child == NULL && + context->conditions->partitioned_literal_comps == NULL) { /* * We have either a single table, just replicated tables, or a @@ -980,7 +1119,7 @@ get_plan_nodes(Query_Plan *query_plan, Query *query, bool isRead) */ /* See if we noted a table earlier to use */ - rel_loc_info = special_conditions->base_rel_loc_info; + rel_loc_info = context->conditions->base_rel_loc_info; if (rel_loc_info == NULL) { @@ -994,7 +1133,7 @@ get_plan_nodes(Query_Plan *query_plan, Query *query, bool isRead) * If the query is rewritten (which can be due to rules or views), * ignore extra stuff. 
Also ignore subqueries we have processed */ - if (!rte->inFromCl || rte->rtekind != RTE_RELATION) + if ((!rte->inFromCl && query->commandType == CMD_SELECT) || rte->rtekind != RTE_RELATION) continue; /* PGXCTODO - handle RTEs that are functions */ @@ -1003,7 +1142,7 @@ get_plan_nodes(Query_Plan *query_plan, Query *query, bool isRead) * Too complicated, we have multiple relations that still * cannot be joined safely */ - return NULL; + return true; rtesave = rte; } @@ -1014,35 +1153,35 @@ get_plan_nodes(Query_Plan *query_plan, Query *query, bool isRead) rel_loc_info = GetRelationLocInfo(rtesave->relid); if (!rel_loc_info) - return NULL; + return true; - exec_nodes = GetRelationNodes(rel_loc_info, NULL, isRead); + context->exec_nodes = GetRelationNodes(rel_loc_info, NULL, context->isRead); } } else { - exec_nodes = GetRelationNodes(rel_loc_info, NULL, isRead); + context->exec_nodes = GetRelationNodes(rel_loc_info, NULL, context->isRead); } /* Note replicated table usage for determining safe queries */ - if (exec_nodes) + if (context->exec_nodes) { if (table_usage_type == TABLE_USAGE_TYPE_USER && IsReplicated(rel_loc_info)) table_usage_type = TABLE_USAGE_TYPE_USER_REPLICATED; - else - exec_nodes->tableusagetype = table_usage_type; + + context->exec_nodes->tableusagetype = table_usage_type; } } /* check for partitioned col comparison against a literal */ - else if (list_length(special_conditions->partitioned_literal_comps) > 0) + else if (list_length(context->conditions->partitioned_literal_comps) > 0) { - exec_nodes = NULL; + context->exec_nodes = NULL; /* * Make sure that if there are multiple such comparisons, that they * are all on the same nodes. */ - foreach(lc, special_conditions->partitioned_literal_comps) + foreach(lc, context->conditions->partitioned_literal_comps) { Literal_Comparison *lit_comp = (Literal_Comparison *) lfirst(lc); @@ -1050,14 +1189,13 @@ get_plan_nodes(Query_Plan *query_plan, Query *query, bool isRead) lit_comp->rel_loc_info, &(lit_comp->constant), true); test_exec_nodes->tableusagetype = table_usage_type; - if (exec_nodes == NULL) - exec_nodes = test_exec_nodes; + if (context->exec_nodes == NULL) + context->exec_nodes = test_exec_nodes; else { - if (!same_single_node(exec_nodes->nodelist, test_exec_nodes->nodelist)) + if (!same_single_node(context->exec_nodes->nodelist, test_exec_nodes->nodelist)) { - free_special_relations(special_conditions); - return NULL; + return true; } } } @@ -1069,67 +1207,87 @@ get_plan_nodes(Query_Plan *query_plan, Query *query, bool isRead) * no partitioned column comparison condition with a literal. We just * use one of the tables as a basis for node determination. 
*/ - ColumnBase *column_base; - - opexpr = (OpExpr *) linitial(special_conditions->partitioned_parent_child); + Parent_Child_Join *parent_child; - colvar = (Var *) linitial(opexpr->args); + parent_child = (Parent_Child_Join *) + linitial(context->conditions->partitioned_parent_child); - /* get the RangeTableEntry */ - column_base = get_base_var(colvar, query->rtable); - if (!column_base) - return false; - - rel_loc_info = GetRelationLocInfo(column_base->relid); - if (!rel_loc_info) - return false; - - exec_nodes = GetRelationNodes(rel_loc_info, NULL, isRead); - exec_nodes->tableusagetype = table_usage_type; + context->exec_nodes = GetRelationNodes(parent_child->rel_loc_info1, NULL, context->isRead); + context->exec_nodes->tableusagetype = table_usage_type; } - free_special_relations(special_conditions); if (from_query_nodes) { - if (!exec_nodes) - return from_query_nodes; + if (!context->exec_nodes) + { + context->exec_nodes = from_query_nodes; + return false; + } /* Just use exec_nodes if the from subqueries are all replicated or using the exact * same node */ else if (from_query_nodes->tableusagetype == TABLE_USAGE_TYPE_USER_REPLICATED - || (same_single_node(from_query_nodes->nodelist, exec_nodes->nodelist))) - return exec_nodes; + || (same_single_node(from_query_nodes->nodelist, context->exec_nodes->nodelist))) + return false; else { - /* We allow views, where the (rewritten) subquery may be on all nodes, but the parent - * query applies a condition on the from subquery. + /* We allow views, where the (rewritten) subquery may be on all nodes, + * but the parent query applies a condition on the from subquery. */ if (list_length(query->jointree->fromlist) == from_subquery_count - && list_length(exec_nodes->nodelist) == 1) - return exec_nodes; + && list_length(context->exec_nodes->nodelist) == 1) + return false; } /* Too complicated, give up */ - return NULL; + return true; } - return exec_nodes; + return false; +} + + +/* + * Top level entry point before walking query to determine plan nodes + * + */ +static Exec_Nodes * +get_plan_nodes(Query *query, bool isRead) +{ + Exec_Nodes *result_nodes; + XCWalkerContext *context = palloc0(sizeof(XCWalkerContext)); + + context->query = query; + context->isRead = isRead; + + context->conditions = (Special_Conditions *) palloc0(sizeof(Special_Conditions)); + context->rtables = lappend(context->rtables, query->rtable); + + join_list = NULL; + + if (get_plan_nodes_walker((Node *) query, context)) + result_nodes = NULL; + else + result_nodes = context->exec_nodes; + + free_special_relations(context->conditions); + return result_nodes; } /* - * get_plan_nodes - determine the nodes to execute the plan on + * get_plan_nodes_command - determine the nodes to execute the plan on * * return NULL if it is not safe to be done in a single step. */ static Exec_Nodes * -get_plan_nodes_command(Query_Plan *query_plan, Query *query) +get_plan_nodes_command(Query *query) { Exec_Nodes *exec_nodes = NULL; switch (query->commandType) { case CMD_SELECT: - exec_nodes = get_plan_nodes(query_plan, query, true); + exec_nodes = get_plan_nodes(query, true); break; case CMD_INSERT: @@ -1139,7 +1297,7 @@ get_plan_nodes_command(Query_Plan *query_plan, Query *query) case CMD_UPDATE: case CMD_DELETE: /* treat as a select */ - exec_nodes = get_plan_nodes(query_plan, query, false); + exec_nodes = get_plan_nodes(query, false); break; default: @@ -1182,7 +1340,6 @@ get_plan_combine_type(Query *query, char baselocatortype) /* * Get list of simple aggregates used. 
- * For now we only allow MAX in the first column, and return a list of one. */ static List * get_simple_aggregates(Query * query) @@ -1439,11 +1596,14 @@ GetQueryPlan(Node *parsetree, const char *sql_statement, List *querytree_list) } query_step->exec_nodes = - get_plan_nodes_command(query_plan, query); + get_plan_nodes_command(query); if (query_step->exec_nodes) query_step->combine_type = get_plan_combine_type( query, query_step->exec_nodes->baselocatortype); - query_step->simple_aggregates = get_simple_aggregates(query); + /* Only set up if running on more than one node */ + if (query_step->exec_nodes && query_step->exec_nodes->nodelist && + list_length(query_step->exec_nodes->nodelist) > 1) + query_step->simple_aggregates = get_simple_aggregates(query); /* * See if it is a SELECT with no relations, like SELECT 1+1 or ----------------------------------------------------------------------- Summary of changes: src/backend/pgxc/plan/planner.c | 584 +++++++++++++++++++++++++-------------- 1 files changed, 372 insertions(+), 212 deletions(-) hooks/post-receive -- Postgres-XC |
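The refactoring in this commit is a conversion of the planner's condition analysis to the standard PostgreSQL walker idiom: the state formerly threaded through separate conditions and rtables parameters is bundled into a single XCWalkerContext, and the return convention is inverted to match expression_tree_walker() (false means keep descending, true means abort the walk because the query is too complicated for single-step execution). A minimal sketch of that idiom follows; MyWalkerContext, its fields, and my_var_walker() are invented for illustration, and only expression_tree_walker() and its return convention come from the PostgreSQL node-walker API.

/*
 * Minimal sketch of the walker-with-context idiom adopted above.
 * The struct and walker names are hypothetical; the nodeFuncs API
 * and its false-to-continue / true-to-abort convention are real.
 */
#include "postgres.h"
#include "nodes/nodeFuncs.h"
#include "nodes/primnodes.h"

typedef struct MyWalkerContext
{
	int		var_count;			/* state accumulated across the tree */
	Index	max_varlevelsup;	/* deepest outer-query reference seen */
} MyWalkerContext;

static bool
my_var_walker(Node *node, MyWalkerContext *context)
{
	if (node == NULL)
		return false;			/* nothing here; keep walking siblings */

	if (IsA(node, Var))
	{
		Var *var = (Var *) node;

		context->var_count++;
		if (var->varlevelsup > context->max_varlevelsup)
			context->max_varlevelsup = var->varlevelsup;
		return false;			/* a Var has no children to visit */
	}

	/* recurse into all children, threading the same context through */
	return expression_tree_walker(node, my_var_walker, (void *) context);
}

Inverting the convention is what lets the diff replace the old free-and-return-NULL bookkeeping with a single tail call: an abort anywhere in a deeply nested expression tree propagates up through expression_tree_walker() itself, and the new top-level entry point (get_plan_nodes) translates the boolean back into a NULL Exec_Nodes result in one place.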
From: Pavan D. <pa...@us...> - 2010-06-24 08:16:32
|
Project "Postgres-XC". The branch, master has been updated via 75127cbf9ff834aabc4e4f39f2628f7a9646a6ea (commit) from c0169fa52ff019450c45dd9e50502e12375f33f2 (commit) - Log ----------------------------------------------------------------- commit 75127cbf9ff834aabc4e4f39f2628f7a9646a6ea Author: Pavan Deolasee <pav...@gm...> Date: Thu Jun 24 13:45:29 2010 +0530 Add a missing include file from the previous commit diff --git a/src/include/commands/sequence.h b/src/include/commands/sequence.h index b5bb7d9..f54f74f 100644 --- a/src/include/commands/sequence.h +++ b/src/include/commands/sequence.h @@ -18,6 +18,9 @@ #include "access/xlog.h" #include "fmgr.h" +#ifdef PGXC +#include "utils/relcache.h" +#endif /* * On a machine with no 64-bit-int C datatype, sizeof(int64) will not be 8, ----------------------------------------------------------------------- Summary of changes: src/include/commands/sequence.h | 3 +++ 1 files changed, 3 insertions(+), 0 deletions(-) hooks/post-receive -- Postgres-XC |