summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael P2010-04-01 01:09:52 +0000
committerPavan Deolasee2011-05-19 16:38:44 +0000
commit9b1cd1ef2e746b9d68085ecd37eabaa38e2a82f1 (patch)
treef220dc274f1d69eb685e822b9079e829525f5d4a
parent4d53a2f9699547bdc12831d2860c9d44c465e805 (diff)
Postgres-XC version 0.9
Application of patch PGXC-PG_REL8_4_3.patch.gz on PostgreSQL version 8.4.3
-rw-r--r--contrib/Makefile2
-rw-r--r--src/Makefile4
-rw-r--r--src/backend/Makefile22
-rw-r--r--src/backend/access/transam/Makefile5
-rw-r--r--src/backend/access/transam/clog.c32
-rw-r--r--src/backend/access/transam/gtm.c226
-rw-r--r--src/backend/access/transam/subtrans.c25
-rw-r--r--src/backend/access/transam/twophase.c5
-rw-r--r--src/backend/access/transam/varsup.c143
-rw-r--r--src/backend/access/transam/xact.c128
-rw-r--r--src/backend/bootstrap/bootstrap.c17
-rw-r--r--src/backend/catalog/Makefile5
-rw-r--r--src/backend/catalog/dependency.c7
-rw-r--r--src/backend/catalog/heap.c141
-rw-r--r--src/backend/catalog/pgxc_class.c105
-rw-r--r--src/backend/commands/indexcmds.c29
-rw-r--r--src/backend/commands/sequence.c61
-rw-r--r--src/backend/commands/tablecmds.c27
-rw-r--r--src/backend/commands/vacuum.c33
-rw-r--r--src/backend/nodes/copyfuncs.c23
-rw-r--r--src/backend/nodes/equalfuncs.c4
-rw-r--r--src/backend/nodes/readfuncs.c17
-rw-r--r--src/backend/parser/gram.y152
-rw-r--r--src/backend/parser/parse_utilcmd.c225
-rw-r--r--src/backend/pgxc/Makefile16
-rw-r--r--src/backend/pgxc/locator/Makefile20
-rw-r--r--src/backend/pgxc/locator/locator.c607
-rw-r--r--src/backend/pgxc/plan/Makefile20
-rw-r--r--src/backend/pgxc/plan/planner.c1290
-rw-r--r--src/backend/pgxc/pool/Makefile19
-rw-r--r--src/backend/pgxc/pool/combiner.c375
-rw-r--r--src/backend/pgxc/pool/datanode.c1701
-rw-r--r--src/backend/pgxc/pool/poolcomm.c614
-rw-r--r--src/backend/pgxc/pool/poolmgr.c1403
-rw-r--r--src/backend/postmaster/postmaster.c163
-rw-r--r--src/backend/storage/ipc/procarray.c330
-rw-r--r--src/backend/tcop/postgres.c344
-rw-r--r--src/backend/tcop/utility.c4
-rw-r--r--src/backend/utils/cache/relcache.c11
-rw-r--r--src/backend/utils/cache/syscache.c18
-rw-r--r--src/backend/utils/init/miscinit.c1
-rw-r--r--src/backend/utils/misc/guc.c169
-rw-r--r--src/backend/utils/misc/postgresql.conf.sample54
-rw-r--r--src/bin/initdb/initdb.c28
-rw-r--r--src/bin/pg_ctl/pg_ctl.c66
-rw-r--r--src/gtm/Makefile43
-rw-r--r--src/gtm/Makefile.global116
-rw-r--r--src/gtm/Makefile.port16
-rw-r--r--src/gtm/Makefile.shlib556
-rw-r--r--src/gtm/README61
-rw-r--r--src/gtm/client/Makefile26
-rw-r--r--src/gtm/client/fe-connect.c1287
-rw-r--r--src/gtm/client/fe-misc.c1035
-rw-r--r--src/gtm/client/fe-protocol.c598
-rw-r--r--src/gtm/client/gtm_client.c515
-rw-r--r--src/gtm/client/ip.c324
-rw-r--r--src/gtm/client/pqexpbuffer.c373
-rw-r--r--src/gtm/client/strlcpy.c72
-rw-r--r--src/gtm/client/test/Makefile31
-rw-r--r--src/gtm/client/test/test_proxy.sh196
-rw-r--r--src/gtm/client/test/test_seq.c113
-rw-r--r--src/gtm/client/test/test_snap.c88
-rw-r--r--src/gtm/client/test/test_snapperf.c67
-rw-r--r--src/gtm/client/test/test_txn.c72
-rw-r--r--src/gtm/client/test/test_txnperf.c286
-rw-r--r--src/gtm/common/Makefile25
-rw-r--r--src/gtm/common/aset.c1261
-rw-r--r--src/gtm/common/assert.c54
-rw-r--r--src/gtm/common/elog.c1117
-rw-r--r--src/gtm/common/gtm_list.c863
-rw-r--r--src/gtm/common/gtm_lock.c206
-rw-r--r--src/gtm/common/mcxt.c763
-rw-r--r--src/gtm/common/stringinfo.c280
-rw-r--r--src/gtm/gtm_ctl/Makefile22
-rw-r--r--src/gtm/gtm_ctl/gtm_ctl.c918
-rw-r--r--src/gtm/libpq/Makefile22
-rw-r--r--src/gtm/libpq/ip.c324
-rw-r--r--src/gtm/libpq/pqcomm.c1130
-rw-r--r--src/gtm/libpq/pqformat.c658
-rw-r--r--src/gtm/libpq/pqsignal.c181
-rw-r--r--src/gtm/libpq/strlcpy.c72
-rw-r--r--src/gtm/main/Makefile22
-rw-r--r--src/gtm/main/gtm_seq.c867
-rw-r--r--src/gtm/main/gtm_snap.c466
-rw-r--r--src/gtm/main/gtm_stat.c37
-rw-r--r--src/gtm/main/gtm_stats.c23
-rw-r--r--src/gtm/main/gtm_thread.c336
-rw-r--r--src/gtm/main/gtm_txn.c1521
-rw-r--r--src/gtm/main/main.c1370
-rw-r--r--src/gtm/path/Makefile21
-rw-r--r--src/gtm/path/path.c177
-rw-r--r--src/gtm/proxy/Makefile22
-rw-r--r--src/gtm/proxy/proxy_main.c2016
-rw-r--r--src/gtm/proxy/proxy_thread.c451
-rw-r--r--src/include/access/gtm.h33
-rw-r--r--src/include/access/transam.h6
-rw-r--r--src/include/access/xact.h8
-rw-r--r--src/include/bootstrap/bootstrap.h4
-rw-r--r--src/include/catalog/dependency.h4
-rw-r--r--src/include/catalog/heap.h8
-rw-r--r--src/include/catalog/indexing.h6
-rw-r--r--src/include/catalog/pgxc_class.h39
-rw-r--r--src/include/gtm/assert.h72
-rw-r--r--src/include/gtm/elog.h253
-rw-r--r--src/include/gtm/gtm.h140
-rw-r--r--src/include/gtm/gtm_c.h101
-rw-r--r--src/include/gtm/gtm_client.h129
-rw-r--r--src/include/gtm/gtm_conn.h38
-rw-r--r--src/include/gtm/gtm_ext.h31
-rw-r--r--src/include/gtm/gtm_ip.h50
-rw-r--r--src/include/gtm/gtm_list.h280
-rw-r--r--src/include/gtm/gtm_lock.h59
-rw-r--r--src/include/gtm/gtm_msg.h88
-rw-r--r--src/include/gtm/gtm_proxy.h221
-rw-r--r--src/include/gtm/gtm_seq.h75
-rw-r--r--src/include/gtm/gtm_txn.h235
-rw-r--r--src/include/gtm/ip.h50
-rw-r--r--src/include/gtm/libpq-be.h86
-rw-r--r--src/include/gtm/libpq-fe.h138
-rw-r--r--src/include/gtm/libpq-int.h129
-rw-r--r--src/include/gtm/libpq.h47
-rw-r--r--src/include/gtm/memnodes.h79
-rw-r--r--src/include/gtm/memutils.h123
-rw-r--r--src/include/gtm/palloc.h90
-rw-r--r--src/include/gtm/path.h16
-rw-r--r--src/include/gtm/pqcomm.h57
-rw-r--r--src/include/gtm/pqexpbuffer.h181
-rw-r--r--src/include/gtm/pqformat.h48
-rw-r--r--src/include/gtm/pqsignal.h49
-rw-r--r--src/include/gtm/stringinfo.h149
-rw-r--r--src/include/nodes/nodes.h5
-rw-r--r--src/include/nodes/parsenodes.h17
-rw-r--r--src/include/nodes/primnodes.h27
-rw-r--r--src/include/parser/kwlist.h19
-rw-r--r--src/include/parser/parse_utilcmd.h4
-rw-r--r--src/include/pgxc/combiner.h63
-rw-r--r--src/include/pgxc/datanode.h76
-rw-r--r--src/include/pgxc/locator.h66
-rw-r--r--src/include/pgxc/pgxc.h23
-rw-r--r--src/include/pgxc/planner.h86
-rw-r--r--src/include/pgxc/poolcomm.h49
-rw-r--r--src/include/pgxc/poolmgr.h130
-rw-r--r--src/include/postgres.h4
-rw-r--r--src/include/postmaster/autovacuum.h5
-rw-r--r--src/include/storage/proc.h3
-rw-r--r--src/include/storage/procarray.h5
-rw-r--r--src/include/utils/guc_tables.h4
-rw-r--r--src/include/utils/rel.h7
-rw-r--r--src/include/utils/snapshot.h5
-rw-r--r--src/include/utils/syscache.h4
150 files changed, 33460 insertions, 55 deletions
diff --git a/contrib/Makefile b/contrib/Makefile
index e840c8ce6a..f3777962c5 100644
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -39,7 +39,7 @@ WANTED_DIRS = \
tablefunc \
test_parser \
tsearch2 \
- vacuumlo
+ vacuumlo
ifeq ($(with_openssl),yes)
WANTED_DIRS += sslinfo
diff --git a/src/Makefile b/src/Makefile
index 7b00776c4b..02ba3b3926 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -16,6 +16,8 @@ include Makefile.global
all install installdirs uninstall distprep:
$(MAKE) -C port $@
$(MAKE) -C timezone $@
+	# GTM should be built before backend because of dependency
+ $(MAKE) -C gtm $@
$(MAKE) -C backend $@
$(MAKE) -C backend/utils/mb/conversion_procs $@
$(MAKE) -C backend/snowball $@
@@ -47,6 +49,7 @@ uninstall-local:
clean:
$(MAKE) -C port $@
$(MAKE) -C timezone $@
+ $(MAKE) -C gtm $@
$(MAKE) -C backend $@
$(MAKE) -C backend/snowball $@
$(MAKE) -C include $@
@@ -61,6 +64,7 @@ clean:
distclean maintainer-clean:
$(MAKE) -C port $@
$(MAKE) -C timezone $@
+ $(MAKE) -C gtm $@
$(MAKE) -C backend $@
$(MAKE) -C backend/snowball $@
$(MAKE) -C include $@
diff --git a/src/backend/Makefile b/src/backend/Makefile
index 86526d5f1a..4ae230dbd5 100644
--- a/src/backend/Makefile
+++ b/src/backend/Makefile
@@ -15,8 +15,8 @@ top_builddir = ../..
include $(top_builddir)/src/Makefile.global
SUBDIRS = access bootstrap catalog parser commands executor foreign lib libpq \
- main nodes optimizer port postmaster regex rewrite \
- storage tcop tsearch utils $(top_builddir)/src/timezone
+ pgxc main nodes optimizer port postmaster regex rewrite \
+ storage tcop tsearch utils $(top_builddir)/src/timezone $(top_builddir)/src/interfaces/libpq
include $(srcdir)/common.mk
@@ -26,7 +26,19 @@ LOCALOBJS += utils/probes.o
endif
endif
-OBJS = $(SUBDIROBJS) $(LOCALOBJS) $(top_builddir)/src/port/libpgport_srv.a
+OBJS = $(SUBDIROBJS) $(LOCALOBJS) \
+ $(top_builddir)/src/interfaces/libpq/fe-connect.o \
+ $(top_builddir)/src/interfaces/libpq/fe-secure.o \
+ $(top_builddir)/src/interfaces/libpq/fe-misc.o \
+ $(top_builddir)/src/interfaces/libpq/fe-protocol3.o \
+ $(top_builddir)/src/interfaces/libpq/fe-protocol2.o \
+ $(top_builddir)/src/interfaces/libpq/fe-exec.o \
+ $(top_builddir)/src/interfaces/libpq/fe-auth.o \
+ $(top_builddir)/src/interfaces/libpq/pqexpbuffer.o \
+ $(top_builddir)/src/port/libpgport_srv.a \
+ $(top_builddir)/src/gtm/client/libgtmclient.a \
+ $(top_builddir)/src/gtm/common/libgtm.a \
+ $(top_builddir)/src/gtm/libpq/libpqcomm.a
# We put libpgport into OBJS, so remove it from LIBS; also add libldap
LIBS := $(filter-out -lpgport, $(LIBS)) $(LDAP_LIBS_BE)
@@ -34,6 +46,8 @@ LIBS := $(filter-out -lpgport, $(LIBS)) $(LDAP_LIBS_BE)
# The backend doesn't need everything that's in LIBS, however
LIBS := $(filter-out -lz -lreadline -ledit -ltermcap -lncurses -lcurses, $(LIBS))
+# LIBS := $(LIBS) -lpqcomm
+# LDFLAGS += -L$(top_builddir)/src/gtm/libpg
##########################################################################
all: submake-libpgport postgres $(POSTGRES_IMP)
@@ -43,7 +57,7 @@ ifneq ($(PORTNAME), win32)
ifneq ($(PORTNAME), aix)
postgres: $(OBJS)
- $(CC) $(CFLAGS) $(LDFLAGS) $(export_dynamic) $(call expand_subsys,$^) $(LIBS) -o $@
+ $(CC) $(CFLAGS) $(LDFLAGS) -L$(top_builddir)/src/gtm/libpg $(export_dynamic) $(call expand_subsys,$^) $(LIBS) -o $@
endif
endif
diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile
index 38cfe1a277..fe34e4eaaa 100644
--- a/src/backend/access/transam/Makefile
+++ b/src/backend/access/transam/Makefile
@@ -12,9 +12,12 @@ subdir = src/backend/access/transam
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
-OBJS = clog.o transam.o varsup.o xact.o xlog.o xlogutils.o rmgr.o slru.o subtrans.o multixact.o twophase.o twophase_rmgr.o
+OBJS = clog.o transam.o varsup.o xact.o xlog.o xlogutils.o rmgr.o slru.o subtrans.o multixact.o twophase.o twophase_rmgr.o gtm.o
include $(top_srcdir)/src/backend/common.mk
# ensure that version checks in xlog.c get recompiled when catversion.h changes
xlog.o: xlog.c $(top_srcdir)/src/include/catalog/catversion.h
+
+libpg-fe.h:
+ $(LN_S) $(top_builddir)/contrib/gtm/client/libpg-fe.h $(top_srcdir)/src/include/
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 8544725abb..8dc23f7039 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -25,6 +25,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.53 2009/06/11 14:48:54 momjian Exp $
*
@@ -67,6 +68,11 @@
#define GetLSNIndex(slotno, xid) ((slotno) * CLOG_LSNS_PER_PAGE + \
((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP)
+#ifdef PGXC
+/*
+ * Page-distance threshold used by ExtendCLOG() to tell "newestXact lives on
+ * an already-existing page" apart from XID wraparound: about one billion
+ * (2 to the power 30) XIDs, expressed in CLOG pages.
+ *
+ * The power of two must be written as a shift; in C, '^' is bitwise XOR.
+ */
+#define CLOG_WRAP_CHECK_DELTA ((1 << 30) / CLOG_XACTS_PER_PAGE)
+#endif
+
/*
* Link to shared-memory data structures for CLOG control
@@ -150,6 +156,11 @@ TransactionIdSetTreeStatus(TransactionId xid, int nsubxids,
Assert(status == TRANSACTION_STATUS_COMMITTED ||
status == TRANSACTION_STATUS_ABORTED);
+ if (status == TRANSACTION_STATUS_COMMITTED)
+ elog(DEBUG1, "Record transaction commit %u", xid);
+ else
+ elog(DEBUG1, "Record transaction abort %u", xid);
+
/*
* See how many subxids, if any, are on the same page as the parent, if
* any.
@@ -565,11 +576,31 @@ ExtendCLOG(TransactionId newestXact)
* No work except at first XID of a page. But beware: just after
* wraparound, the first XID of page zero is FirstNormalTransactionId.
*/
+#ifdef PGXC /* PGXC_COORD || PGXC_DATANODE */
+ /*
+ * In PGXC, it may be that a node is not involved in a transaction,
+ * and therefore will be skipped, so we need to detect this by using
+ * the latest_page_number instead of the pg index.
+ *
+ * Also, there is a special case of when transactions wrap-around that
+ * we need to detect.
+ */
+ pageno = TransactionIdToPage(newestXact);
+
+ /*
+ * The first condition makes sure we did not wrap around
+ * The second checks if we are still using the same page
+ */
+ if (ClogCtl->shared->latest_page_number - pageno <= CLOG_WRAP_CHECK_DELTA
+ && pageno <= ClogCtl->shared->latest_page_number)
+ return;
+#else
if (TransactionIdToPgIndex(newestXact) != 0 &&
!TransactionIdEquals(newestXact, FirstNormalTransactionId))
return;
pageno = TransactionIdToPage(newestXact);
+#endif
LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
@@ -579,7 +610,6 @@ ExtendCLOG(TransactionId newestXact)
LWLockRelease(CLogControlLock);
}
-
/*
* Remove all CLOG segments before the one holding the passed transaction ID
*
diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c
new file mode 100644
index 0000000000..2ecc96a4ac
--- /dev/null
+++ b/src/backend/access/transam/gtm.c
@@ -0,0 +1,226 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm.c
+ *
+ * Module interfacing with GTM
+ *
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtm/libpq-fe.h"
+#include "gtm/gtm_client.h"
+#include "access/gtm.h"
+#include "access/transam.h"
+#include "utils/elog.h"
+
+/* Configuration variables */
+char *GtmHost = "localhost";
+int GtmPort = 6666;
+int GtmCoordinatorId = 1;
+
+extern bool FirstSnapshotSet;
+
+static GTM_Conn *conn;
+
+#define CheckConnection() \
+ if (GTMPQstatus(conn) != CONNECTION_OK) InitGTM()
+
+
+/*
+ * Report whether a GTM connection handle is currently held.
+ *
+ * Only tests the handle for NULL; it does not probe the connection status.
+ */
+bool
+IsGTMConnected()
+{
+	if (conn == NULL)
+		return false;
+
+	return true;
+}
+
+/*
+ * Open a connection to GTM using the GtmHost, GtmPort and GtmCoordinatorId
+ * settings.
+ *
+ * On any failure a WARNING is reported and conn is left NULL, so callers
+ * must test conn before using it.
+ */
+void
+InitGTM()
+{
+	char		conn_str[256];
+	int			len;
+
+	/* GtmHost is a configuration string of arbitrary length: bound the write */
+	len = snprintf(conn_str, sizeof(conn_str),
+				   "host=%s port=%d coordinator_id=%d",
+				   GtmHost, GtmPort, GtmCoordinatorId);
+	if (len < 0 || (size_t) len >= sizeof(conn_str))
+	{
+		ereport(WARNING,
+				(errcode(ERRCODE_CONNECTION_EXCEPTION),
+				 errmsg("GTM connection string is too long")));
+
+		/* make sure no stale handle is mistaken for a live connection */
+		CloseGTM();
+		return;
+	}
+
+	conn = PQconnectGTM(conn_str);
+	if (GTMPQstatus(conn) != CONNECTION_OK)
+	{
+		/* preserve errno across ereport(), as the original code does */
+		int			save_errno = errno;
+
+		ereport(WARNING,
+				(errcode(ERRCODE_CONNECTION_EXCEPTION),
+				 errmsg("can not connect to GTM: %m")));
+
+		errno = save_errno;
+
+		CloseGTM();
+	}
+}
+
+/*
+ * Release the current GTM connection handle and forget it.
+ */
+void
+CloseGTM()
+{
+	GTM_Conn   *old_conn = conn;
+
+	conn = NULL;
+	GTMPQfinish(old_conn);
+}
+
+/*
+ * Ask GTM to begin a transaction and return its global transaction id.
+ *
+ * If the first attempt does not yield a valid id, the GTM connection is
+ * reset and the request is retried once. The result may still be invalid.
+ */
+GlobalTransactionId
+BeginTranGTM()
+{
+	GlobalTransactionId gxid = InvalidGlobalTransactionId;
+
+	CheckConnection();
+	/* TODO Isolation level */
+	if (conn != NULL)
+		gxid = begin_transaction(conn, GTM_ISOLATION_RC);
+
+	if (TransactionIdIsValid(gxid))
+		return gxid;
+
+	/*
+	 * Something went wrong (timeout): reset the GTM connection and retry.
+	 * This is safe at the beginning of a transaction.
+	 */
+	CloseGTM();
+	InitGTM();
+	if (conn != NULL)
+		gxid = begin_transaction(conn, GTM_ISOLATION_RC);
+
+	return gxid;
+}
+
+/*
+ * Ask GTM to begin an autovacuum transaction and return its global
+ * transaction id.
+ *
+ * If the first attempt does not yield a valid id, the GTM connection is
+ * reset and the request is retried once. The result may still be invalid.
+ */
+GlobalTransactionId
+BeginTranAutovacuumGTM()
+{
+	GlobalTransactionId gxid = InvalidGlobalTransactionId;
+
+	CheckConnection();
+	/* TODO Isolation level */
+	if (conn != NULL)
+		gxid = begin_transaction_autovacuum(conn, GTM_ISOLATION_RC);
+
+	if (TransactionIdIsValid(gxid))
+		return gxid;
+
+	/*
+	 * Something went wrong (timeout): reset the GTM connection and retry.
+	 * This is safe at the beginning of a transaction.
+	 */
+	CloseGTM();
+	InitGTM();
+	if (conn != NULL)
+		gxid = begin_transaction_autovacuum(conn, GTM_ISOLATION_RC);
+
+	return gxid;
+}
+
+/*
+ * Commit the given global transaction on GTM.
+ *
+ * Returns 0 without contacting GTM when gxid is invalid. Otherwise returns
+ * the result of commit_transaction(), or -1 when no GTM connection could be
+ * established. On a negative result the GTM connection is reset.
+ */
+int
+CommitTranGTM(GlobalTransactionId gxid)
+{
+	int			ret = -1;
+
+	if (!GlobalTransactionIdIsValid(gxid))
+		return 0;
+
+	CheckConnection();
+
+	/* InitGTM() leaves conn NULL when GTM cannot be reached */
+	if (conn)
+		ret = commit_transaction(conn, gxid);
+
+	/*
+	 * If something went wrong (timeout), try and reset GTM connection.
+	 * We will close the transaction locally anyway, and closing GTM will
+	 * force it to be closed on GTM.
+	 */
+	if (ret < 0)
+	{
+		CloseGTM();
+		InitGTM();
+	}
+	return ret;
+}
+
+/*
+ * Abort the given global transaction on GTM.
+ *
+ * Returns 0 without contacting GTM when gxid is invalid. Otherwise returns
+ * the result of abort_transaction(), or -1 when no GTM connection could be
+ * established. On a negative result the GTM connection is reset.
+ */
+int
+RollbackTranGTM(GlobalTransactionId gxid)
+{
+	int			ret = -1;
+
+	if (!GlobalTransactionIdIsValid(gxid))
+		return 0;
+
+	CheckConnection();
+
+	/* InitGTM() leaves conn NULL when GTM cannot be reached */
+	if (conn)
+		ret = abort_transaction(conn, gxid);
+
+	/*
+	 * If something went wrong (timeout), try and reset GTM connection.
+	 * We will abort the transaction locally anyway, and closing GTM will
+	 * force it to end on GTM.
+	 */
+	if (ret < 0)
+	{
+		CloseGTM();
+		InitGTM();
+	}
+	return ret;
+}
+
+/*
+ * Fetch a snapshot from GTM for the given global transaction.
+ *
+ * Returns NULL when no snapshot could be obtained; in that case the GTM
+ * connection is reset before returning.
+ */
+GTM_Snapshot
+GetSnapshotGTM(GlobalTransactionId gxid, bool canbe_grouped)
+{
+	GTM_Snapshot snap = NULL;
+
+	CheckConnection();
+	if (conn != NULL)
+		snap = get_snapshot(conn, gxid, canbe_grouped);
+
+	if (snap != NULL)
+		return snap;
+
+	/* no snapshot: drop the connection and open a fresh one */
+	CloseGTM();
+	InitGTM();
+
+	return snap;
+}
+
+
+/*
+ * Create a sequence on the GTM, keyed by seqname.
+ *
+ * Returns the result of open_sequence(), or 0 when no GTM connection is
+ * available.
+ *
+ * NOTE(review): DropSequenceGTM() returns -1 in the no-connection case;
+ * confirm that reporting 0 here is intended.
+ */
+int
+CreateSequenceGTM(char *seqname, GTM_Sequence increment, GTM_Sequence minval,
+				  GTM_Sequence maxval, GTM_Sequence startval, bool cycle)
+{
+	GTM_SequenceKeyData key;
+
+	CheckConnection();
+	key.gsk_keylen = strlen(seqname);
+	key.gsk_key = seqname;
+
+	if (conn == NULL)
+		return 0;
+
+	return open_sequence(conn, &key, increment, minval, maxval, startval, cycle);
+}
+
+/*
+ * Get the next value of the sequence keyed by seqname from GTM.
+ *
+ * Returns the value from get_next(), or -1 when no GTM connection is
+ * available. On a negative result the GTM connection is reset.
+ */
+GTM_Sequence
+GetNextValGTM(char *seqname)
+{
+	GTM_Sequence value = -1;
+	GTM_SequenceKeyData key;
+
+	CheckConnection();
+	key.gsk_keylen = strlen(seqname);
+	key.gsk_key = seqname;
+
+	if (conn != NULL)
+		value = get_next(conn, &key);
+
+	if (value >= 0)
+		return value;
+
+	/* failure: drop the connection and open a fresh one */
+	CloseGTM();
+	InitGTM();
+
+	return value;
+}
+
+/*
+ * Drop the sequence keyed by seqname on GTM.
+ *
+ * Returns the result of close_sequence(), or -1 when no GTM connection is
+ * available.
+ */
+int
+DropSequenceGTM(char *seqname)
+{
+	GTM_SequenceKeyData key;
+
+	CheckConnection();
+	key.gsk_keylen = strlen(seqname);
+	key.gsk_key = seqname;
+
+	if (conn == NULL)
+		return -1;
+
+	return close_sequence(conn, &key);
+}
+
+
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index 9c74e995db..2695085be3 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -21,6 +21,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* $PostgreSQL: pgsql/src/backend/access/transam/subtrans.c,v 1.24 2009/01/01 17:23:36 momjian Exp $
*
@@ -34,6 +35,10 @@
#include "pg_trace.h"
#include "utils/snapmgr.h"
+#ifdef PGXC
+/*
+ * Page-distance threshold used by ExtendSUBTRANS() to tell "newestXact lives
+ * on an already-existing page" apart from XID wraparound: about one billion
+ * (2 to the power 30) XIDs, expressed in SUBTRANS pages.
+ *
+ * The power of two must be written as a shift; in C, '^' is bitwise XOR.
+ */
+#define SUBTRANS_WRAP_CHECK_DELTA ((1 << 30) / SUBTRANS_XACTS_PER_PAGE)
+#endif
/*
* Defines for SubTrans page sizes. A page is the same BLCKSZ as is used
@@ -307,11 +312,31 @@ ExtendSUBTRANS(TransactionId newestXact)
* No work except at first XID of a page. But beware: just after
* wraparound, the first XID of page zero is FirstNormalTransactionId.
*/
+#ifdef PGXC /* PGXC_COORD || PGXC_DATANODE */
+ /*
+ * In PGXC, it may be that a node is not involved in a transaction,
+ * and therefore will be skipped, so we need to detect this by using
+ * the latest_page_number instead of the pg index.
+ *
+ * Also, there is a special case of when transactions wrap-around that
+ * we need to detect.
+ */
+ pageno = TransactionIdToPage(newestXact);
+
+ /*
+ * The first condition makes sure we did not wrap around
+ * The second checks if we are still using the same page
+ */
+ if (SubTransCtl->shared->latest_page_number - pageno <= SUBTRANS_WRAP_CHECK_DELTA
+ && pageno <= SubTransCtl->shared->latest_page_number)
+ return;
+#else
if (TransactionIdToEntry(newestXact) != 0 &&
!TransactionIdEquals(newestXact, FirstNormalTransactionId))
return;
pageno = TransactionIdToPage(newestXact);
+#endif
LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 6de9c73f6e..4b9071f947 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -5,6 +5,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.54 2009/06/25 19:05:52 heikki Exp $
@@ -68,7 +69,11 @@
#define TWOPHASE_DIR "pg_twophase"
/* GUC variable, can't be changed after startup */
+#ifdef PGXC
+int max_prepared_xacts = 10; /* We require 2PC */
+#else
int max_prepared_xacts = 0;
+#endif
/*
* This struct describes one global transaction that is in prepared state
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
index 029b2f2deb..4de1080544 100644
--- a/src/backend/access/transam/varsup.c
+++ b/src/backend/access/transam/varsup.c
@@ -4,6 +4,7 @@
* postgres OID & XID variables support routines
*
* Copyright (c) 2000-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/transam/varsup.c,v 1.84 2009/04/23 00:23:45 tgl Exp $
@@ -21,6 +22,10 @@
#include "storage/pmsignal.h"
#include "storage/proc.h"
#include "utils/builtins.h"
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+#include "access/gtm.h"
+#endif
/* Number of OIDs to prefetch (preallocate) per XLOG write */
@@ -29,6 +34,40 @@
/* pointer to "variable cache" in shared memory (set up by shmem.c) */
VariableCache ShmemVariableCache = NULL;
+#ifdef PGXC	/* PGXC_DATANODE */
+/* XID stored by SetNextTransactionId(); InvalidTransactionId when unset */
+static TransactionId next_xid = InvalidTransactionId;
+/* flag stored by SetForceXidFromGTM() and read by GetForceXidFromGTM() */
+static bool force_get_xid_from_gtm = false;
+
+/*
+ * Set next transaction id to use.
+ *
+ * Passing InvalidTransactionId clears the stored value.
+ */
+void
+SetNextTransactionId(TransactionId xid)
+{
+	/* TransactionId is unsigned, so log it with %u rather than %d */
+	elog (DEBUG1, "[re]setting xid = %u, old_value = %u", xid, next_xid);
+	next_xid = xid;
+}
+
+/*
+ * Allow force of getting XID from GTM.
+ * Useful for explicit VACUUM (autovacuum already handled).
+ */
+void
+SetForceXidFromGTM(bool value)
+{
+	force_get_xid_from_gtm = value;
+}
+
+/*
+ * See if we should force using GTM.
+ * Useful for explicit VACUUM (autovacuum already handled).
+ */
+bool
+GetForceXidFromGTM(void)
+{
+	return force_get_xid_from_gtm;
+}
+#endif /* PGXC */
/*
* Allocate the next XID for my new transaction or subtransaction.
@@ -39,6 +78,9 @@ TransactionId
GetNewTransactionId(bool isSubXact)
{
TransactionId xid;
+#ifdef PGXC
+ bool increment_xid = true;
+#endif
/*
* During bootstrap initialization, we return the special bootstrap
@@ -51,9 +93,100 @@ GetNewTransactionId(bool isSubXact)
return BootstrapTransactionId;
}
+#ifdef PGXC
+ if (IS_PGXC_COORDINATOR)
+ {
+ /* Get XID from GTM before acquiring the lock.
+ * The rest of the code will handle it if after obtaining XIDs,
+ * the lock is acquired in a different order.
+ * This will help with GTM connection issues- we will not
+ * block all other processes.
+ */
+ xid = (TransactionId) BeginTranGTM();
+ }
+#endif
+
LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
+#ifdef PGXC
+ if (IS_PGXC_COORDINATOR)
+ {
+ if (TransactionIdIsValid(xid))
+ {
+ if (!TransactionIdFollowsOrEquals(xid, ShmemVariableCache->nextXid))
+ {
+ increment_xid = false;
+ ereport(DEBUG1,
+ (errmsg("xid (%d) was less than ShmemVariableCache->nextXid (%d)",
+ xid, ShmemVariableCache->nextXid)));
+ }
+ else
+ ShmemVariableCache->nextXid = xid;
+ }
+ else
+ {
+ ereport(WARNING,
+ (errmsg("Xid is invalid.")));
+
+ /* Problem is already reported, so just remove lock and return */
+ LWLockRelease(XidGenLock);
+ return xid;
+ }
+ } else if(IS_PGXC_DATANODE)
+ {
+ if (IsAutoVacuumWorkerProcess())
+ {
+ if (MyProc->vacuumFlags & PROC_IN_VACUUM)
+ {
+ elog (DEBUG1, "Getting XID for autovacuum");
+ /* Try and get gxid directly from GTM.
+ * We use a different function so that GTM knows to
+ * exclude it from other snapshots.
+ */
+ next_xid = (TransactionId) BeginTranAutovacuumGTM();
+ } else {
+ elog (DEBUG1, "Getting XID for autovacuum worker (analyze)");
+ /* try and get gxid directly from GTM */
+ next_xid = (TransactionId) BeginTranGTM();
+ }
+ } else if (GetForceXidFromGTM())
+ {
+ elog (DEBUG1, "Force get XID from GTM");
+ /* try and get gxid directly from GTM */
+ next_xid = (TransactionId) BeginTranGTM();
+ }
+
+ if (TransactionIdIsValid(next_xid))
+ {
+ xid = next_xid;
+ elog(DEBUG1, "TransactionId = %d", next_xid);
+ next_xid = InvalidTransactionId; /* reset */
+ if (!TransactionIdFollowsOrEquals(xid, ShmemVariableCache->nextXid))
+ {
+ /* This should be ok, due to concurrency from multiple coords
+ * passing down the xids.
+ * We later do not want to bother incrementing the value
+ * in shared memory though.
+ */
+ increment_xid = false;
+ elog(DEBUG1, "xid (%d) does not follow ShmemVariableCache->nextXid (%d)",
+ xid, ShmemVariableCache->nextXid);
+ } else
+ ShmemVariableCache->nextXid = xid;
+ }
+ else
+ {
+ /* Fallback to default */
+ elog(LOG, "Falling back to local Xid. Was = %d, now is = %d",
+ next_xid, ShmemVariableCache->nextXid);
+ xid = ShmemVariableCache->nextXid;
+
+ }
+ }
+#else
xid = ShmemVariableCache->nextXid;
+#endif /* PGXC */
+
/*----------
* Check to see if it's safe to assign another XID. This protects against
@@ -98,7 +231,6 @@ GetNewTransactionId(bool isSubXact)
"You might also need to commit or roll back old prepared transactions.",
NameStr(ShmemVariableCache->limit_datname))));
}
-
/*
* If we are allocating the first XID of a new page of the commit log,
* zero out that commit-log page before returning. We must do this while
@@ -117,7 +249,13 @@ GetNewTransactionId(bool isSubXact)
* want the next incoming transaction to try it again. We cannot assign
* more XIDs until there is CLOG space for them.
*/
- TransactionIdAdvance(ShmemVariableCache->nextXid);
+#ifdef PGXC /* defined(PGXC_COORD) || defined(PGXC_DATANODE) */
+ /* We may not be at the max, which is ok. Do not bother to increment.
+ * We get this externally anyway, so it should not be needed in theory...
+ */
+ if (increment_xid)
+#endif
+ TransactionIdAdvance(ShmemVariableCache->nextXid);
/*
* We must store the new XID into the shared ProcArray before releasing
@@ -177,7 +315,6 @@ GetNewTransactionId(bool isSubXact)
}
LWLockRelease(XidGenLock);
-
return xid;
}
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 2b6a222477..9ab3c70430 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -7,6 +7,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
*
* IDENTIFICATION
@@ -20,6 +21,15 @@
#include <time.h>
#include <unistd.h>
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+#include "access/gtm.h"
+/* PGXC_COORD */
+#include "gtm/gtm_c.h"
+#include "pgxc/datanode.h"
+/* PGXC_DATANODE */
+#include "postmaster/autovacuum.h"
+#endif
#include "access/multixact.h"
#include "access/subtrans.h"
#include "access/transam.h"
@@ -51,7 +61,6 @@
#include "utils/snapmgr.h"
#include "pg_trace.h"
-
/*
* User-tweakable parameters
*/
@@ -125,6 +134,9 @@ typedef enum TBlockState
typedef struct TransactionStateData
{
TransactionId transactionId; /* my XID, or Invalid if none */
+#ifdef PGXC /* PGXC_COORD */
+ GlobalTransactionId globalTransactionId; /* my GXID, or Invalid if none */
+#endif
SubTransactionId subTransactionId; /* my subxact ID */
char *name; /* savepoint name, if any */
int savepointLevel; /* savepoint level */
@@ -152,6 +164,9 @@ typedef TransactionStateData *TransactionState;
*/
static TransactionStateData TopTransactionStateData = {
0, /* transaction id */
+#ifdef PGXC
+ 0, /* global transaction id */
+#endif
0, /* subtransaction id */
NULL, /* savepoint name */
0, /* savepoint level */
@@ -274,6 +289,43 @@ static void ShowTransactionStateRec(TransactionState state);
static const char *BlockStateAsString(TBlockState blockState);
static const char *TransStateAsString(TransState state);
+#ifdef PGXC	/* PGXC_COORD */
+static GlobalTransactionId GetGlobalTransactionId(TransactionState s);
+
+/* ----------------------------------------------------------------
+ *						PG-XC Functions
+ * ----------------------------------------------------------------
+ */
+
+/*
+ *	GetCurrentGlobalTransactionId
+ *
+ * Return the GXID of the current transaction state, assigning one first if
+ * none is set yet. Only call this inside a valid xact.
+ */
+GlobalTransactionId
+GetCurrentGlobalTransactionId(void)
+{
+	TransactionState cur = CurrentTransactionState;
+
+	return GetGlobalTransactionId(cur);
+}
+
+/*
+ *	GetGlobalTransactionId
+ *
+ * Return the GXID of the given transaction state. When none is set yet, one
+ * is obtained through GetNewTransactionId() and remembered in the state.
+ */
+static GlobalTransactionId
+GetGlobalTransactionId(TransactionState s)
+{
+	bool		isSubXact;
+
+	if (GlobalTransactionIdIsValid(s->globalTransactionId))
+		return s->globalTransactionId;
+
+	/* a state with a parent is a subtransaction */
+	isSubXact = (s->parent != NULL);
+	s->globalTransactionId = (GlobalTransactionId) GetNewTransactionId(isSubXact);
+
+	return s->globalTransactionId;
+}
+#endif /* PGXC */
+
/* ----------------------------------------------------------------
* transaction state accessors
@@ -364,6 +416,7 @@ GetCurrentTransactionId(void)
return s->transactionId;
}
+
/*
* GetCurrentTransactionIdIfAny
*
@@ -412,6 +465,15 @@ AssignTransactionId(TransactionState s)
* PG_PROC, the subtrans entry is needed to ensure that other backends see
* the Xid as "running". See GetNewTransactionId.
*/
+#ifdef PGXC /* PGXC_COORD */
+ if (IS_PGXC_COORDINATOR)
+ {
+ s->transactionId = (TransactionId) GetGlobalTransactionId(s);
+ elog(DEBUG1, "New transaction id assigned = %d, isSubXact = %s",
+ s->transactionId, isSubXact ? "true" : "false");
+ }
+ else
+#endif
s->transactionId = GetNewTransactionId(isSubXact);
if (isSubXact)
@@ -1458,8 +1520,11 @@ StartTransaction(void)
* start processing
*/
s->state = TRANS_START;
+#ifdef PGXC /* PGXC_COORD */
+ if (IS_PGXC_COORDINATOR)
+ s->globalTransactionId = InvalidGlobalTransactionId; /* until assigned */
+#endif
s->transactionId = InvalidTransactionId; /* until assigned */
-
/*
* Make sure we've reset xact state variables
*/
@@ -1629,7 +1694,24 @@ CommitTransaction(void)
latestXid = RecordTransactionCommit();
TRACE_POSTGRESQL_TRANSACTION_COMMIT(MyProc->lxid);
-
+#ifdef PGXC
+ if (IS_PGXC_COORDINATOR)
+ {
+ /* Make sure this committed on the DataNodes,
+ * if so it will just return
+ */
+ DataNodeCommit(DestNone);
+ CommitTranGTM(s->globalTransactionId);
+ }
+ else if (IS_PGXC_DATANODE)
+ {
+ /* If we are autovacuum, commit on GTM */
+ if ((IsAutoVacuumWorkerProcess() || GetForceXidFromGTM())
+ && IsGTMConnected())
+ CommitTranGTM((GlobalTransactionId) latestXid);
+ }
+#endif
+
/*
* Let others know about no transaction in progress by me. Note that this
* must be done _before_ releasing locks we hold and _after_
@@ -1725,6 +1807,13 @@ CommitTransaction(void)
s->nChildXids = 0;
s->maxChildXids = 0;
+#ifdef PGXC
+ if (IS_PGXC_COORDINATOR)
+ s->globalTransactionId = InvalidGlobalTransactionId;
+ else if (IS_PGXC_DATANODE)
+ SetNextTransactionId(InvalidTransactionId);
+#endif
+
/*
* done with commit processing, set current transaction state back to
* default
@@ -1959,6 +2048,10 @@ PrepareTransaction(void)
s->nChildXids = 0;
s->maxChildXids = 0;
+#ifdef PGXC /* PGXC_DATANODE */
+ if (IS_PGXC_DATANODE)
+ SetNextTransactionId(InvalidTransactionId);
+#endif
/*
* done with 1st phase commit processing, set current transaction state
* back to default
@@ -2045,7 +2138,23 @@ AbortTransaction(void)
latestXid = RecordTransactionAbort(false);
TRACE_POSTGRESQL_TRANSACTION_ABORT(MyProc->lxid);
-
+#ifdef PGXC
+ if (IS_PGXC_COORDINATOR)
+ {
+ /* Make sure this is rolled back on the DataNodes,
+ * if so it will just return
+ */
+ DataNodeRollback(DestNone);
+ RollbackTranGTM(s->globalTransactionId);
+ }
+ else if (IS_PGXC_DATANODE)
+ {
+		/* If we are autovacuum or were forced to get our XID from GTM, roll back on GTM */
+ if ((IsAutoVacuumWorkerProcess() || GetForceXidFromGTM())
+ && IsGTMConnected())
+ RollbackTranGTM((GlobalTransactionId) latestXid);
+ }
+#endif
/*
* Let others know about no transaction in progress by me. Note that this
* must be done _before_ releasing locks we hold and _after_
@@ -2130,6 +2239,13 @@ CleanupTransaction(void)
s->nChildXids = 0;
s->maxChildXids = 0;
+#ifdef PGXC /* PGXC_DATANODE */
+ if (IS_PGXC_COORDINATOR)
+ s->globalTransactionId = InvalidGlobalTransactionId;
+ else if (IS_PGXC_DATANODE)
+ SetNextTransactionId(InvalidTransactionId);
+#endif
+
/*
* done with abort processing, set current transaction state back to
* default
@@ -4004,6 +4120,10 @@ PushTransaction(void)
* We can now stack a minimally valid subtransaction without fear of
* failure.
*/
+#ifdef PGXC /* PGXC_COORD */
+ if (IS_PGXC_COORDINATOR)
+ s->globalTransactionId = InvalidGlobalTransactionId;
+#endif
s->transactionId = InvalidTransactionId; /* until assigned */
s->subTransactionId = currentSubTransactionId;
s->parent = p;
diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index 5a0f852b6f..969d6f566c 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -6,6 +6,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.250 2009/02/18 15:58:41 heikki Exp $
@@ -42,6 +43,10 @@
#include "utils/ps_status.h"
#include "utils/tqual.h"
+#ifdef PGXC
+#include "pgxc/poolmgr.h"
+#endif
+
extern int optind;
extern char *optarg;
@@ -329,6 +334,11 @@ AuxiliaryProcessMain(int argc, char *argv[])
switch (auxType)
{
+#ifdef PGXC /* PGXC_COORD */
+ case PoolerProcess:
+ statmsg = "pooler process";
+ break;
+#endif
case StartupProcess:
statmsg = "startup process";
break;
@@ -402,6 +412,13 @@ AuxiliaryProcessMain(int argc, char *argv[])
switch (auxType)
{
+#ifdef PGXC /* PGXC_COORD */
+ case PoolerProcess:
+ /* don't set signals, pool manager has its own agenda */
+ PoolManagerInit();
+ proc_exit(1); /* should never return */
+#endif
+
case CheckerProcess:
bootstrap_signals();
CheckerModeMain();
diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile
index ed06048894..2693b426b1 100644
--- a/src/backend/catalog/Makefile
+++ b/src/backend/catalog/Makefile
@@ -12,8 +12,8 @@ include $(top_builddir)/src/Makefile.global
OBJS = catalog.o dependency.o heap.o index.o indexing.o namespace.o aclchk.o \
pg_aggregate.o pg_constraint.o pg_conversion.o pg_depend.o pg_enum.o \
- pg_inherits.o pg_largeobject.o pg_namespace.o pg_operator.o pg_proc.o \
- pg_shdepend.o pg_type.o storage.o toasting.o
+ pg_inherits.o pg_largeobject.o pg_namespace.o pg_operator.o pg_proc.o pg_shdepend.o \
+ pg_type.o pgxc_class.o storage.o toasting.o
BKIFILES = postgres.bki postgres.description postgres.shdescription
@@ -37,6 +37,7 @@ POSTGRES_BKI_SRCS = $(addprefix $(top_srcdir)/src/include/catalog/,\
pg_ts_config.h pg_ts_config_map.h pg_ts_dict.h \
pg_ts_parser.h pg_ts_template.h \
pg_foreign_data_wrapper.h pg_foreign_server.h pg_user_mapping.h \
+ pgxc_class.h \
toasting.h indexing.h \
)
diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c
index 8181cae64a..2932bffd1d 100644
--- a/src/backend/catalog/dependency.c
+++ b/src/backend/catalog/dependency.c
@@ -6,6 +6,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/catalog/dependency.c,v 1.89 2009/06/11 14:48:54 momjian Exp $
@@ -50,6 +51,9 @@
#include "catalog/pg_ts_template.h"
#include "catalog/pg_type.h"
#include "catalog/pg_user_mapping.h"
+#ifdef PGXC
+#include "catalog/pgxc_class.h"
+#endif
#include "commands/comment.h"
#include "commands/dbcommands.h"
#include "commands/defrem.h"
@@ -144,6 +148,9 @@ static const Oid object_classes[MAX_OCLASS] = {
AuthIdRelationId, /* OCLASS_ROLE */
DatabaseRelationId, /* OCLASS_DATABASE */
TableSpaceRelationId /* OCLASS_TBLSPACE */
+#ifdef PGXC
+ ,PgxcClassRelationId /* OCLASS_PGXCCLASS */
+#endif
};
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index f4cf829b46..4f14113c3b 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -5,6 +5,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
*
* IDENTIFICATION
@@ -68,6 +69,11 @@
#include "utils/syscache.h"
#include "utils/tqual.h"
+#ifdef PGXC
+#include "catalog/pgxc_class.h"
+#include "pgxc/locator.h"
+#endif
+
static void AddNewRelationTuple(Relation pg_class_desc,
Relation new_rel_desc,
@@ -775,6 +781,141 @@ AddNewRelationTuple(Relation pg_class_desc,
InsertPgClassTuple(pg_class_desc, new_rel_desc, new_rel_oid, reloptions);
}
+#ifdef PGXC
+/* --------------------------------
+ *      AddRelationDistribution
+ *
+ *      Add to pgxc_class table: decide how the new relation is distributed
+ *      and record that decision through PgxcClassCreate().
+ *
+ *      relid        - OID of the relation being created
+ *      distributeby - DISTRIBUTE BY clause from CREATE TABLE, or NULL
+ *      parentOids   - OIDs of the inheritance parents, if any
+ *      descriptor   - tuple descriptor of the new relation
+ *
+ *      With an explicit clause the requested type is used (a hash column
+ *      is validated first).  Without one, a single parent's distribution
+ *      is inherited; with no parent, the first hash-distributable column
+ *      is used, falling back to round robin when there is none.  More
+ *      than one parent is rejected with an error.
+ * --------------------------------
+ */
+void
+AddRelationDistribution (Oid relid,
+                DistributeBy *distributeby,
+                List         *parentOids,
+                TupleDesc    descriptor)
+{
+    char        locatortype = '\0';
+    int         hashalgorithm = 0;
+    int         hashbuckets = 0;
+    AttrNumber  attnum = 0;
+
+    if (!distributeby)
+    {
+        /*
+         * No distribution specified.
+         * See if we are a child table, and get distribution information
+         * from there.
+         */
+        if (list_length(parentOids) > 1)
+        {
+            ereport(ERROR,
+                    (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                     errmsg("Cannot currently distribute a table with more than one parent.")));
+        }
+        else if (list_length(parentOids) == 1)
+        {
+            /*
+             * Use parent's distribution
+             */
+            Oid              parentOid;
+            RelationLocInfo *rel_loc_info;
+
+            parentOid = linitial_oid(parentOids);
+            rel_loc_info = GetRelationLocInfo(parentOid);
+
+            /*
+             * GetRelationLocInfo() is defined elsewhere; guard against it
+             * finding nothing for the parent instead of dereferencing a
+             * NULL pointer below.
+             */
+            if (rel_loc_info == NULL)
+                ereport(ERROR,
+                        (errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+                         errmsg("Invalid parent table distribution type")));
+
+            locatortype = rel_loc_info->locatorType;
+
+            switch (locatortype)
+            {
+                case LOCATOR_TYPE_HASH:
+                    attnum = rel_loc_info->partAttrNum;
+                    break;
+
+                case LOCATOR_TYPE_REPLICATED:
+                case LOCATOR_TYPE_RROBIN:
+                    break;
+
+                default:
+                    ereport(ERROR,
+                            (errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+                             errmsg("Invalid parent table distribution type")));
+                    break;
+            }
+        }
+        else
+        {
+            /*
+             * If no distribution was specified, and we have not chosen
+             * one based on primary key or foreign key, use first column with
+             * a supported data type.
+             */
+            Form_pg_attribute attr;
+            int         i;
+
+            locatortype = LOCATOR_TYPE_HASH;
+
+            for (i = 0; i < descriptor->natts; i++)
+            {
+                attr = descriptor->attrs[i];
+                if (IsHashDistributable(attr->atttypid))
+                {
+                    /* distribute on this column */
+                    attnum = i + 1;
+                    break;
+                }
+            }
+
+            /* If we did not find a usable type, fall back to round robin */
+            if (attnum == 0)
+                locatortype = LOCATOR_TYPE_RROBIN;
+        }
+    }
+    else
+    {
+        /*
+         * User specified distribution type
+         */
+        switch (distributeby->disttype)
+        {
+            case DISTTYPE_HASH:
+                /* User specified hash column, validate */
+                attnum = get_attnum(relid, distributeby->colname);
+
+                /*
+                 * get_attnum() yields InvalidAttrNumber for an unknown
+                 * column and a negative number for a system column.
+                 * Neither may be used to index descriptor->attrs[].
+                 */
+                if (attnum <= 0)
+                {
+                    ereport(ERROR,
+                            (errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+                             errmsg("Column %s does not exist or cannot be used for distribution",
+                                    distributeby->colname)));
+                }
+
+                if (!IsHashDistributable(descriptor->attrs[attnum-1]->atttypid))
+                {
+                    ereport(ERROR,
+                            (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+                             errmsg("Column %s is not a hash distributable data type",
+                                    distributeby->colname)));
+                }
+                locatortype = LOCATOR_TYPE_HASH;
+                break;
+
+            case DISTTYPE_REPLICATION:
+                locatortype = LOCATOR_TYPE_REPLICATED;
+                break;
+
+            case DISTTYPE_ROUNDROBIN:
+                locatortype = LOCATOR_TYPE_RROBIN;
+                break;
+
+            default:
+                ereport(ERROR,
+                        (errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+                         errmsg("Invalid distribution type")));
+                break;
+        }
+    }
+
+    if (locatortype == LOCATOR_TYPE_HASH)
+    {
+        /* PGXCTODO */
+        /* Use these for now until we make allowing different algorithms more flexible */
+        hashalgorithm = 1;
+        hashbuckets = HASH_SIZE;
+    }
+
+    PgxcClassCreate (relid, locatortype, attnum, hashalgorithm, hashbuckets);
+}
+#endif
+
/* --------------------------------
* AddNewRelationType -
diff --git a/src/backend/catalog/pgxc_class.c b/src/backend/catalog/pgxc_class.c
new file mode 100644
index 0000000000..a77f242357
--- /dev/null
+++ b/src/backend/catalog/pgxc_class.c
@@ -0,0 +1,105 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgxc_class.c
+ * routines to support manipulation of the pgxc_class relation
+ *
+ * Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "catalog/dependency.h"
+#include "catalog/indexing.h"
+#include "catalog/namespace.h"
+#include "catalog/pgxc_class.h"
+#include "utils/builtins.h"
+#include "utils/rel.h"
+#include "utils/syscache.h"
+#include "pgxc/locator.h"
+
+/*
+ * PgxcClassCreate
+ *
+ * Insert one row into pgxc_class describing relation pcrelid.
+ *
+ * pcrelid          - OID of the relation; must be valid
+ * pclocatortype    - locator type code for the relation
+ * pcattnum         - distribution column number
+ * pchashalgorithm  - hash algorithm identifier
+ * pchashbuckets    - number of hash buckets
+ *
+ * The last three are stored only when pclocatortype is LOCATOR_TYPE_HASH;
+ * otherwise those columns are left as zero (not NULL).
+ */
+void PgxcClassCreate(Oid pcrelid,
+                     char pclocatortype,
+                     int pcattnum,
+                     int pchashalgorithm,
+                     int pchashbuckets)
+{
+    Relation    pgxcclassrel;
+    HeapTuple   htup;
+    bool        nulls[Natts_pgxc_class];
+    Datum       values[Natts_pgxc_class];
+    int         i;
+
+    /* should not happen; elog(ERROR) does not return */
+    if (!OidIsValid(pcrelid))
+        elog(ERROR,"pgxc class relid invalid.");
+
+    /* Start with every pgxc_class attribute non-null and zero */
+    for (i = 0; i < Natts_pgxc_class; i++)
+    {
+        nulls[i] = false;
+        values[i] = (Datum) 0;
+    }
+
+    values[Anum_pgxc_class_pcrelid - 1] = ObjectIdGetDatum(pcrelid);
+    values[Anum_pgxc_class_pclocatortype - 1] = CharGetDatum(pclocatortype);
+
+    if (pclocatortype == LOCATOR_TYPE_HASH)
+    {
+        /*
+         * NOTE(review): the conversions below follow the C parameter type
+         * (int).  The catalog column widths live in pgxc_class.h, which is
+         * not visible here -- confirm they match.
+         */
+        values[Anum_pgxc_class_pcattnum - 1] = Int32GetDatum(pcattnum);
+        values[Anum_pgxc_class_pchashalgorithm - 1] = Int32GetDatum(pchashalgorithm);
+        values[Anum_pgxc_class_pchashbuckets - 1] = Int32GetDatum(pchashbuckets);
+    }
+
+    /* Open the pgxc_class relation for insertion */
+    pgxcclassrel = heap_open(PgxcClassRelationId, RowExclusiveLock);
+
+    htup = heap_form_tuple(pgxcclassrel->rd_att, values, nulls);
+
+    (void) simple_heap_insert(pgxcclassrel, htup);
+
+    /* Keep the catalog indexes in step with the new row */
+    CatalogUpdateIndexes(pgxcclassrel, htup);
+
+    heap_freetuple(htup);
+
+    heap_close(pgxcclassrel, RowExclusiveLock);
+}
+
+#ifdef PGXC
+/*
+ * RemovePgxcClass
+ *
+ * Delete the pgxc_class row that belongs to relation pcrelid.
+ *
+ * pcrelid - OID of the relation whose row is removed.
+ *
+ * Raises an error if no such row is found in the syscache.
+ */
+void RemovePgxcClass(Oid pcrelid)
+{
+    Relation    pgxcclassrel;
+    HeapTuple   classtup;
+
+    pgxcclassrel = heap_open(PgxcClassRelationId, RowExclusiveLock);
+
+    /* Locate the row through the syscache, keyed by relation OID */
+    classtup = SearchSysCache(PGXCCLASSRELID, ObjectIdGetDatum(pcrelid), 0, 0, 0);
+    if (!HeapTupleIsValid(classtup))    /* should not happen */
+        elog(ERROR, "cache lookup failed for pgxc_class %u", pcrelid);
+
+    /* Remove it, then drop the cache reference and close the relation */
+    simple_heap_delete(pgxcclassrel, &classtup->t_self);
+    ReleaseSysCache(classtup);
+
+    heap_close(pgxcclassrel, RowExclusiveLock);
+}
+#endif /* PGXC */
+
+
diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c
index c6a01f5b75..e0005905ba 100644
--- a/src/backend/commands/indexcmds.c
+++ b/src/backend/commands/indexcmds.c
@@ -5,6 +5,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
*
* IDENTIFICATION
@@ -37,6 +38,10 @@
#include "parser/parse_coerce.h"
#include "parser/parse_func.h"
#include "parser/parsetree.h"
+#ifdef PGXC
+#include "parser/parse_utilcmd.h"
+#include "pgxc/pgxc.h"
+#endif
#include "storage/lmgr.h"
#include "storage/proc.h"
#include "storage/procarray.h"
@@ -404,6 +409,30 @@ DefineIndex(RangeVar *heapRelation,
(void) index_reloptions(amoptions, reloptions, true);
+#ifdef PGXC
+ /* Make sure we can locally enforce the index */
+ if (IS_PGXC_COORDINATOR && (primary || unique))
+ {
+ ListCell *elem;
+ bool isSafe = false;
+
+ foreach(elem, attributeList)
+ {
+ IndexElem *key = (IndexElem *) lfirst(elem);
+
+ if (CheckLocalIndexColumn(rel->rd_locator_info->locatorType,
+ rel->rd_locator_info->partAttrName, key->name))
+ {
+ isSafe = true;
+ break;
+ }
+ }
+ if (!isSafe)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+ errmsg("Unique index of partitioned table must contain the hash distribution column.")));
+ }
+#endif
/*
* Prepare arguments for index_create, primarily an IndexInfo structure.
* Note that ii_Predicate must be in implicit-AND format.
diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c
index e6c75ab014..a187afa8f2 100644
--- a/src/backend/commands/sequence.c
+++ b/src/backend/commands/sequence.c
@@ -5,6 +5,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
*
* IDENTIFICATION
@@ -35,6 +36,12 @@
#include "utils/resowner.h"
#include "utils/syscache.h"
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+/* PGXC_COORD */
+#include "gtm/gtm_c.h"
+#include "access/gtm.h"
+#endif
/*
* We don't want to log each fetching of a value from a sequence,
@@ -117,6 +124,13 @@ DefineSequence(CreateSeqStmt *seq)
bool null[SEQ_COL_LASTCOL];
int i;
NameData name;
+#ifdef PGXC /* PGXC_COORD */
+ GTM_Sequence start_value = 1;
+ GTM_Sequence min_value = 1;
+ GTM_Sequence max_value = InvalidSequenceValue;
+ GTM_Sequence increment = 1;
+ bool cycle = false;
+#endif
/* Check and set all option values */
init_params(seq->options, true, &new, &owned_by);
@@ -155,21 +169,33 @@ DefineSequence(CreateSeqStmt *seq)
coldef->typename = makeTypeNameFromOid(INT8OID, -1);
coldef->colname = "start_value";
value[i - 1] = Int64GetDatumFast(new.start_value);
+#ifdef PGXC /* PGXC_COORD */
+ start_value = new.start_value;
+#endif
break;
case SEQ_COL_INCBY:
coldef->typename = makeTypeNameFromOid(INT8OID, -1);
coldef->colname = "increment_by";
value[i - 1] = Int64GetDatumFast(new.increment_by);
+#ifdef PGXC /* PGXC_COORD */
+ increment = new.increment_by;
+#endif
break;
case SEQ_COL_MAXVALUE:
coldef->typename = makeTypeNameFromOid(INT8OID, -1);
coldef->colname = "max_value";
value[i - 1] = Int64GetDatumFast(new.max_value);
+#ifdef PGXC /* PGXC_COORD */
+ max_value = new.max_value;
+#endif
break;
case SEQ_COL_MINVALUE:
coldef->typename = makeTypeNameFromOid(INT8OID, -1);
coldef->colname = "min_value";
value[i - 1] = Int64GetDatumFast(new.min_value);
+#ifdef PGXC /* PGXC_COORD */
+ min_value = new.min_value;
+#endif
break;
case SEQ_COL_CACHE:
coldef->typename = makeTypeNameFromOid(INT8OID, -1);
@@ -185,6 +211,9 @@ DefineSequence(CreateSeqStmt *seq)
coldef->typename = makeTypeNameFromOid(BOOLOID, -1);
coldef->colname = "is_cycled";
value[i - 1] = BoolGetDatum(new.is_cycled);
+#ifdef PGXC /* PGXC_COORD */
+ cycle = new.is_cycled;
+#endif
break;
case SEQ_COL_CALLED:
coldef->typename = makeTypeNameFromOid(BOOLOID, -1);
@@ -308,6 +337,20 @@ DefineSequence(CreateSeqStmt *seq)
process_owned_by(rel, owned_by);
heap_close(rel, NoLock);
+
+#ifdef PGXC /* PGXC_COORD */
+ if (IS_PGXC_COORDINATOR)
+ {
+ /* We also need to create it on the GTM */
+ if (CreateSequenceGTM(name.data, increment, min_value, max_value,
+ start_value, cycle) < 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_CONNECTION_FAILURE),
+ errmsg("GTM error, could not create sequence")));
+ }
+ }
+#endif
}
/*
@@ -481,6 +524,20 @@ nextval_internal(Oid relid)
seq = read_info(elm, seqrel, &buf);
page = BufferGetPage(buf);
+#ifdef PGXC /* PGXC_COORD */
+ if (IS_PGXC_COORDINATOR)
+ {
+ /* Above, we still use the page as a locking mechanism to handle
+ * concurrency
+ */
+ result = (int64) GetNextValGTM(RelationGetRelationName(seqrel));
+ if (result < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONNECTION_FAILURE),
+ errmsg("GTM error, could not obtain sequence value")));
+ } else
+ {
+#endif
last = next = result = seq->last_value;
incby = seq->increment_by;
maxv = seq->max_value;
@@ -636,7 +693,9 @@ nextval_internal(Oid relid)
seq->log_cnt = log; /* how much is logged */
END_CRIT_SECTION();
-
+#ifdef PGXC /* PGXC_COORD */
+ }
+#endif
UnlockReleaseBuffer(buf);
relation_close(seqrel, NoLock);
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index f22e7be5d8..3372883714 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -5,6 +5,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
*
* IDENTIFICATION
@@ -76,6 +77,10 @@
#include "utils/syscache.h"
#include "utils/tqual.h"
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+#include "access/gtm.h"
+#endif
/*
* ON COMMIT action list
@@ -523,6 +528,18 @@ DefineRelation(CreateStmt *stmt, char relkind)
*/
CommandCounterIncrement();
+#ifdef PGXC
+ /*
+ * Add to pgxc_class.
+ * we need to do this after CommandCounterIncrement
+ */
+ if (IS_PGXC_COORDINATOR && relkind == RELKIND_RELATION)
+ {
+ AddRelationDistribution (relationId, stmt->distributeby, inheritOids, descriptor);
+ CommandCounterIncrement();
+ }
+#endif
+
/*
* Open the new relation and acquire exclusive lock on it. This isn't
* really necessary for locking out other backends (since they can't see
@@ -739,6 +756,16 @@ RemoveRelations(DropStmt *drop)
add_exact_object_address(&obj, objects);
+
+#ifdef PGXC /* PGXC_COORD */
+ /* PGXCTODO: allow the ability to rollback dropping sequences. */
+
+ /* Drop the sequence */
+ if (IS_PGXC_COORDINATOR && classform->relkind == RELKIND_SEQUENCE)
+ {
+ DropSequenceGTM(rel->relname);
+ }
+#endif
ReleaseSysCache(tuple);
}
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 732f6d09c3..aed98d98f8 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -10,6 +10,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
*
* IDENTIFICATION
@@ -57,6 +58,9 @@
#include "utils/syscache.h"
#include "utils/tqual.h"
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+#endif
/*
* GUC parameters
@@ -899,6 +903,18 @@ vac_update_datfrozenxid(void)
if (dirty)
{
database_file_update_needed();
+ /*
+ * vac_truncate_clog needs a transaction id to detect wrap-arounds. For
+ * a autovacuum, this would require the data node to contact the GTM or
+ * the coordinator and acquire GXID for the vacuum operation.
+ *
+ * To avoid this complexity, we disable the CLOG truncation. This is
+ * perfectly fine for the prototype because we are not handling GXID
+ * wrap-around in the prototype anyways. In future, this should be
+ * fixed either by acquiring GXID for the vacuum operation or by
+ * modifying the wrap-around check logic such that it does not need a
+ * GXID
+ */
vac_truncate_clog(newFrozenXid);
}
}
@@ -1026,7 +1042,8 @@ vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound,
if (scanned_all)
*scanned_all = false;
-
+#ifndef PGXC
+ /* In PG-XC, do these after setting vacuum flags */
/* Begin a transaction for vacuuming this relation */
StartTransactionCommand();
@@ -1035,6 +1052,7 @@ vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound,
* ensures that RecentGlobalXmin is kept truly recent.
*/
PushActiveSnapshot(GetTransactionSnapshot());
+#endif
if (!vacstmt->full)
{
@@ -1065,6 +1083,19 @@ vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound,
LWLockRelease(ProcArrayLock);
}
+#ifdef PGXC
+ elog (DEBUG1, "Starting vacuum transaction");
+ /* In PG-XC, do these after setting vacuum flags */
+ /* Begin a transaction for vacuuming this relation */
+ StartTransactionCommand();
+ elog (DEBUG1, "Started vacuum transaction");
+
+ /*
+ * Functions in indexes may want a snapshot set. Also, setting
+ * a snapshot ensures that RecentGlobalXmin is kept truly recent.
+ */
+ PushActiveSnapshot(GetTransactionSnapshot());
+#endif
/*
* Check for user-requested abort. Note we want this to be inside a
* transaction, so xact.c doesn't issue useless WARNING.
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index 72c9877ffd..895e65e121 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -13,6 +13,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/nodes/copyfuncs.c,v 1.432 2009/06/18 01:27:02 tgl Exp $
@@ -2402,6 +2403,19 @@ _copyCopyStmt(CopyStmt *from)
return newnode;
}
+#ifdef PGXC
+/*
+ * _copyDistributeBy
+ *
+ * Copy a DistributeBy node: disttype is copied as a scalar field and
+ * colname as a string field, using this file's COPY_* macros.
+ */
+static DistributeBy *
+_copyDistributeBy(DistributeBy *from)
+{
+ DistributeBy *newnode = makeNode(DistributeBy);
+
+ COPY_SCALAR_FIELD(disttype);
+ COPY_STRING_FIELD(colname);
+
+ return newnode;
+}
+#endif
+
static CreateStmt *
_copyCreateStmt(CreateStmt *from)
{
@@ -2414,6 +2428,9 @@ _copyCreateStmt(CreateStmt *from)
COPY_NODE_FIELD(options);
COPY_SCALAR_FIELD(oncommit);
COPY_STRING_FIELD(tablespacename);
+#ifdef PGXC
+ COPY_NODE_FIELD(distributeby);
+#endif
return newnode;
}
@@ -4093,7 +4110,11 @@ copyObject(void *from)
case T_XmlSerialize:
retval = _copyXmlSerialize(from);
break;
-
+#ifdef PGXC
+ case T_DistributeBy:
+ retval = _copyDistributeBy(from);
+ break;
+#endif
default:
elog(ERROR, "unrecognized node type: %d", (int) nodeTag(from));
retval = from; /* keep compiler quiet */
diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c
index 041b96971c..fedb5102bb 100644
--- a/src/backend/nodes/equalfuncs.c
+++ b/src/backend/nodes/equalfuncs.c
@@ -20,6 +20,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/nodes/equalfuncs.c,v 1.355 2009/06/18 01:27:02 tgl Exp $
@@ -1078,6 +1079,9 @@ _equalCreateStmt(CreateStmt *a, CreateStmt *b)
COMPARE_NODE_FIELD(options);
COMPARE_SCALAR_FIELD(oncommit);
COMPARE_STRING_FIELD(tablespacename);
+#ifdef PGXC
+ COMPARE_NODE_FIELD(distributeby);
+#endif
return true;
}
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index b27cd513a5..98d3c4c9ef 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -5,6 +5,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
*
* IDENTIFICATION
@@ -1154,6 +1155,22 @@ _readRangeTblEntry(void)
READ_DONE();
}
+#ifdef PGXC
+/*
+ * _readDistributeBy
+ *
+ * Read a DistributeBy node: disttype is read as a DistributionType enum
+ * field and colname as a string field, using this file's READ_* macros.
+ *
+ * NOTE(review): no dispatch to this function from parseNodeString is
+ * visible in this hunk -- confirm it is actually reachable.
+ */
+static DistributeBy *
+_readDistributeBy(void)
+{
+ READ_LOCALS(DistributeBy);
+
+ READ_ENUM_FIELD(disttype, DistributionType);
+ READ_STRING_FIELD(colname);
+
+ READ_DONE();
+}
+#endif
+
/*
* parseNodeString
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index 280443074f..9ffada513a 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -8,6 +8,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
*
* IDENTIFICATION
@@ -58,6 +59,7 @@
#include "nodes/makefuncs.h"
#include "nodes/nodeFuncs.h"
#include "parser/gramparse.h"
+#include "pgxc/poolmgr.h"
#include "storage/lmgr.h"
#include "utils/date.h"
#include "utils/datetime.h"
@@ -179,6 +181,9 @@ static TypeName *TableFuncTypeName(List *columns);
InsertStmt *istmt;
VariableSetStmt *vsetstmt;
+/* PGXC_BEGIN */
+ DistributeBy *distby;
+/* PGXC_END */
}
%type <node> stmt schema_stmt
@@ -197,7 +202,7 @@ static TypeName *TableFuncTypeName(List *columns);
DropGroupStmt DropOpClassStmt DropOpFamilyStmt DropPLangStmt DropStmt
DropAssertStmt DropTrigStmt DropRuleStmt DropCastStmt DropRoleStmt
DropUserStmt DropdbStmt DropTableSpaceStmt DropFdwStmt
- DropForeignServerStmt DropUserMappingStmt ExplainStmt FetchStmt
+ DropForeignServerStmt DropUserMappingStmt ExplainStmt ExecDirectStmt FetchStmt
GrantStmt GrantRoleStmt IndexStmt InsertStmt ListenStmt LoadStmt
LockStmt NotifyStmt ExplainableStmt PreparableStmt
CreateFunctionStmt AlterFunctionStmt ReindexStmt RemoveAggrStmt
@@ -250,7 +255,7 @@ static TypeName *TableFuncTypeName(List *columns);
%type <str> relation_name copy_file_name
database_name access_method_clause access_method attr_name
- index_name name file_name cluster_index_specification
+ index_name name file_name cluster_index_specification
%type <list> func_name handler_name qual_Op qual_all_Op subquery_Op
opt_class opt_validator validator_clause
@@ -323,6 +328,9 @@ static TypeName *TableFuncTypeName(List *columns);
%type <boolean> opt_freeze opt_default opt_recheck
%type <defelt> opt_binary opt_oids copy_delimiter
+%type <list> node_list
+%type <str> DirectStmt
+
%type <boolean> copy_from
%type <ival> opt_column event cursor_options opt_hold opt_set_data
@@ -415,6 +423,9 @@ static TypeName *TableFuncTypeName(List *columns);
%type <windef> window_definition over_clause window_specification
%type <str> opt_existing_window_name
%type <ival> opt_frame_clause frame_extent frame_bound
+/* PGXC_BEGIN */
+%type <distby> OptDistributeBy
+/* PGXC_END */
/*
@@ -425,6 +436,7 @@ static TypeName *TableFuncTypeName(List *columns);
*/
/* ordinary key words in alphabetical order */
+/* PGXC - added REPLICATION, DISTRIBUTE, and HASH */
%token <keyword> ABORT_P ABSOLUTE_P ACCESS ACTION ADD_P ADMIN AFTER
AGGREGATE ALL ALSO ALTER ALWAYS ANALYSE ANALYZE AND ANY ARRAY AS ASC
ASSERTION ASSIGNMENT ASYMMETRIC AT AUTHORIZATION
@@ -436,14 +448,17 @@ static TypeName *TableFuncTypeName(List *columns);
CHARACTER CHARACTERISTICS CHECK CHECKPOINT CLASS CLOSE
CLUSTER COALESCE COLLATE COLUMN COMMENT COMMIT
COMMITTED CONCURRENTLY CONFIGURATION CONNECTION CONSTRAINT CONSTRAINTS
- CONTENT_P CONTINUE_P CONVERSION_P COPY COST CREATE CREATEDB
+ CONTENT_P CONTINUE_P CONVERSION_P COORDINATOR COPY COST CREATE CREATEDB
CREATEROLE CREATEUSER CROSS CSV CURRENT_P
CURRENT_CATALOG CURRENT_DATE CURRENT_ROLE CURRENT_SCHEMA
CURRENT_TIME CURRENT_TIMESTAMP CURRENT_USER CURSOR CYCLE
DATA_P DATABASE DAY_P DEALLOCATE DEC DECIMAL_P DECLARE DEFAULT DEFAULTS
DEFERRABLE DEFERRED DEFINER DELETE_P DELIMITER DELIMITERS DESC
- DICTIONARY DISABLE_P DISCARD DISTINCT DO DOCUMENT_P DOMAIN_P DOUBLE_P DROP
+/* PGXC_BEGIN */
+ DICTIONARY DIRECT DISABLE_P DISCARD DISTINCT DISTRIBUTE DO DOCUMENT_P DOMAIN_P DOUBLE_P
+/* PGXC_END */
+ DROP
EACH ELSE ENABLE_P ENCODING ENCRYPTED END_P ENUM_P ESCAPE EXCEPT
EXCLUDING EXCLUSIVE EXECUTE EXISTS EXPLAIN EXTERNAL EXTRACT
@@ -453,7 +468,9 @@ static TypeName *TableFuncTypeName(List *columns);
GLOBAL GRANT GRANTED GREATEST GROUP_P
- HANDLER HAVING HEADER_P HOLD HOUR_P
+/* PGXC_BEGIN */
+ HANDLER HASH HAVING HEADER_P HOLD HOUR_P
+/* PGXC_END */
IDENTITY_P IF_P ILIKE IMMEDIATE IMMUTABLE IMPLICIT_P IN_P
INCLUDING INCREMENT INDEX INDEXES INHERIT INHERITS INITIALLY
@@ -471,7 +488,7 @@ static TypeName *TableFuncTypeName(List *columns);
MAPPING MATCH MAXVALUE MINUTE_P MINVALUE MODE MONTH_P MOVE
NAME_P NAMES NATIONAL NATURAL NCHAR NEW NEXT NO NOCREATEDB
- NOCREATEROLE NOCREATEUSER NOINHERIT NOLOGIN_P NONE NOSUPERUSER
+ NOCREATEROLE NOCREATEUSER NODE NOINHERIT NOLOGIN_P NONE NOSUPERUSER
NOT NOTHING NOTIFY NOTNULL NOWAIT NULL_P NULLIF NULLS_P NUMERIC
OBJECT_P OF OFF OFFSET OIDS OLD ON ONLY OPERATOR OPTION OPTIONS OR
@@ -484,8 +501,10 @@ static TypeName *TableFuncTypeName(List *columns);
QUOTE
RANGE READ REAL REASSIGN RECHECK RECURSIVE REFERENCES REINDEX
- RELATIVE_P RELEASE RENAME REPEATABLE REPLACE REPLICA RESET RESTART
- RESTRICT RETURNING RETURNS REVOKE RIGHT ROLE ROLLBACK ROW ROWS RULE
+/* PGXC_BEGIN */
+ RELATIVE_P RELEASE RENAME REPEATABLE REPLACE REPLICA REPLICATION RESET RESTART
+ RESTRICT RETURNING RETURNS REVOKE RIGHT ROBIN ROLE ROLLBACK ROUND ROW ROWS RULE
+/* PGXC_END */
SAVEPOINT SCHEMA SCROLL SEARCH SECOND_P SECURITY SELECT SEQUENCE
SERIALIZABLE SERVER SESSION SESSION_USER SET SETOF SHARE
@@ -668,6 +687,7 @@ stmt :
| DropUserMappingStmt
| DropdbStmt
| ExecuteStmt
+ | ExecDirectStmt
| ExplainStmt
| FetchStmt
| GrantStmt
@@ -2036,7 +2056,10 @@ opt_using:
*****************************************************************************/
CreateStmt: CREATE OptTemp TABLE qualified_name '(' OptTableElementList ')'
- OptInherit OptWith OnCommitOption OptTableSpace
+ OptInherit OptWith OnCommitOption OptTableSpace
+/* PGXC_BEGIN */
+ OptDistributeBy
+/* PGXC_END */
{
CreateStmt *n = makeNode(CreateStmt);
$4->istemp = $2;
@@ -2047,10 +2070,21 @@ CreateStmt: CREATE OptTemp TABLE qualified_name '(' OptTableElementList ')'
n->options = $9;
n->oncommit = $10;
n->tablespacename = $11;
+ n->distributeby = $12;
+/* PGXC_BEGIN */
+ if (n->inhRelations != NULL && n->distributeby != NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("CREATE TABLE cannot contains both an INHERITS and a DISTRIBUTE BY clause"),
+ scanner_errposition(exprLocation((Node *) n->distributeby))));
+/* PGXC_END */
$$ = (Node *)n;
}
| CREATE OptTemp TABLE qualified_name OF qualified_name
- '(' OptTableElementList ')' OptWith OnCommitOption OptTableSpace
+ '(' OptTableElementList ')' OptWith OnCommitOption OptTableSpace
+/* PGXC_BEGIN */
+ OptDistributeBy
+/* PGXC_END */
{
/* SQL99 CREATE TABLE OF <UDT> (cols) seems to be satisfied
* by our inheritance capabilities. Let's try it...
@@ -2064,6 +2098,14 @@ CreateStmt: CREATE OptTemp TABLE qualified_name '(' OptTableElementList ')'
n->options = $10;
n->oncommit = $11;
n->tablespacename = $12;
+ n->distributeby = $13;
+/* PGXC_BEGIN */
+ if (n->inhRelations != NULL && n->distributeby != NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("CREATE TABLE cannot contains both an INHERITS and a DISTRIBUTE BY clause"),
+ scanner_errposition(exprLocation((Node *) n->distributeby))));
+/* PGXC_END */
$$ = (Node *)n;
}
;
@@ -2495,6 +2537,36 @@ OptTableSpace: TABLESPACE name { $$ = $2; }
| /*EMPTY*/ { $$ = NULL; }
;
+/* PGXC_BEGIN */
+/*
+ * Optional distribution clause, producing a DistributeBy node:
+ *   DISTRIBUTE BY [HASH] (name)   -> DISTTYPE_HASH, colname = name
+ *   DISTRIBUTE BY REPLICATION     -> DISTTYPE_REPLICATION, no column
+ *   DISTRIBUTE BY ROUND ROBIN     -> DISTTYPE_ROUNDROBIN, no column
+ * An absent clause yields NULL.
+ */
+DistributeByHash: DISTRIBUTE BY
+ | DISTRIBUTE BY HASH
+ ;
+
+OptDistributeBy: DistributeByHash '(' name ')'
+ {
+ DistributeBy *n = makeNode(DistributeBy);
+ n->disttype = DISTTYPE_HASH;
+ n->colname = $3;
+ $$ = n;
+ }
+ | DISTRIBUTE BY REPLICATION
+ {
+ DistributeBy *n = makeNode(DistributeBy);
+ n->disttype = DISTTYPE_REPLICATION;
+ n->colname = NULL;
+ $$ = n;
+ }
+ | DISTRIBUTE BY ROUND ROBIN
+ {
+ DistributeBy *n = makeNode(DistributeBy);
+ n->disttype = DISTTYPE_ROUNDROBIN;
+ n->colname = NULL;
+ $$ = n;
+ }
+ | /*EMPTY*/ { $$ = NULL; }
+ ;
+/* PGXC_END */
+
OptConsTableSpace: USING INDEX TABLESPACE name { $$ = $4; }
| /*EMPTY*/ { $$ = NULL; }
;
@@ -6461,6 +6533,47 @@ opt_analyze:
/*****************************************************************************
*
* QUERY:
+ * EXECUTE DIRECT ON (COORDINATOR | NODE num, ...) query
+ *
+ *****************************************************************************/
+
+/*
+ * Builds an ExecDirectStmt node.  The COORDINATOR form sets coordinator
+ * to TRUE with an empty node list; the NODE form sets it to FALSE and
+ * stores the given node_list.  In both forms the query is a string
+ * constant kept as-is in n->query.
+ */
+ExecDirectStmt: EXECUTE DIRECT ON COORDINATOR DirectStmt
+ {
+ ExecDirectStmt *n = makeNode(ExecDirectStmt);
+ n->coordinator = TRUE;
+ n->nodes = NIL;
+ n->query = $5;
+ $$ = (Node *)n;
+ }
+ | EXECUTE DIRECT ON NODE node_list DirectStmt
+ {
+ ExecDirectStmt *n = makeNode(ExecDirectStmt);
+ n->coordinator = FALSE;
+ n->nodes = $5;
+ n->query = $6;
+ $$ = (Node *)n;
+ }
+ ;
+
+DirectStmt:
+ Sconst /* by default all are $$=$1 */
+ ;
+
+/*
+ * node_list: a comma-separated list of integer constants, each wrapped
+ * with makeInteger().  A '*' expands to the integers 1..NumDataNodes at
+ * the time the rule is reduced.
+ */
+node_list:
+ Iconst { $$ = list_make1(makeInteger($1)); }
+ | node_list ',' Iconst { $$ = lappend($1, makeInteger($3)); }
+ | '*'
+ {
+ int i;
+ $$ = NIL;
+ /* '*' expands to every node number, counted from 1 */
+ for (i=1; i<=NumDataNodes; i++)
+ $$ = lappend($$, makeInteger(i));
+ }
+ ;
+
+/*****************************************************************************
+ *
+ * QUERY:
* PREPARE <plan_name> [(args, ...)] AS <query>
*
*****************************************************************************/
@@ -10117,6 +10230,7 @@ ColLabel: IDENT { $$ = $1; }
/* "Unreserved" keywords --- available for use as any kind of name.
*/
+/* PGXC - added COORDINATOR, DIRECT, DISTRIBUTE, HASH, NODE, REPLICATION, ROBIN, ROUND */
unreserved_keyword:
ABORT_P
| ABSOLUTE_P
@@ -10157,6 +10271,7 @@ unreserved_keyword:
| CONTENT_P
| CONTINUE_P
| CONVERSION_P
+ | COORDINATOR
| COPY
| COST
| CREATEDB
@@ -10178,8 +10293,12 @@ unreserved_keyword:
| DELIMITER
| DELIMITERS
| DICTIONARY
+ | DIRECT
| DISABLE_P
| DISCARD
+/* PGXC_BEGIN */
+ | DISTRIBUTE
+/* PGXC_END */
| DOCUMENT_P
| DOMAIN_P
| DOUBLE_P
@@ -10204,6 +10323,9 @@ unreserved_keyword:
| GLOBAL
| GRANTED
| HANDLER
+/* PGXC_BEGIN */
+ | HASH
+/* PGXC_END */
| HEADER_P
| HOLD
| HOUR_P
@@ -10253,6 +10375,7 @@ unreserved_keyword:
| NOCREATEDB
| NOCREATEROLE
| NOCREATEUSER
+ | NODE
| NOINHERIT
| NOLOGIN_P
| NOSUPERUSER
@@ -10294,13 +10417,22 @@ unreserved_keyword:
| REPEATABLE
| REPLACE
| REPLICA
+/* PGXC_BEGIN */
+ | REPLICATION
+/* PGXC_END */
| RESET
| RESTART
| RESTRICT
| RETURNS
| REVOKE
+/* PGXC_BEGIN */
+ | ROBIN
+/* PGXC_END */
| ROLE
| ROLLBACK
+/* PGXC_BEGIN */
+ | ROUND
+/* PGXC_END */
| ROWS
| RULE
| SAVEPOINT
diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c
index e5a3621cce..1336e00a45 100644
--- a/src/backend/parser/parse_utilcmd.c
+++ b/src/backend/parser/parse_utilcmd.c
@@ -18,6 +18,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* $PostgreSQL: pgsql/src/backend/parser/parse_utilcmd.c,v 2.21 2009/06/11 14:49:00 momjian Exp $
*
@@ -48,6 +49,11 @@
#include "parser/parse_relation.h"
#include "parser/parse_type.h"
#include "parser/parse_utilcmd.h"
+#ifdef PGXC
+#include "pgxc/locator.h"
+#include "pgxc/pgxc.h"
+#endif
+
#include "rewrite/rewriteManip.h"
#include "utils/acl.h"
#include "utils/builtins.h"
@@ -75,6 +81,10 @@ typedef struct
List *alist; /* "after list" of things to do after creating
* the table */
IndexStmt *pkey; /* PRIMARY KEY index, if any */
+#ifdef PGXC
+ char *fallback_dist_col; /* suggested column to distribute on */
+ DistributeBy *distributeby; /* original distribute by column in create table */
+#endif
} CreateStmtContext;
/* State shared by transformCreateSchemaStmt and its subroutines */
@@ -114,7 +124,9 @@ static void transformFKConstraints(ParseState *pstate,
static void transformConstraintAttrs(List *constraintList);
static void transformColumnType(ParseState *pstate, ColumnDef *column);
static void setSchemaName(char *context_schema, char **stmt_schema_name);
-
+#ifdef PGXC
+static void checkLocalFKConstraints(CreateStmtContext *cxt);
+#endif
/*
* transformCreateStmt -
@@ -177,6 +189,10 @@ transformCreateStmt(CreateStmt *stmt, const char *queryString)
cxt.alist = NIL;
cxt.pkey = NULL;
cxt.hasoids = interpretOidsOption(stmt->options);
+#ifdef PGXC
+ cxt.fallback_dist_col = NULL;
+ cxt.distributeby = stmt->distributeby;
+#endif
/*
* Run through each primary element in the table creation clause. Separate
@@ -244,6 +260,18 @@ transformCreateStmt(CreateStmt *stmt, const char *queryString)
result = list_concat(result, cxt.alist);
result = list_concat(result, save_alist);
+#ifdef PGXC
+ /*
+ * If the user did not specify any distribution clause and there is no
+ * inherits clause, try and use PK or unique index
+ */
+ if (!stmt->distributeby && !stmt->inhRelations && cxt.fallback_dist_col)
+ {
+ stmt->distributeby = (DistributeBy *) palloc0(sizeof(DistributeBy));
+ stmt->distributeby->disttype = DISTTYPE_HASH;
+ stmt->distributeby->colname = cxt.fallback_dist_col;
+ }
+#endif
return result;
}
@@ -307,7 +335,7 @@ transformColumnDefinition(ParseState *pstate, CreateStmtContext *cxt,
char *snamespace;
char *sname;
char *qstring;
- A_Const *snamenode;
+ A_Const *snamenode;
TypeCast *castnode;
FuncCall *funccallnode;
CreateSeqStmt *seqstmt;
@@ -1061,6 +1089,7 @@ transformIndexConstraints(ParseState *pstate, CreateStmtContext *cxt)
}
}
+
/*
* transformIndexConstraint
* Transform one UNIQUE or PRIMARY KEY constraint for
@@ -1072,6 +1101,10 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt)
IndexStmt *index;
ListCell *keys;
IndexElem *iparam;
+#ifdef PGXC
+ bool isLocalSafe = false;
+#endif
+
index = makeNode(IndexStmt);
@@ -1126,6 +1159,22 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt)
if (strcmp(column->colname, key) == 0)
{
found = true;
+
+#ifdef PGXC
+ /*
+ * Only allow locally enforceable constraints.
+ * See whether this key column is the distribution column named in
+ * the DISTRIBUTE BY clause; once one key column has matched, the
+ * remaining key columns are not checked again.
+ */
+ if (IS_PGXC_COORDINATOR && !isLocalSafe)
+ {
+ if (cxt->distributeby)
+ isLocalSafe = CheckLocalIndexColumn (
+ ConvertToLocatorType(cxt->distributeby->disttype),
+ cxt->distributeby->colname, key);
+ }
+#endif
break;
}
}
@@ -1219,6 +1268,27 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt)
}
}
+#ifdef PGXC
+ if (IS_PGXC_COORDINATOR)
+ {
+ /*
+ * Set fallback distribution column.
+ * If not set, set it to first column in index.
+ * If primary key, we prefer that over a unique constraint.
+ */
+ if (index->indexParams == NIL
+ && (index->primary || !cxt->fallback_dist_col))
+ {
+ cxt->fallback_dist_col = pstrdup(key);
+ }
+
+ /* Existing table, check if it is safe */
+ if (!cxt->distributeby && !isLocalSafe)
+ isLocalSafe = CheckLocalIndexColumn (
+ cxt->rel->rd_locator_info->locatorType, cxt->rel->rd_locator_info->partAttrName, key);
+ }
+#endif
+
/* OK, add it to the index definition */
iparam = makeNode(IndexElem);
iparam->name = pstrdup(key);
@@ -1228,6 +1298,13 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt)
iparam->nulls_ordering = SORTBY_NULLS_DEFAULT;
index->indexParams = lappend(index->indexParams, iparam);
}
+#ifdef PGXC
+ if (IS_PGXC_COORDINATOR && cxt->distributeby
+ && cxt->distributeby->disttype == DISTTYPE_HASH && !isLocalSafe)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+ errmsg("Unique index of partitioned table must contain the hash distribution column.")));
+#endif
return index;
}
@@ -1256,9 +1333,34 @@ transformFKConstraints(ParseState *pstate, CreateStmtContext *cxt,
FkConstraint *fkconstraint = (FkConstraint *) lfirst(fkclist);
fkconstraint->skip_validation = true;
+#ifdef PGXC
+			/*
+			 * Set fallback distribution column.
+			 * If not yet set, set it to first column in FK constraint
+			 * if it references a partitioned table
+			 */
+			if (IS_PGXC_COORDINATOR && !cxt->fallback_dist_col)
+			{
+				Oid			pk_rel_id = RangeVarGetRelid(fkconstraint->pktable, false);
+
+				/*
+				 * Make sure it is a partitioned column.  Either column list
+				 * may be empty (e.g. REFERENCES tbl with no column list), so
+				 * check before indexing into them with list_nth().
+				 */
+				if (fkconstraint->pk_attrs != NIL &&
+					fkconstraint->fk_attrs != NIL &&
+					IsHashColumnForRelId(pk_rel_id, strVal(list_nth(fkconstraint->pk_attrs, 0))))
+				{
+					/*
+					 * Take the first FK column.  Copy it with pstrdup() only;
+					 * an intermediate strdup() copy would never be freed.
+					 */
+					cxt->fallback_dist_col = pstrdup(strVal(list_nth(fkconstraint->fk_attrs, 0)));
+				}
+			}
+#endif
}
}
+#ifdef PGXC
+ /* Only allow constraints that are locally enforceable - no distributed ones */
+ if (IS_PGXC_COORDINATOR)
+ checkLocalFKConstraints(cxt);
+#endif
+
/*
* For CREATE TABLE or ALTER TABLE ADD COLUMN, gin up an ALTER TABLE ADD
* CONSTRAINT command to execute after the basic command is complete. (If
@@ -1714,6 +1816,10 @@ transformAlterTableStmt(AlterTableStmt *stmt, const char *queryString)
cxt.blist = NIL;
cxt.alist = NIL;
cxt.pkey = NULL;
+#ifdef PGXC
+ cxt.fallback_dist_col = NULL;
+ cxt.distributeby = NULL;
+#endif
/*
* The only subtypes that currently require parse transformation handling
@@ -2115,3 +2221,118 @@ setSchemaName(char *context_schema, char **stmt_schema_name)
"different from the one being created (%s)",
*stmt_schema_name, context_schema)));
}
+
+#ifdef PGXC
+/*
+ * CheckLocalIndexColumn
+ *
+ * Reports whether a unique index containing indexcolname can be enforced
+ * locally for a table with locator type loctype and distribution column
+ * partcolname.
+ *
+ * Replicated tables are always safe.  Round robin tables raise an error.
+ * Hash tables are safe only when the index column is the distribution
+ * column.  Any other locator type is reported as not safe.
+ */
+bool
+CheckLocalIndexColumn (char loctype, char *partcolname, char *indexcolname)
+{
+	bool		safe = false;
+
+	switch (loctype)
+	{
+		case LOCATOR_TYPE_REPLICATED:
+			/* always safe */
+			safe = true;
+			break;
+
+		case LOCATOR_TYPE_RROBIN:
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+					 errmsg("Cannot locally enforce a unique index on round robin distributed table.")));
+			break;
+
+		case LOCATOR_TYPE_HASH:
+			safe = (partcolname != NULL &&
+					indexcolname != NULL &&
+					strcmp(partcolname, indexcolname) == 0);
+			break;
+
+		default:
+			break;
+	}
+
+	return safe;
+}
+
+
+/*
+ * checkLocalFKConstraints
+ *
+ * Check that every foreign key constraint collected in cxt can be enforced
+ * locally; if not, an error will be thrown.
+ *
+ * A constraint referencing a replicated table is always accepted, and one
+ * referencing a round robin table is always rejected.  Otherwise, when the
+ * referencing table has a hash distribution column (from the existing
+ * relation for ALTER TABLE, from the DISTRIBUTE BY clause, or from the
+ * fallback column for CREATE TABLE), that column must appear in the FK
+ * column list, and the referenced column at the same position must be the
+ * hash distribution column of the referenced table.
+ */
+static void
+checkLocalFKConstraints(CreateStmtContext *cxt)
+{
+	ListCell   *fkclist;
+
+	foreach(fkclist, cxt->fkconstraints)
+	{
+		FkConstraint *fkconstraint;
+		Oid			pk_rel_id;
+		char		refloctype;
+		char	   *checkcolname = NULL;
+
+		fkconstraint = (FkConstraint *) lfirst(fkclist);
+		pk_rel_id = RangeVarGetRelid(fkconstraint->pktable, false);
+
+		refloctype = GetLocatorType(pk_rel_id);
+
+		/* If referenced table is replicated, the constraint is safe */
+		if (refloctype == LOCATOR_TYPE_REPLICATED)
+			continue;
+		else if (refloctype == LOCATOR_TYPE_RROBIN)
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_SYNTAX_ERROR),
+					 errmsg("Cannot reference a round robin table in a foreign key constraint")));
+		}
+
+		/*
+		 * See if we are hash partitioned and the column appears in the
+		 * constraint, and it corresponds to the position in the referenced table.
+		 */
+		if (cxt->isalter)
+		{
+			/* rd_locator_info is NULL for relations without locator data */
+			if (cxt->rel->rd_locator_info != NULL &&
+				cxt->rel->rd_locator_info->locatorType == LOCATOR_TYPE_HASH)
+			{
+				checkcolname = cxt->rel->rd_locator_info->partAttrName;
+			}
+		}
+		else
+		{
+			if (cxt->distributeby)
+			{
+				if (cxt->distributeby->disttype == DISTTYPE_HASH)
+					checkcolname = cxt->distributeby->colname;
+			}
+			else
+			{
+				if (cxt->fallback_dist_col)
+					checkcolname = cxt->fallback_dist_col;
+			}
+		}
+
+		if (checkcolname)
+		{
+			int			pos = 0;
+			ListCell   *attritem;
+
+			foreach(attritem, fkconstraint->fk_attrs)
+			{
+				char	   *attrname = (char *) strVal(lfirst(attritem));
+
+				/*
+				 * Compare against the column chosen above.  Going through
+				 * cxt->rel here would dereference a NULL relation when the
+				 * table is being created rather than altered.
+				 */
+				if (strcmp(checkcolname, attrname) == 0)
+				{
+					/* Found the ordinal position in constraint */
+					break;
+				}
+				pos++;
+			}
+
+			if (pos >= list_length(fkconstraint->fk_attrs))
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("Hash distributed table must include distribution column in index")));
+
+			/*
+			 * Verify that the referenced table is partitioned at the same
+			 * position in the index.  The referenced column list can be
+			 * shorter than pos (it is empty when no column list was given),
+			 * in which case the match cannot be established either.
+			 */
+			if (pos >= list_length(fkconstraint->pk_attrs) ||
+				!IsHashColumnForRelId(pk_rel_id, strVal(list_nth(fkconstraint->pk_attrs, pos))))
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("Hash distribution column does not refer to hash distribution column in referenced table.")));
+		}
+	}
+}
+#endif
diff --git a/src/backend/pgxc/Makefile b/src/backend/pgxc/Makefile
new file mode 100644
index 0000000000..d978720b1c
--- /dev/null
+++ b/src/backend/pgxc/Makefile
@@ -0,0 +1,16 @@
+#
+# Makefile for the Postgres-XC backend modules (locator, plan, pool)
+#
+#
+# Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+#
+# $PostgreSQL$
+#
+
+subdir = src/backend/pgxc
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+SUBDIRS = locator plan pool
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/pgxc/locator/Makefile b/src/backend/pgxc/locator/Makefile
new file mode 100644
index 0000000000..026a247940
--- /dev/null
+++ b/src/backend/pgxc/locator/Makefile
@@ -0,0 +1,20 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for locator
+#
+#
+# Copyright(C) 2010 Nippon Telegraph and Telephone Corporation
+#
+# IDENTIFICATION
+# $PostgreSQL$
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/pgxc/locator
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = locator.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c
new file mode 100644
index 0000000000..995a64cb4a
--- /dev/null
+++ b/src/backend/pgxc/locator/locator.c
@@ -0,0 +1,607 @@
+/*-------------------------------------------------------------------------
+ *
+ * locator.c
+ * Functions that help manage table location information such as
+ * partitioning and replication information.
+ *
+ *
+ * PGXCTODO - do not use a single mappingTable for all
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <time.h>
+
+#include "postgres.h"
+#include "access/skey.h"
+#include "access/relscan.h"
+#include "catalog/indexing.h"
+#include "catalog/pg_type.h"
+#include "nodes/pg_list.h"
+#include "utils/builtins.h"
+#include "utils/catcache.h"
+#include "utils/fmgroids.h"
+#include "utils/lsyscache.h"
+#include "utils/rel.h"
+#include "utils/relcache.h"
+#include "utils/tqual.h"
+#include "pgxc/poolmgr.h"
+#include "pgxc/locator.h"
+
+#include "catalog/pgxc_class.h"
+#include "catalog/namespace.h"
+
+
+/* PGXCTODO For prototype, relations use the same hash mapping table.
+ * Long term, make it a pointer in RelationLocInfo, and have
+ * similarly handled tables point to the same mapping table,
+ * to check faster for equivalency
+ */
+int mappingTable[HASH_SIZE];
+
+bool locatorInited = false;
+
+
+/* GUC parameter */
+char *PreferredDataNodes = NULL;
+
+/* Preferred to use when reading from replicated tables */
+static List *globalPreferredNodes = NIL;
+
+/*
+ * init_mapping_table - initializes a mapping table
+ *
+ * PGXCTODO
+ * For the prototype, all partitioned tables will use the same partition map.
+ * We cannot assume this long term
+ */
+static void
+init_mapping_table(int nodeCount, int mapTable[])
+{
+	int			bucket = 0;
+
+	/* Deal the buckets out to node numbers 1..nodeCount in rotation */
+	while (bucket < HASH_SIZE)
+	{
+		mapTable[bucket] = (bucket % nodeCount) + 1;
+		bucket++;
+	}
+}
+
+
+/*
+ * Pick any data node, but try a preferred node
+ *
+ */
+int
+GetAnyDataNode(void)
+{
+	/* Without a preferred list, fall back to node 1 */
+	if (globalPreferredNodes == NULL)
+		return 1;
+
+	/* Otherwise use the first preferred node */
+	return linitial_int(globalPreferredNodes);
+}
+
+
+/*
+ * hash_range - hash the key to a value between 0 and HASH_SIZE
+ *
+ * Note, this function corresponds to GridSQL hashing
+ * and is used here to allow us to wire up GridSQL
+ * to the same underlying nodes
+ */
+static int
+hash_range(char *key)
+{
+	int			i;
+	int			length;
+	int			value;
+
+	/*
+	 * A NULL pointer and an empty string both map to bucket 0.  The
+	 * emptiness test has to look at the first character; comparing the
+	 * pointer itself with '\0' merely repeats the NULL check.
+	 */
+	if (key == NULL || key[0] == '\0')
+	{
+		return 0;
+	}
+
+	length = strlen(key);
+
+	/* Start from a length-dependent value, then mix in each character */
+	value = 0x238F13AF * length;
+
+	for (i = 0; i < length; i++)
+	{
+		/* the shift amount is (i * 5) % 24, i.e. always within 0..23 */
+		value = value + ((key[i] << i * 5 % 24) & 0x7fffffff);
+	}
+
+	/* Scramble, then reduce to a bucket number with HASH_MASK */
+	return (1103515243 * value + 12345) % 65537 & HASH_MASK;
+}
+
+/*
+ * hash_range_int - hashes the integer key to a value between 0 and HASH_SIZE
+ *
+ * See hash_range
+ */
+static int
+hash_range_int(int intkey)
+{
+	char		digits[13];		/* plenty for 32 bit int */
+
+	/* Hash the decimal text form of the key, exactly as hash_range expects */
+	digits[12] = '\0';
+	snprintf(digits, 12, "%d", intkey);
+
+	return hash_range(digits);
+}
+
+
+/*
+ * get_node_from_hash - determine node based on hash bucket
+ *
+ */
+static int
+get_node_from_hash(int hash)
+{
+	/*
+	 * mappingTable has HASH_SIZE entries, so the valid buckets are
+	 * 0 .. HASH_SIZE - 1; HASH_SIZE itself is already out of range.
+	 */
+	if (hash >= HASH_SIZE || hash < 0)
+	{
+		ereport(ERROR, (errmsg("Hash value out of range\n")));
+	}
+
+	return mappingTable[hash];
+}
+
+
+/*
+ * Returns whether or not the data type is hash distributable with PG-XC
+ * PGXCTODO - expand support for other data types!
+ */
+bool
+IsHashDistributable(Oid col_type)
+{
+	/* Only the 2-byte and 4-byte integer types are accepted for now */
+	switch (col_type)
+	{
+		case INT4OID:
+		case INT2OID:
+			return true;
+		default:
+			return false;
+	}
+}
+
+
+/*
+ * GetRelationHashColumn - return a palloc'd copy of the hash column name.
+ *
+ * Returns NULL if the relation is not hash partitioned.
+ */
+char *
+GetRelationHashColumn(RelationLocInfo * rel_loc_info)
+{
+	/* No locator information, or not hash partitioned: no hash column */
+	if (rel_loc_info == NULL)
+		return NULL;
+	if (rel_loc_info->locatorType != LOCATOR_TYPE_HASH)
+		return NULL;
+
+	/* Hand back a freshly palloc'd copy of the column name */
+	return pstrdup(rel_loc_info->partAttrName);
+}
+
+/*
+ * IsHashColumn - return whether or not column for relation is hashed.
+ *
+ */
+bool
+IsHashColumn(RelationLocInfo * rel_loc_info, char *part_col_name)
+{
+	/* Missing arguments can never match */
+	if (rel_loc_info == NULL || part_col_name == NULL)
+		return false;
+
+	/* Only hash partitioned relations have a hash column */
+	if (rel_loc_info->locatorType != LOCATOR_TYPE_HASH)
+		return false;
+
+	return strcmp(part_col_name, rel_loc_info->partAttrName) == 0;
+}
+
+
+/*
+ * IsHashColumnForRelId - return whether or not column for relation is hashed.
+ *
+ */
+bool
+IsHashColumnForRelId(Oid relid, char *part_col_name)
+{
+	RelationLocInfo *rel_loc_info = GetRelationLocInfo(relid);
+	bool		is_hash_col;
+
+	is_hash_col = IsHashColumn(rel_loc_info, part_col_name);
+
+	/* GetRelationLocInfo returns a copy (or NULL); release it once used */
+	FreeRelationLocInfo(rel_loc_info);
+
+	return is_hash_col;
+}
+
+
+/**
+ * Update the round robin node for the relation
+ *
+ * PGXCTODO - may not want to bother with locking here, we could track
+ * these in the session memory context instead...
+ */
+int
+GetRoundRobinNode(Oid relid)
+{
+	Relation	rel = relation_open(relid, AccessShareLock);
+	RelationLocInfo *locinfo = rel->rd_locator_info;
+	ListCell   *current;
+	int			ret_node;
+
+	Assert (locinfo->locatorType == LOCATOR_TYPE_REPLICATED ||
+			locinfo->locatorType == LOCATOR_TYPE_RROBIN);
+
+	/* The node to hand out is the one the indicator currently points at */
+	current = locinfo->roundRobinNode;
+	ret_node = lfirst_int(current);
+
+	/* Advance the indicator, wrapping to the first node after the last one */
+	locinfo->roundRobinNode = (current->next != NULL)
+		? current->next
+		: locinfo->nodeList->head;
+
+	relation_close(rel, AccessShareLock);
+
+	return ret_node;
+}
+
+
+/*
+ * GetRelationNodes
+ *
+ * Get list of relation nodes for read operation.
+ * If the table is replicated and we are reading, we can just pick one.
+ * If the table is partitioned, we apply partitioning column value, if possible.
+ *
+ * If the relation is partitioned, partValue will be applied if present
+ * (indicating a value appears for partitioning column), otherwise it
+ * is ignored.
+ *
+ * The global preferred node list is only used for reads of replicated
+ * tables. If set, one of its nodes is used if the table is replicated on it.
+ * This helps avoid introducing additional nodes into the
+ * transaction.
+ *
+ * The returned List is a copy, so it should be freed when finished.
+ */
+List *
+GetRelationNodes(RelationLocInfo * rel_loc_info, long *partValue, int isRead)
+{
+	ListCell   *prefItem;
+	List	   *destList = NULL;
+
+
+	if (rel_loc_info == NULL)
+		return NULL;
+
+	switch (rel_loc_info->locatorType)
+	{
+		case LOCATOR_TYPE_REPLICATED:
+
+			if (!isRead)
+				/* we need to write to all synchronously */
+				destList = list_copy(rel_loc_info->nodeList);
+			else
+			{
+				/*
+				 * Try and pick from the preferred list: take the first
+				 * preferred node that is valid for this relation and stop
+				 * searching.  (Breaking out of an inner loop only would let
+				 * the search continue and replace, and leak, the list built
+				 * for an earlier match.)
+				 */
+				foreach(prefItem, globalPreferredNodes)
+				{
+					int			prefNode = lfirst_int(prefItem);
+
+					if (list_member_int(rel_loc_info->nodeList, prefNode))
+					{
+						destList = lappend_int(NULL, prefNode);
+						break;
+					}
+				}
+			}
+
+			if (destList == NULL)
+			{
+				/*
+				 * read from just one of them
+				 * use round robin mechanism
+				 */
+				destList = lappend_int(NULL, GetRoundRobinNode(rel_loc_info->relid));
+			}
+			break;
+
+		case LOCATOR_TYPE_HASH:
+
+			if (partValue != NULL)
+			{
+				/* in prototype, all partitioned tables use same map */
+				destList = lappend_int(NULL, get_node_from_hash(hash_range_int(*partValue)));
+			}
+			else
+			{
+				/*
+				 * No partitioning value passed in
+				 * (no where qualification on part column - use all)
+				 */
+				destList = list_copy(rel_loc_info->nodeList);
+			}
+			break;
+
+		case LOCATOR_TYPE_SINGLE:
+
+			/* just return first (there should only be one) */
+			destList = list_copy(rel_loc_info->nodeList);
+			break;
+
+		case LOCATOR_TYPE_RROBIN:
+
+			/* round robin, get next one */
+			if (isRead)
+			{
+				/* we need to read from all */
+				destList = list_copy(rel_loc_info->nodeList);
+			}
+			else
+			{
+				/* write to just one of them */
+				destList = lappend_int(NULL, GetRoundRobinNode(rel_loc_info->relid));
+			}
+
+			break;
+
+			/* PGXCTODO case LOCATOR_TYPE_RANGE: */
+			/* PGXCTODO case LOCATOR_TYPE_CUSTOM: */
+		default:
+			ereport(ERROR, (errmsg("Error: no such supported locator type: %c\n",
+								   rel_loc_info->locatorType)));
+			break;
+	}
+
+	return destList;
+}
+
+
+/*
+ * ConvertToLocatorType
+ * get locator distribution type
+ * We really should just have pgxc_class use disttype instead...
+ */
+char
+ConvertToLocatorType(int disttype)
+{
+	/* Map each parser-level distribution type to its locator type code */
+	switch (disttype)
+	{
+		case DISTTYPE_HASH:
+			return LOCATOR_TYPE_HASH;
+		case DISTTYPE_ROUNDROBIN:
+			return LOCATOR_TYPE_RROBIN;
+		case DISTTYPE_REPLICATION:
+			return LOCATOR_TYPE_REPLICATED;
+		default:
+			break;
+	}
+
+	/* Anything else is not a known distribution type */
+	ereport(ERROR,
+			(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+			 errmsg("Invalid distribution type")));
+
+	return '\0';				/* keep compiler quiet; ERROR does not return */
+}
+
+
+/*
+ * GetLocatorType - Returns the locator type of the table
+ *
+ */
+char
+GetLocatorType(Oid relid)
+{
+	/* '\0' is returned when the relation has no locator information */
+	char		ret = '\0';
+
+	RelationLocInfo *ret_loc_info = GetRelationLocInfo(relid);
+
+	if (ret_loc_info != NULL)
+	{
+		ret = ret_loc_info->locatorType;
+
+		/* GetRelationLocInfo returned a copy; release it once read */
+		FreeRelationLocInfo(ret_loc_info);
+	}
+
+	return ret;
+}
+
+
+/*
+ * Return a list of all nodes.
+ * We assume all tables use all nodes in the prototype, so just return a list
+ * from first one.
+ */
+List *
+GetAllNodes(void)
+{
+	/*
+	 * PGXCTODO - add support for having nodes on a subset of nodes
+	 * For now, assume on all nodes
+	 */
+	List	   *allNodes = NIL;
+	int			node;
+
+	/* Node numbers are 1-based: 1 .. NumDataNodes */
+	for (node = 1; node <= NumDataNodes; node++)
+		allNodes = lappend_int(allNodes, node);
+
+	return allNodes;
+}
+
+
+/**
+ * Build locator information associated with the specified relation.
+ *
+ */
+void
+RelationBuildLocator(Relation rel)
+{
+	Relation	pcrel;
+	ScanKeyData skey;
+	SysScanDesc pcscan;
+	HeapTuple	htup;
+	MemoryContext oldContext;
+	RelationLocInfo *relationLocInfo;
+	int			i;
+	int			offset;
+	Form_pgxc_class pgxc_class;
+
+
+	/** PGXCTODO temporarily use the same mapping table for all
+	 * Use all nodes.
+	 */
+	if (!locatorInited)
+	{
+		init_mapping_table(NumDataNodes, mappingTable);
+		locatorInited = true;
+	}
+
+	/* Look up the pgxc_class row for this relation */
+	ScanKeyInit(&skey,
+				Anum_pgxc_class_pcrelid,
+				BTEqualStrategyNumber, F_OIDEQ,
+				ObjectIdGetDatum(RelationGetRelid(rel)));
+
+	pcrel = heap_open(PgxcClassRelationId, AccessShareLock);
+	pcscan = systable_beginscan(pcrel, PgxcClassPgxcRelIdIndexId, true,
+								SnapshotNow, 1, &skey);
+	htup = systable_getnext(pcscan);
+
+	if (!HeapTupleIsValid(htup))
+	{
+		/* Assume local relation only */
+		rel->rd_locator_info = NULL;
+		systable_endscan(pcscan);
+		heap_close(pcrel, AccessShareLock);
+		return;
+	}
+
+	pgxc_class = (Form_pgxc_class) GETSTRUCT(htup);
+
+	/* Everything hung off the relation is allocated in CacheMemoryContext */
+	oldContext = MemoryContextSwitchTo(CacheMemoryContext);
+
+	/* Zeroed, so roundRobinNode is NULL unless it is set below */
+	relationLocInfo = (RelationLocInfo *) palloc0(sizeof(RelationLocInfo));
+	rel->rd_locator_info = relationLocInfo;
+
+	relationLocInfo->relid = RelationGetRelid(rel);
+	relationLocInfo->locatorType = pgxc_class->pclocatortype;
+
+	relationLocInfo->partAttrNum = pgxc_class->pcattnum;
+
+	relationLocInfo->partAttrName = get_attname(relationLocInfo->relid,
+												pgxc_class->pcattnum);
+
+	/** PGXCTODO - add support for having nodes on a subset of nodes
+	 * For now, assume on all nodes
+	 */
+	relationLocInfo->nodeList = GetAllNodes();
+	/* list_length() copes with an empty (NIL) list */
+	relationLocInfo->nodeCount = list_length(relationLocInfo->nodeList);
+
+	/*
+	 * If the locator type is round robin, we set a node to
+	 * use next time. In addition, if it is replicated,
+	 * we choose a node to use for balancing reads.
+	 */
+	if ((relationLocInfo->locatorType == LOCATOR_TYPE_RROBIN
+		 || relationLocInfo->locatorType == LOCATOR_TYPE_REPLICATED)
+		&& relationLocInfo->nodeCount > 0)
+	{
+		/*
+		 * pick a random one to start with,
+		 * since each process will do this independently
+		 */
+		srand(time(NULL));
+
+		/*
+		 * Skip 0 .. nodeCount - 1 cells from the head so that every node,
+		 * including the first, is an equally likely starting point.
+		 */
+		offset = rand() % relationLocInfo->nodeCount;
+		relationLocInfo->roundRobinNode = relationLocInfo->nodeList->head; /* initialize */
+
+		for (i = 0; i < offset && relationLocInfo->roundRobinNode->next != NULL; i++)
+		{
+			relationLocInfo->roundRobinNode = relationLocInfo->roundRobinNode->next;
+		}
+	}
+
+	systable_endscan(pcscan);
+	heap_close(pcrel, AccessShareLock);
+
+	MemoryContextSwitchTo(oldContext);
+}
+
+/*
+ * GetRelationLocInfo - Returns the locator information for relation,
+ * in a copy of the RelationLocInfo struct in relcache
+ *
+ */
+RelationLocInfo *
+GetRelationLocInfo(Oid relid)
+{
+	Relation	rel = relation_open(relid, AccessShareLock);
+	RelationLocInfo *copy = NULL;
+
+	/* Copy the relcache entry's locator info, if it has any */
+	if (rel != NULL && rel->rd_locator_info != NULL)
+		copy = CopyRelationLocInfo(rel->rd_locator_info);
+
+	relation_close(rel, AccessShareLock);
+
+	return copy;
+}
+
+/**
+ * Copy the RelationLocInfo struct
+ */
+RelationLocInfo *
+CopyRelationLocInfo(RelationLocInfo * src_info)
+{
+	RelationLocInfo *copy;
+
+	Assert(src_info);
+
+	/* Start from a zeroed struct; fields not set below stay zero */
+	copy = (RelationLocInfo *) palloc0(sizeof(RelationLocInfo));
+
+	/* Scalar fields */
+	copy->relid = src_info->relid;
+	copy->locatorType = src_info->locatorType;
+	copy->partAttrNum = src_info->partAttrNum;
+	copy->nodeCount = src_info->nodeCount;
+
+	/* Pointer fields get their own copies when present */
+	copy->partAttrName = src_info->partAttrName ? pstrdup(src_info->partAttrName) : NULL;
+	copy->nodeList = src_info->nodeList ? list_copy(src_info->nodeList) : NULL;
+
+	/* Note, for round robin, we use the relcache entry */
+
+	return copy;
+}
+
+
+/**
+ * Free RelationLocInfo struct
+ */
+void
+FreeRelationLocInfo(RelationLocInfo *relationLocInfo)
+{
+	/* Passing NULL is allowed and does nothing */
+	if (relationLocInfo)
+	{
+		if (relationLocInfo->partAttrName)
+			pfree(relationLocInfo->partAttrName);
+
+		/* The node list belongs to the struct too; list_free accepts NIL */
+		list_free(relationLocInfo->nodeList);
+
+		pfree(relationLocInfo);
+	}
+}
diff --git a/src/backend/pgxc/plan/Makefile b/src/backend/pgxc/plan/Makefile
new file mode 100644
index 0000000000..c0e65741f1
--- /dev/null
+++ b/src/backend/pgxc/plan/Makefile
@@ -0,0 +1,20 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for pgxc/plan
+#
+#
+# Portions Copyright(C) 2010 Nippon Telegraph and Telephone Corporation
+#
+# IDENTIFICATION
+# $PostgreSQL$
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/pgxc/plan
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = planner.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c
new file mode 100644
index 0000000000..90703c4896
--- /dev/null
+++ b/src/backend/pgxc/plan/planner.c
@@ -0,0 +1,1290 @@
+/*-------------------------------------------------------------------------
+ *
+ * planner.c
+ *
+ * Functions for generating a PGXC style plan.
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "access/transam.h"
+#include "catalog/pg_type.h"
+#include "nodes/parsenodes.h"
+#include "pgxc/locator.h"
+#include "pgxc/planner.h"
+#include "utils/fmgroids.h"
+#include "utils/lsyscache.h"
+
+
+/*
+ * Convenient format for literal comparisons
+ *
+ * One entry is recorded for every "hash partition column = constant"
+ * condition found while examining a query's conditions.
+ *
+ * PGXCTODO - make constant type Datum, handle other types
+ */
+typedef struct
+{
+ /* OID of the relation that owns the compared column */
+ Oid relid;
+ /* location info obtained from GetRelationLocInfo() for that relation */
+ RelationLocInfo *rel_loc_info;
+ /* NOTE(review): never assigned by the code visible in this file */
+ Oid attrnum;
+ /* name of the compared column */
+ char *col_name;
+ long constant; /* assume long PGXCTODO - should be Datum */
+} Literal_Comparison;
+
+/*
+ * This struct helps us detect special conditions to determine what nodes
+ * to execute on.
+ */
+typedef struct
+{
+ List *partitioned_literal_comps; /* List of Literal_Comparison */
+ /* List of OpExpr: joins between the hash columns of two relations */
+ List *partitioned_parent_child;
+ /* List of OpExpr: joins where at least one side is a replicated relation */
+ List *replicated_joins;
+
+ /*
+ * Used when joining a single replicated or non-replicated table with
+ * other replicated tables. Use as a basis for partitioning determination.
+ */
+ char *base_rel_name;
+ RelationLocInfo *base_rel_loc_info;
+
+} Special_Conditions;
+
+/*
+ * If two relations are joined based on special location information
+ *
+ * JOIN_REPLICATED            - at least one side is a replicated relation
+ * JOIN_COLOCATED_PARTITIONED - both sides are joined on their hash columns
+ * JOIN_OTHER                 - initial value of a new PGXC_Join; a join
+ *                              still in this state is treated as not
+ *                              single-node safe
+ */
+typedef enum PGXCJoinType
+{
+ JOIN_REPLICATED,
+ JOIN_COLOCATED_PARTITIONED,
+ JOIN_OTHER
+} PGXCJoinType;
+
+/*
+ * used to track which tables are joined
+ *
+ * new_pgxc_join() stores the pair so that relid1 <= relid2. The relids
+ * are relation OIDs passed in as int.
+ */
+typedef struct
+{
+ int relid1; /* the first relation */
+ char *aliasname1;
+ int relid2; /* the second relation */
+ char *aliasname2;
+
+ PGXCJoinType join_type;
+} PGXC_Join;
+
+/* List of PGXC_Join structs, one for each pair of joined relations. */
+List *join_list = NULL;
+
+/* Forbid unsafe SQL statements */
+bool StrictStatementChecking = true;
+
+/* Forbid multi-node SELECT statements with an ORDER BY or LIMIT clause */
+bool StrictSelectChecking = false;
+
+/*
+ * new_pgxc_join - allocate a join-tracking struct for a pair of relations
+ *
+ * The pair is stored with the smaller relid first (each alias travels with
+ * its relid). The join type starts out as JOIN_OTHER; callers upgrade it
+ * once they learn more about the join.
+ */
+static PGXC_Join *
+new_pgxc_join(int relid1, char *aliasname1, int relid2, char *aliasname2)
+{
+    PGXC_Join  *result = (PGXC_Join *) palloc(sizeof(PGXC_Join));
+    bool        in_order = (relid1 < relid2);
+
+    result->relid1 = in_order ? relid1 : relid2;
+    result->aliasname1 = in_order ? aliasname1 : aliasname2;
+
+    result->relid2 = in_order ? relid2 : relid1;
+    result->aliasname2 = in_order ? aliasname2 : aliasname1;
+
+    result->join_type = JOIN_OTHER;
+
+    return result;
+}
+
+
+/*
+ * find_pgxc_join - look up the join struct for a particular join
+ *
+ * Returns the entry of join_list that describes the join between the two
+ * given relations (identified by relid and alias), or NULL if there is
+ * none. The argument order does not matter.
+ */
+static PGXC_Join *
+find_pgxc_join(int relid1, char *aliasname1, int relid2, char *aliasname2)
+{
+    ListCell   *lc;
+
+    /* return if list is still empty */
+    if (join_list == NULL)
+        return NULL;
+
+    /* in the PGXC_Join struct, we always sort with relid1 < relid2 */
+    if (relid2 < relid1)
+    {
+        int         tmp = relid1;
+        char       *tmpalias = aliasname1;
+
+        relid1 = relid2;
+        aliasname1 = aliasname2;
+        relid2 = tmp;
+        aliasname2 = tmpalias;
+    }
+
+    /*
+     * there should be a small number, so we just search linearly, although
+     * long term a hash table would be better.
+     */
+    foreach(lc, join_list)
+    {
+        PGXC_Join  *pgxcjoin = (PGXC_Join *) lfirst(lc);
+
+        if (pgxcjoin->relid1 != relid1 || pgxcjoin->relid2 != relid2)
+            continue;
+
+        if (strcmp(pgxcjoin->aliasname1, aliasname1) == 0
+            && strcmp(pgxcjoin->aliasname2, aliasname2) == 0)
+            return pgxcjoin;
+
+        /*
+         * For a self-join both relids are equal, so sorting by relid does
+         * not give the aliases a canonical order. Accept the swapped alias
+         * order as well; otherwise the same join would be tracked twice and
+         * the second entry could wrongly remain JOIN_OTHER.
+         */
+        if (relid1 == relid2
+            && strcmp(pgxcjoin->aliasname1, aliasname2) == 0
+            && strcmp(pgxcjoin->aliasname2, aliasname1) == 0)
+            return pgxcjoin;
+    }
+
+    return NULL;
+}
+
+/*
+ * find_or_create_pgxc_join - return the tracking struct for a join between
+ * two relations, registering a fresh one in join_list when none exists yet
+ */
+static PGXC_Join *
+find_or_create_pgxc_join(int relid1, char *aliasname1, int relid2, char *aliasname2)
+{
+    PGXC_Join  *existing = find_pgxc_join(relid1, aliasname1,
+                                          relid2, aliasname2);
+    PGXC_Join  *created;
+
+    if (existing != NULL)
+        return existing;
+
+    /* first time we see this pair: create the entry and remember it */
+    created = new_pgxc_join(relid1, aliasname1, relid2, aliasname2);
+    join_list = lappend(join_list, created);
+
+    return created;
+}
+
+
+/*
+ * new_special_conditions - allocate a zero-filled Special_Conditions struct
+ *
+ * All list and pointer members start out as NULL.
+ */
+static Special_Conditions *
+new_special_conditions()
+{
+    return (Special_Conditions *) palloc0(sizeof(Special_Conditions));
+}
+
+/*
+ * free_special_relations - free a Special_Conditions struct
+ *
+ * Releases the struct, its lists, the Literal_Comparison entries and the
+ * RelationLocInfo copy each of those entries owns. The expressions
+ * referenced from the two join lists belong to the query tree and are
+ * left alone. A NULL argument is accepted and ignored.
+ *
+ * base_rel_loc_info is not freed here.
+ */
+static void
+free_special_relations(Special_Conditions * special_conditions)
+{
+    ListCell   *lc;
+
+    if (special_conditions == NULL)
+        return;
+
+    /*
+     * list_free_deep() below only pfree's the Literal_Comparison structs
+     * themselves, so release the location info they point at first.
+     */
+    foreach(lc, special_conditions->partitioned_literal_comps)
+    {
+        Literal_Comparison *lit_comp = (Literal_Comparison *) lfirst(lc);
+
+        FreeRelationLocInfo(lit_comp->rel_loc_info);
+        lit_comp->rel_loc_info = NULL;
+    }
+
+    /* free all items in list, including Literal_Comparison struct */
+    list_free_deep(special_conditions->partitioned_literal_comps);
+
+    /* free list, but not items pointed to */
+    list_free(special_conditions->partitioned_parent_child);
+    list_free(special_conditions->replicated_joins);
+
+    pfree(special_conditions);
+}
+
+/*
+ * free_join_list - free join_list and every PGXC_Join struct in it
+ *
+ * The global is reset to NULL afterwards so that no later lookup can walk
+ * freed memory.
+ */
+static void
+free_join_list(void)
+{
+    if (join_list == NULL)
+        return;
+
+    /* free all items in list including PGXC_Join struct */
+    list_free_deep(join_list);
+    join_list = NULL;
+}
+
+/*
+ * get_numeric_constant - extract casted constant
+ *
+ * Returns expr itself when it is a Const. When it is a call of one of the
+ * two numeric cast functions recognized here (F_NUMERIC, F_INT4_NUMERIC),
+ * the first argument of the call is inspected the same way, which also
+ * handles an implicit double cast. Anything else, including a NULL
+ * expression, yields NULL.
+ */
+static Expr *
+get_numeric_constant(Expr *expr)
+{
+    FuncExpr   *cast;
+
+    if (expr == NULL)
+        return NULL;
+
+    if (IsA(expr, Const))
+        return expr;
+
+    /* A cast is represented by a function call; nothing else qualifies */
+    if (!IsA(expr, FuncExpr))
+        return NULL;
+
+    cast = (FuncExpr *) expr;
+
+    if (cast->funcid != F_NUMERIC && cast->funcid != F_INT4_NUMERIC)
+        return NULL;
+
+    /* look through the cast at what is being cast */
+    return get_numeric_constant(linitial(cast->args));
+}
+
+
+/*
+ * get_base_var - determine the base table column a Var refers to
+ *
+ * This is required because a RangeTblEntry may actually be another
+ * type, like a join, and we need to then look at the joinaliasvars
+ * to determine what the base table and column really is.
+ *
+ * Returns the Var that references an RTE_RELATION entry, or NULL when the
+ * reference cannot be resolved to a plain table column (out-of-range
+ * range table or column index, a join output column that is not a simple
+ * Var, or a range table entry that is neither a relation nor a join).
+ */
+static Var *
+get_base_var(Var * var, List *rtables)
+{
+    RangeTblEntry *rte;
+    Node       *aliasvar;
+
+    if (var == NULL)
+        return NULL;
+
+    /* varno is a 1-based index into the range table */
+    if (var->varno < 1 || var->varno > list_length(rtables))
+        return NULL;
+
+    /* get the RangeTableEntry */
+    rte = list_nth(rtables, var->varno - 1);
+
+    if (rte->rtekind == RTE_RELATION)
+        return var;
+
+    if (rte->rtekind != RTE_JOIN)
+        return NULL;
+
+    /* whole-row references (varattno 0) have no joinaliasvars entry */
+    if (var->varattno < 1 || var->varattno > list_length(rte->joinaliasvars))
+        return NULL;
+
+    aliasvar = (Node *) list_nth(rte->joinaliasvars, var->varattno - 1);
+
+    /*
+     * A join output column is not always a plain Var (it can be an
+     * expression for merged columns), so check before treating it as one.
+     */
+    if (aliasvar == NULL || !IsA(aliasvar, Var))
+        return NULL;
+
+    /* continue resolving recursively */
+    return get_base_var((Var *) aliasvar, rtables);
+}
+
+
+/*
+ * get_plan_nodes_insert - determine nodes on which to execute insert.
+ *
+ * Returns NULL when the statement has a FROM list, does not have exactly
+ * one range table entry, or that entry is not a plain relation. Otherwise
+ * returns whatever GetRelationNodes() yields for the target relation. For
+ * a hash partitioned table the value inserted into the partitioning
+ * column is passed along when it is an int2/int4/int8 constant (possibly
+ * wrapped in a numeric cast).
+ *
+ * Raises an error if no location info can be found for the relation.
+ */
+static List *
+get_plan_nodes_insert(Query * query)
+{
+    RangeTblEntry *rte;
+    RelationLocInfo *rel_loc_info;
+    ListCell   *lc;
+    long        part_value;
+    long       *part_value_ptr = NULL;
+
+    /* Looks complex (correlated?) - best to skip */
+    if (query->jointree != NULL && query->jointree->fromlist != NULL)
+        return NULL;
+
+    /* Make sure there is just one table */
+    if (query->rtable == NULL || query->rtable->length != 1)
+        return NULL;
+
+    rte = (RangeTblEntry *) linitial(query->rtable);
+
+    /* Missing entry or bad relation type; rte is dereferenced below */
+    if (rte == NULL || rte->rtekind != RTE_RELATION)
+        return NULL;
+
+    /* See if we have the partitioned case. */
+    rel_loc_info = GetRelationLocInfo(rte->relid);
+
+    if (!rel_loc_info)
+        ereport(ERROR,
+                (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+                 (errmsg("Could not find relation for oid = %u", rte->relid))));
+
+    if (rel_loc_info->locatorType == LOCATOR_TYPE_HASH
+        && rel_loc_info->partAttrName != NULL)
+    {
+        /* It is a partitioned table, get value by looking in targetList */
+        foreach(lc, query->targetList)
+        {
+            TargetEntry *tle = (TargetEntry *) lfirst(lc);
+            Expr       *checkexpr;
+            Const      *constant;
+
+            if (tle->resjunk || tle->resname == NULL)
+                continue;
+
+            /* Only the designated partitioning column is of interest */
+            if (strcmp(tle->resname, rel_loc_info->partAttrName) != 0)
+                continue;
+
+            /* We may have a cast, try and handle it */
+            checkexpr = get_numeric_constant(tle->expr);
+
+            if (checkexpr == NULL)
+                break;          /* no constant */
+
+            constant = (Const *) checkexpr;
+
+            /*
+             * NOTE(review): the Datum is converted with a plain cast and
+             * constisnull is not consulted; the same conversion is used for
+             * literal comparisons in WHERE clauses, so the two must be
+             * changed together.
+             */
+            if (constant->consttype == INT4OID
+                || constant->consttype == INT2OID
+                || constant->consttype == INT8OID)
+            {
+                part_value = (long) constant->constvalue;
+                part_value_ptr = &part_value;
+            }
+            /* PGXCTODO - handle other data types */
+
+            /* the partitioning column has been dealt with */
+            break;
+        }
+    }
+
+    /* single call handles both replicated and partitioned types */
+    return GetRelationNodes(rel_loc_info, part_value_ptr, false);
+}
+
+
+/*
+ * set_base_relation - remember the relation used as the basis for node
+ * determination, releasing the location info noted earlier (if any)
+ */
+static void
+set_base_relation(Special_Conditions * conditions, char *rel_name,
+                  RelationLocInfo * rel_loc_info)
+{
+    if (conditions->base_rel_loc_info != NULL
+        && conditions->base_rel_loc_info != rel_loc_info)
+        FreeRelationLocInfo(conditions->base_rel_loc_info);
+
+    conditions->base_rel_name = rel_name;
+    conditions->base_rel_loc_info = rel_loc_info;
+}
+
+/*
+ * examine_conditions
+ *
+ * Examine conditions and find special ones to later help us determine
+ * what tables can be joined together. Put findings in Special_Conditions
+ * struct.
+ *
+ * Get list of constant comparisons conditions on partitioned column
+ * Get list of parent-child joins (partitioned together)
+ * Get list of joins with replicated tables
+ *
+ * Joins between two relations are also recorded in join_list, together
+ * with the kind of join that was detected.
+ *
+ * Returns false when the expression is too complicated to analyze (a
+ * boolean operator other than AND/OR, a column that cannot be resolved to
+ * a base table, or a relation without location info), otherwise true.
+ * A NULL expression or NULL range table is trivially true.
+ *
+ * PGXCTODO: Recognize subqueries, and give up (long term allow safe ones).
+ *
+ */
+static bool
+examine_conditions(Special_Conditions * conditions, List *rtables, Node *expr_node)
+{
+    OpExpr     *opexpr;
+    Expr       *arg1;
+    Expr       *arg2;
+    Expr       *checkexpr;
+    Var        *colvar;
+    RangeTblEntry *rte1;
+    char       *rel_name;
+    char       *col_name = NULL;
+    RelationLocInfo *rel_loc_info1;
+
+    if (expr_node == NULL)
+        return true;
+
+    if (rtables == NULL)
+        return true;
+
+    if (conditions == NULL)
+        conditions = new_special_conditions();
+
+    if (IsA(expr_node, BoolExpr))
+    {
+        BoolExpr   *boolexpr = (BoolExpr *) expr_node;
+        ListCell   *lc;
+
+        /*
+         * Recursively handle ANDed expressions.
+         *
+         * OR is examined the same way as a work-around for a reported
+         * issue. NOTE: THIS IS NOT CORRECT, BUT JUST DONE FOR THE
+         * PROTOTYPE. PGXCTODO: Add careful checking for OR'ed conditions.
+         *
+         * Anything else looks complicated, give up.
+         */
+        if (boolexpr->boolop != AND_EXPR && boolexpr->boolop != OR_EXPR)
+            return false;
+
+        /* AND/OR can carry more than two arguments; look at all of them */
+        foreach(lc, boolexpr->args)
+        {
+            if (!examine_conditions(conditions, rtables, (Node *) lfirst(lc)))
+                return false;
+        }
+
+        return true;
+    }
+
+    /* PGXCTODO - need to more finely examine other operators */
+    if (!IsA(expr_node, OpExpr))
+        return true;
+
+    opexpr = (OpExpr *) expr_node;
+
+    /* See if we can equijoin these */
+    if (!op_mergejoinable(opexpr->opno) || list_length(opexpr->args) != 2)
+        return true;
+
+    arg1 = linitial(opexpr->args);
+    arg2 = lsecond(opexpr->args);
+
+    /* Look for a table column on the left-hand side */
+    if (!IsA(arg1, Var))
+        return true;
+
+    colvar = get_base_var((Var *) arg1, rtables);
+    if (!colvar)
+        return false;
+
+    rte1 = list_nth(rtables, colvar->varno - 1);
+    rel_name = get_rel_name(rte1->relid);
+
+    /*
+     * System columns and whole-row references have varattno <= 0 and no
+     * entry in colnames; leave col_name NULL so that they never count as
+     * the partitioning column.
+     */
+    if (colvar->varattno >= 1
+        && colvar->varattno <= list_length(rte1->eref->colnames))
+        col_name = strVal(list_nth(rte1->eref->colnames,
+                                   colvar->varattno - 1));
+
+    /* Look at other argument. We may have a cast, try and handle it */
+    checkexpr = get_numeric_constant(arg2);
+    if (checkexpr != NULL)
+        arg2 = checkexpr;
+
+    if (IsA(arg2, Const))
+    {
+        /* We have column = literal. Check if partitioned case */
+        Const      *constant = (Const *) arg2;
+        Literal_Comparison *lit_comp;
+
+        rel_loc_info1 = GetRelationLocInfo(rte1->relid);
+        if (!rel_loc_info1)
+            return false;
+
+        /* If hash partitioned, check if the part column was used */
+        if (col_name == NULL || !IsHashColumn(rel_loc_info1, col_name))
+        {
+            /* unimportant comparison, just return */
+            FreeRelationLocInfo(rel_loc_info1);
+            return true;
+        }
+
+        /* add to partitioned literal join conditions */
+        lit_comp = palloc0(sizeof(Literal_Comparison));
+        lit_comp->relid = rte1->relid;
+        lit_comp->rel_loc_info = rel_loc_info1;
+        lit_comp->col_name = col_name;
+        lit_comp->constant = constant->constvalue;
+
+        conditions->partitioned_literal_comps =
+            lappend(conditions->partitioned_literal_comps, lit_comp);
+
+        return true;
+    }
+
+    if (IsA(arg2, Var))
+    {
+        PGXC_Join  *pgxc_join;
+        Var        *colvar2;
+        RangeTblEntry *rte2;
+        char       *rel_name2;
+        char       *col_name2 = NULL;
+        RelationLocInfo *rel_loc_info2;
+
+        /* resolve the other side first so nothing is leaked on failure */
+        colvar2 = get_base_var((Var *) arg2, rtables);
+        if (!colvar2)
+            return false;
+
+        rte2 = list_nth(rtables, colvar2->varno - 1);
+
+        rel_loc_info1 = GetRelationLocInfo(rte1->relid);
+        if (!rel_loc_info1)
+            return false;
+
+        rel_loc_info2 = GetRelationLocInfo(rte2->relid);
+        if (!rel_loc_info2)
+        {
+            /* without location info we cannot judge this join */
+            FreeRelationLocInfo(rel_loc_info1);
+            return false;
+        }
+
+        rel_name2 = get_rel_name(rte2->relid);
+
+        /* get data struct about these two relations joining */
+        pgxc_join = find_or_create_pgxc_join(rte1->relid, rte1->eref->aliasname,
+                                             rte2->relid, rte2->eref->aliasname);
+
+        if (rel_loc_info1->locatorType == LOCATOR_TYPE_REPLICATED)
+        {
+            /* add to replicated join conditions */
+            conditions->replicated_joins =
+                lappend(conditions->replicated_joins, opexpr);
+
+            if (rel_loc_info2->locatorType != LOCATOR_TYPE_REPLICATED)
+            {
+                /* Note other relation, saves us work later. */
+                set_base_relation(conditions, rel_name2, rel_loc_info2);
+                FreeRelationLocInfo(rel_loc_info1);
+            }
+            else if (conditions->base_rel_name == NULL)
+            {
+                /* both replicated and nothing noted yet: use the first */
+                set_base_relation(conditions, rel_name, rel_loc_info1);
+                FreeRelationLocInfo(rel_loc_info2);
+            }
+            else
+            {
+                /* both replicated, keep the relation noted earlier */
+                FreeRelationLocInfo(rel_loc_info1);
+                FreeRelationLocInfo(rel_loc_info2);
+            }
+
+            /* note nature of join between the two relations */
+            pgxc_join->join_type = JOIN_REPLICATED;
+            return true;
+        }
+
+        if (rel_loc_info2->locatorType == LOCATOR_TYPE_REPLICATED)
+        {
+            /* add to replicated join conditions */
+            conditions->replicated_joins =
+                lappend(conditions->replicated_joins, opexpr);
+
+            /* other relation not replicated, note it for later */
+            set_base_relation(conditions, rel_name, rel_loc_info1);
+            FreeRelationLocInfo(rel_loc_info2);
+
+            /* note nature of join between the two relations */
+            pgxc_join->join_type = JOIN_REPLICATED;
+            return true;
+        }
+
+        /*
+         * Now check for a partitioned join.
+         *
+         * PGXCTODO - for the prototype, we assume all partitioned
+         * tables are on the same nodes.
+         */
+        if (colvar2->varattno >= 1
+            && colvar2->varattno <= list_length(rte2->eref->colnames))
+            col_name2 = strVal(list_nth(rte2->eref->colnames,
+                                        colvar2->varattno - 1));
+
+        if (col_name != NULL && col_name2 != NULL
+            && IsHashColumn(rel_loc_info1, col_name)
+            && IsHashColumn(rel_loc_info2, col_name2))
+        {
+            /* We found a partitioned join */
+            conditions->partitioned_parent_child =
+                lappend(conditions->partitioned_parent_child, opexpr);
+            pgxc_join->join_type = JOIN_COLOCATED_PARTITIONED;
+        }
+
+        /*
+         * Otherwise there is some other type of join that can probably
+         * not be executed on only a single node. Important: We preserve
+         * the previous pgxc_join->join_type value, there may be multiple
+         * columns joining two tables, and we want to make sure at least
+         * one of them makes it colocated partitioned, in which case it is
+         * updated when examining that other condition.
+         */
+
+        /* neither copy was stored anywhere, so release both */
+        FreeRelationLocInfo(rel_loc_info1);
+        FreeRelationLocInfo(rel_loc_info2);
+        return true;
+    }
+
+    /* comparison against something other than a literal or a column */
+    return true;
+}
+
+/*
+ * examine_conditions_fromlist - Examine FROM clause for joins
+ *
+ * Examine FROM clause join conditions to determine special conditions
+ * to help us decide which nodes to execute on.
+ *
+ * Walks a JoinExpr tree, examining both inputs and then the join quals.
+ * A plain RangeTblRef is always acceptable; a bare BoolExpr or OpExpr is
+ * handed to examine_conditions(). Returns false as soon as something
+ * cannot be analyzed, including any other node type.
+ */
+static bool
+examine_conditions_fromlist(Special_Conditions * conditions, List *rtables,
+                            Node *treenode)
+{
+    if (treenode == NULL)
+        return true;
+
+    if (rtables == NULL)
+        return true;
+
+    if (conditions == NULL)
+        conditions = new_special_conditions();
+
+    if (IsA(treenode, JoinExpr))
+    {
+        JoinExpr   *joinexpr = (JoinExpr *) treenode;
+
+        /* recursively examine FROM join tree */
+        if (!examine_conditions_fromlist(conditions, rtables, joinexpr->larg))
+            return false;
+
+        if (!examine_conditions_fromlist(conditions, rtables, joinexpr->rarg))
+            return false;
+
+        /* Now look at join condition */
+        return examine_conditions(conditions, rtables, joinexpr->quals);
+    }
+
+    if (IsA(treenode, RangeTblRef))
+        return true;
+
+    /*
+     * Check base condition, if possible. The verdict of the examination is
+     * the result; do not fall through to the failure return below.
+     */
+    if (IsA(treenode, BoolExpr) ||IsA(treenode, OpExpr))
+        return examine_conditions(conditions, rtables, treenode);
+
+    /* Some other more complicated beast */
+    return false;
+}
+
+
+/*
+ * get_plan_nodes - determine the nodes to execute the command on.
+ *
+ * Examines the "special" query conditions in determining execution node list.
+ *
+ * query_plan is currently unused. isRead is passed on to
+ * GetRelationNodes().
+ *
+ * Returns NULL if it appears to be a multi-step query. The
+ * Special_Conditions struct and join_list built along the way are always
+ * released before returning.
+ */
+static List *
+get_plan_nodes(Query_Plan * query_plan, Query * query, bool isRead)
+{
+    List       *nodelist = NULL;
+    ListCell   *lc;
+    Special_Conditions *special_conditions;
+    RangeTblEntry *rte;
+    RelationLocInfo *rel_loc_info;
+
+    join_list = NULL;
+
+    /* If no tables, just return; jointree is dereferenced below */
+    if (query->rtable == NULL || query->jointree == NULL)
+        return NULL;
+
+    /* Alloc and init struct */
+    special_conditions = new_special_conditions();
+
+    /* Look for special conditions, starting with JOIN syntax joins */
+    foreach(lc, query->jointree->fromlist)
+    {
+        Node       *treenode = (Node *) lfirst(lc);
+
+        if (IsA(treenode, JoinExpr))
+        {
+            /* if too complicated, just return NULL */
+            if (!examine_conditions_fromlist(special_conditions, query->rtable,
+                                             treenode))
+                goto done;
+        }
+        else if (!IsA(treenode, RangeTblRef))
+        {
+            /* could be complicated */
+            goto done;
+        }
+    }
+
+    /* Examine the WHERE clause, too; cross joins may exist on failure */
+    if (!examine_conditions(special_conditions, query->rtable,
+                            query->jointree->quals))
+        goto done;
+
+    /* Examine join conditions, see if each join is single-node safe */
+    foreach(lc, join_list)
+    {
+        PGXC_Join  *pgxcjoin = (PGXC_Join *) lfirst(lc);
+
+        /* If it is not replicated or parent-child, not single-node safe */
+        if (pgxcjoin->join_type == JOIN_OTHER)
+            goto done;
+    }
+
+    if (special_conditions->partitioned_parent_child == NULL &&
+        special_conditions->partitioned_literal_comps == NULL)
+    {
+        /* non-partitioned cases */
+
+        /* Several tables without a usable join: too complicated */
+        if (special_conditions->replicated_joins == NULL
+            && list_length(query->rtable) > 1)
+            goto done;
+
+        /*
+         * We have either a single table, just replicated tables, or a
+         * table that just joins with replicated tables.
+         */
+
+        /* See if we noted a table earlier to use */
+        rel_loc_info = special_conditions->base_rel_loc_info;
+
+        if (rel_loc_info == NULL)
+        {
+            /*
+             * a single table, just grab it
+             *
+             * NOTE(review): the entry is not checked to be RTE_RELATION
+             * before its relid is used.
+             */
+            rte = (RangeTblEntry *) linitial(query->rtable);
+            rel_loc_info = GetRelationLocInfo(rte->relid);
+
+            if (!rel_loc_info)
+                goto done;
+        }
+
+        nodelist = GetRelationNodes(rel_loc_info, NULL, isRead);
+    }
+    else if (special_conditions->partitioned_literal_comps != NULL)
+    {
+        /* partitioned col comparison against a literal */
+        bool        first = true;
+
+        /*
+         * Make sure that if there are multiple such comparisons, that they
+         * are all on the same nodes. Stop at the first disagreement so a
+         * later comparison cannot revive a node list already rejected.
+         */
+        foreach(lc, special_conditions->partitioned_literal_comps)
+        {
+            Literal_Comparison *lit_comp = (Literal_Comparison *) lfirst(lc);
+            List       *test_nodelist;
+
+            test_nodelist = GetRelationNodes(lit_comp->rel_loc_info,
+                                             &(lit_comp->constant), true);
+
+            if (test_nodelist == NULL)
+            {
+                nodelist = NULL;
+                break;
+            }
+
+            if (first)
+            {
+                nodelist = test_nodelist;
+                first = false;
+                continue;
+            }
+
+            /* there should only be one node, and it must be the same one */
+            if (list_length(nodelist) > 1
+                || list_length(test_nodelist) > 1
+                || linitial_int(test_nodelist) != linitial_int(nodelist))
+            {
+                nodelist = NULL;
+                break;
+            }
+        }
+    }
+    else
+    {
+        /*
+         * At this point, we have partitioned parent child relationship, with
+         * no partitioned column comparison condition with a literal. We just
+         * use one of the tables as a basis for node determination.
+         */
+        OpExpr     *opexpr;
+        Var        *colvar;
+
+        opexpr = (OpExpr *) linitial(special_conditions->partitioned_parent_child);
+
+        /* the Var may reference a join alias; resolve it to its table */
+        colvar = get_base_var((Var *) linitial(opexpr->args), query->rtable);
+        if (colvar == NULL)
+            goto done;
+
+        /* get the RangeTableEntry */
+        rte = list_nth(query->rtable, colvar->varno - 1);
+        rel_loc_info = GetRelationLocInfo(rte->relid);
+
+        if (!rel_loc_info)
+            goto done;
+
+        nodelist = GetRelationNodes(rel_loc_info, NULL, isRead);
+    }
+
+done:
+    free_special_relations(special_conditions);
+    free_join_list();
+
+    return nodelist;
+}
+
+
+/*
+ * get_plan_nodes_command - determine the nodes to execute the plan on
+ *
+ * Dispatches on the command type of the query. SELECT is analyzed as a
+ * read; UPDATE and DELETE are analyzed like a SELECT but as a write;
+ * INSERT has its own routine. Any other command type yields NULL.
+ *
+ * return NULL if it is not safe to be done in a single step.
+ */
+static List *
+get_plan_nodes_command(Query_Plan * query_plan, Query * query)
+{
+    List       *nodes = NULL;
+
+    switch (query->commandType)
+    {
+        case CMD_SELECT:
+            nodes = get_plan_nodes(query_plan, query, true);
+            break;
+
+        case CMD_INSERT:
+            nodes = get_plan_nodes_insert(query);
+            break;
+
+        case CMD_UPDATE:
+        case CMD_DELETE:
+            /* treat as a select, but not as a read */
+            nodes = get_plan_nodes(query_plan, query, false);
+            break;
+
+        default:
+            break;
+    }
+
+    return nodes;
+}
+
+
+/*
+ * get_simple_aggregates - get list of simple aggregates used.
+ *
+ * Only applies when the query has aggregates and runs on more than one
+ * node. For now the target list must consist of exactly one entry; when
+ * that entry is an aggregate whose function OID lies in 2115..2121
+ * (treated here as the numeric MAX functions) a single SimpleAgg of type
+ * AGG_TYPE_MAX for column 1 is returned. Any other aggregate function
+ * raises an error. In all remaining cases the result is NULL.
+ */
+static List *
+get_simple_aggregates(Query * query, List *nodelist)
+{
+    TargetEntry *only_entry;
+    Aggref     *agg;
+    SimpleAgg  *max_agg;
+
+    /* Check for simple multi-node aggregate */
+    if (nodelist == NULL || nodelist->length <= 1 || !query->hasAggs)
+        return NULL;
+
+    /*
+     * long term check for group by, but for prototype just allow 1 simple
+     * expression
+     */
+    if (query->targetList->length != 1)
+        return NULL;
+
+    only_entry = (TargetEntry *) linitial(query->targetList);
+
+    if (!IsA(only_entry->expr, Aggref))
+        return NULL;
+
+    agg = (Aggref *) only_entry->expr;
+
+    /* Just consider numeric max functions for prototype */
+    if (agg->aggfnoid < 2115 || agg->aggfnoid > 2121)
+    {
+        ereport(ERROR,
+                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                 errmsg("Multinode aggregate for this function currently not supported")));
+    }
+
+    max_agg = (SimpleAgg *) palloc(sizeof(SimpleAgg));
+    max_agg->agg_type = AGG_TYPE_MAX;
+    max_agg->column_pos = 1;
+    max_agg->agg_data_type = agg->aggtype;
+    max_agg->response_count = 0;
+
+    return lappend(NULL, max_agg);
+}
+
+
+/*
+ * GetQueryPlan - build up a Query_Plan to execute on.
+ *
+ * parsetree      - raw parse tree of the statement, used to classify it
+ * sql_statement  - statement text; a copy is stored in the plan's step
+ * querytree_list - analyzed query trees; only the first one is used, and
+ *                  only for SELECT / INSERT / UPDATE / DELETE
+ *
+ * For the prototype, there will only be one step,
+ * and the nodelist will be NULL if it is not a PGXC-safe statement.
+ *
+ * Raises an error for statements that cannot be handled, unless
+ * StrictStatementChecking is turned off where noted below.
+ */
+Query_Plan *
+GetQueryPlan(Node *parsetree, const char *sql_statement, List *querytree_list)
+{
+    /* zero-fill so that a field not set below can never hold garbage */
+    Query_Plan *query_plan = palloc0(sizeof(Query_Plan));
+    Query_Step *query_step = palloc0(sizeof(Query_Step));
+    Query      *query;
+
+    query_plan->force_autocommit = false;
+
+    query_step->sql_statement = pstrdup(sql_statement);
+    query_step->nodelist = NULL;
+    query_step->simple_aggregates = NULL;
+
+    query_plan->query_step_list = lappend(NULL, query_step);
+
+    /*
+     * Determine where to execute the command, either at the Coordinator
+     * level, Data Nodes, or both. By default we choose both. We should be
+     * able to quickly expand this for more commands.
+     */
+    switch (nodeTag(parsetree))
+    {
+        case T_SelectStmt:
+        case T_InsertStmt:
+        case T_UpdateStmt:
+        case T_DeleteStmt:
+            /* just use first one in querytree_list */
+            query = (Query *) linitial(querytree_list);
+            query_step->nodelist =
+                get_plan_nodes_command(query_plan, query);
+            query_step->simple_aggregates =
+                get_simple_aggregates(query, query_step->nodelist);
+
+            /*
+             * See if it is a SELECT with no relations, like SELECT 1+1 or
+             * SELECT nextval('fred'), and just use coord.
+             */
+            if (query_step->nodelist == NULL
+                && (query->jointree->fromlist == NULL
+                    || query->jointree->fromlist->length == 0))
+                /* Just execute it on Coordinator */
+                query_plan->exec_loc_type = EXEC_ON_COORD;
+            else
+            {
+                query_plan->exec_loc_type = EXEC_ON_DATA_NODES;
+
+                if (query_step->nodelist == NULL)
+                {
+                    bool        is_pg_catalog = false;
+
+                    /* before giving up, see if we are dealing with pg_catalog */
+                    if (nodeTag(parsetree) == T_SelectStmt)
+                    {
+                        ListCell   *lc;
+
+                        is_pg_catalog = true;
+                        foreach(lc, query->rtable)
+                        {
+                            RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc);
+
+                            /* hack so that pg_catalog queries can run */
+                            if (rte->relid >= FirstNormalObjectId)
+                            {
+                                is_pg_catalog = false;
+                                break;
+                            }
+                        }
+                        if (is_pg_catalog)
+                            query_plan->exec_loc_type = EXEC_ON_COORD;
+                    }
+
+                    /*
+                     * If the nodelist is NULL, it is not safe for us to
+                     * execute
+                     */
+                    if (!is_pg_catalog && StrictStatementChecking)
+                        ereport(ERROR,
+                                (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+                                 (errmsg("Cannot safely execute statement in a single step."))));
+                }
+            }
+
+            /*
+             * PG-XC cannot yet support some variations of SQL statements.
+             * We perform some checks to at least catch common cases
+             */
+
+            /*
+             * Check if we have multiple nodes and an unsupported clause. This
+             * is temporary until we expand supported SQL
+             */
+            if (nodeTag(parsetree) == T_SelectStmt)
+            {
+                if (query->intoClause)
+                    ereport(ERROR,
+                            (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+                             (errmsg("INTO clause not yet supported"))));
+
+                if (query->setOperations)
+                    ereport(ERROR,
+                            (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+                             (errmsg("UNION, INTERSECT and EXCEPT are not yet supported"))));
+
+                if (query_step->nodelist && query_step->nodelist->length > 1 && StrictStatementChecking)
+                {
+                    /*
+                     * PGXCTODO - this could be improved to check if the first
+                     * group by expression is the partitioning column
+                     */
+                    if (query->groupClause)
+                        ereport(ERROR,
+                                (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+                                 (errmsg("Multi-node GROUP BY not yet supported"))));
+                    if (query->limitCount && StrictSelectChecking)
+                        ereport(ERROR,
+                                (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+                                 (errmsg("Multi-node LIMIT not yet supported"))));
+                    if (query->sortClause && StrictSelectChecking)
+                        ereport(ERROR,
+                                (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+                                 (errmsg("Multi-node ORDER BY not yet supported"))));
+                    /* PGXCTODO - check if first column partitioning column */
+                    if (query->distinctClause)
+                        ereport(ERROR,
+                                (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+                                 (errmsg("Multi-node DISTINCT not yet supported"))));
+                    if (query->hasAggs)
+                        ereport(ERROR,
+                                (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+                                 (errmsg("Multi-node aggregates not yet supported"))));
+                }
+            }
+            break;
+
+            /* Statements that we only want to execute on the Coordinator */
+        case T_AlterSeqStmt:
+        case T_CommentStmt:
+        case T_CreateSeqStmt:
+        case T_VariableShowStmt:
+            query_plan->exec_loc_type = EXEC_ON_COORD;
+            break;
+
+            /* DROP: sequences live on the Coordinator only */
+        case T_DropStmt:
+            if (((DropStmt *) parsetree)->removeType == OBJECT_SEQUENCE)
+                query_plan->exec_loc_type = EXEC_ON_COORD;
+            else
+                query_plan->exec_loc_type = EXEC_ON_COORD | EXEC_ON_DATA_NODES;
+            break;
+
+            /*
+             * Statements that need to run in autocommit mode, on Coordinator
+             * and Data Nodes with suppressed implicit two phase commit.
+             */
+        case T_CheckPointStmt:
+        case T_ClusterStmt:
+        case T_CreatedbStmt:
+        case T_DropdbStmt:
+        case T_VacuumStmt:
+            query_plan->exec_loc_type = EXEC_ON_COORD | EXEC_ON_DATA_NODES;
+            query_plan->force_autocommit = true;
+            break;
+
+            /*
+             * Statements that we execute on both the Coordinator and Data Nodes
+             */
+        case T_AlterTableStmt:
+        case T_AlterDatabaseStmt:
+        case T_AlterDatabaseSetStmt:
+        case T_AlterDomainStmt:
+        case T_AlterObjectSchemaStmt:
+        case T_ConstraintsSetStmt:
+        case T_CreateDomainStmt:
+        case T_CreateEnumStmt:
+        case T_CreateStmt:
+        case T_CreateSchemaStmt:
+        case T_DeallocateStmt:  /* Allow for DEALLOCATE ALL */
+        case T_DiscardStmt:
+        case T_IndexStmt:
+        case T_LockStmt:
+        case T_ReindexStmt:
+        case T_RenameStmt:
+        case T_TruncateStmt:
+        case T_VariableSetStmt:
+
+            /*
+             * Also support these, should help later with pg_restore, although
+             * not very useful because of the pooler using the same user
+             */
+        case T_GrantStmt:
+        case T_GrantRoleStmt:
+        case T_CreateRoleStmt:
+        case T_AlterRoleStmt:
+        case T_DropRoleStmt:
+        case T_AlterOwnerStmt:
+        case T_DropOwnedStmt:
+        case T_ReassignOwnedStmt:
+            query_plan->exec_loc_type = EXEC_ON_COORD | EXEC_ON_DATA_NODES;
+            break;
+
+
+        case T_TransactionStmt:
+            switch (((TransactionStmt *) parsetree)->kind)
+            {
+                case TRANS_STMT_SAVEPOINT:
+                case TRANS_STMT_RELEASE:
+                case TRANS_STMT_ROLLBACK_TO:
+                    ereport(ERROR,
+                            (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+                             (errmsg("This type of transaction statement not yet supported"))));
+                    break;
+
+                default:
+                    break;      /* keep compiler quiet */
+            }
+            query_plan->exec_loc_type = EXEC_ON_COORD | EXEC_ON_DATA_NODES;
+            break;
+
+            /*
+             * For now, pick one of the data nodes until we modify real
+             * planner. It will give an approximate idea of what an isolated
+             * data node will do
+             */
+        case T_ExplainStmt:
+            query_step->nodelist = lappend_int(query_step->nodelist, GetAnyDataNode());
+            query_plan->exec_loc_type = EXEC_ON_DATA_NODES;
+            break;
+
+            /*
+             * Statements we do not yet want to handle.
+             * By default they would be forbidden, but we list these for reference.
+             * Note that there is not a 1-1 correspondence between
+             * SQL command and the T_*Stmt structures.
+             */
+        case T_AlterFdwStmt:
+        case T_AlterForeignServerStmt:
+        case T_AlterFunctionStmt:
+        case T_AlterOpFamilyStmt:
+        case T_AlterTSConfigurationStmt:
+        case T_AlterTSDictionaryStmt:
+        case T_AlterUserMappingStmt:
+        case T_ClosePortalStmt:
+        case T_CompositeTypeStmt:
+        case T_CreateCastStmt:
+        case T_CreateConversionStmt:
+        case T_CreateFdwStmt:
+        case T_CreateFunctionStmt:
+        case T_CreateForeignServerStmt:
+        case T_CreateOpClassStmt:
+        case T_CreateOpFamilyStmt:
+        case T_CreatePLangStmt:
+        case T_CreateTableSpaceStmt:
+        case T_CreateTrigStmt:
+        case T_CreateUserMappingStmt:
+        case T_DeclareCursorStmt:
+        case T_DefineStmt:      /* used for aggregates, some types */
+        case T_DropCastStmt:
+        case T_DropFdwStmt:
+        case T_DropForeignServerStmt:
+        case T_DropPLangStmt:
+        case T_DropPropertyStmt:
+        case T_DropTableSpaceStmt:
+        case T_ExecuteStmt:
+        case T_FetchStmt:
+        case T_ListenStmt:
+        case T_LoadStmt:
+        case T_NotifyStmt:
+        case T_PrepareStmt:
+        case T_RemoveFuncStmt:
+        case T_RemoveOpClassStmt:
+        case T_RemoveOpFamilyStmt:
+        case T_RuleStmt:
+        case T_UnlistenStmt:
+        case T_ViewStmt:
+            /* fall through */
+        default:
+            /* Allow for override */
+            if (StrictStatementChecking)
+                ereport(ERROR,
+                        (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+                         (errmsg("This command is not yet supported."))));
+            else
+                query_plan->exec_loc_type = EXEC_ON_COORD | EXEC_ON_DATA_NODES;
+            break;
+    }
+
+
+    return query_plan;
+}
+
+
+/*
+ * free_query_step - release a Query_Step and what it owns
+ *
+ * Frees the statement text, the node list, the simple aggregate entries
+ * (list and elements) and finally the struct. NULL is ignored.
+ */
+static void
+free_query_step(Query_Step * query_step)
+{
+    if (!query_step)
+        return;
+
+    /* aggregate descriptors are owned by the step, so free them deeply */
+    if (query_step->simple_aggregates != NULL)
+        list_free_deep(query_step->simple_aggregates);
+
+    list_free(query_step->nodelist);
+    pfree(query_step->sql_statement);
+
+    pfree(query_step);
+}
+
+/*
+ * FreeQueryPlan - free a Query_Plan struct
+ *
+ * Releases every Query_Step in the plan, the step list itself and the
+ * plan struct. A NULL argument is accepted and ignored.
+ */
+void
+FreeQueryPlan(Query_Plan * query_plan)
+{
+    ListCell   *item;
+
+    if (query_plan == NULL)
+        return;
+
+    foreach(item, query_plan->query_step_list)
+    {
+        /*
+         * The list stores pointers, so fetch them with lfirst();
+         * lfirst_int() would read the cell as an int and truncate the
+         * pointer on 64-bit platforms.
+         */
+        free_query_step((Query_Step *) lfirst(item));
+    }
+
+    /* list_free() releases the cells as well as the list header */
+    list_free(query_plan->query_step_list);
+    pfree(query_plan);
+}
diff --git a/src/backend/pgxc/pool/Makefile b/src/backend/pgxc/pool/Makefile
new file mode 100644
index 0000000000..7143af5d97
--- /dev/null
+++ b/src/backend/pgxc/pool/Makefile
@@ -0,0 +1,19 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for pool
+#
+# Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+#
+# IDENTIFICATION
+# $PostgreSQL$
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/pgxc/pool
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = combiner.o datanode.o poolmgr.o poolcomm.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/pgxc/pool/combiner.c b/src/backend/pgxc/pool/combiner.c
new file mode 100644
index 0000000000..da59c5f6af
--- /dev/null
+++ b/src/backend/pgxc/pool/combiner.c
@@ -0,0 +1,375 @@
+/*-------------------------------------------------------------------------
+ *
+ * combiner.c
+ *
+ * Combine responses from multiple Data Nodes
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include "pgxc/combiner.h"
+#include "pgxc/planner.h"
+#include "catalog/pg_type.h"
+#include "libpq/libpq.h"
+#include "libpq/pqformat.h"
+#include "utils/builtins.h"
+
+
+/*
+ * CreateResponseCombiner
+ *
+ * Create a structure to store parameters needed to combine responses from
+ * multiple connections as well as state information.
+ *
+ * node_count    number of connections whose responses are combined
+ * combine_type  how the row counts of the individual responses are merged
+ * dest          destination the combined messages are sent to
+ *
+ * Returns the new combiner.  All counters start at zero, the request type
+ * is REQUEST_TYPE_NOT_DEFINED and no aggregates are attached.
+ */
+ResponseCombiner
+CreateResponseCombiner(int node_count, CombineType combine_type,
+                       CommandDest dest)
+{
+    ResponseCombiner combiner;
+
+    /*
+     * ResponseCombiner is a typedef for pointer to ResponseCombinerData.
+     * palloc() reports out-of-memory through ereport(ERROR) and never
+     * returns NULL, so no NULL check is needed here.
+     */
+    combiner = (ResponseCombiner) palloc(sizeof(ResponseCombinerData));
+
+    combiner->node_count = node_count;
+    combiner->combine_type = combine_type;
+    combiner->dest = dest;
+    combiner->command_complete_count = 0;
+    combiner->row_count = 0;
+    combiner->request_type = REQUEST_TYPE_NOT_DEFINED;
+    combiner->description_count = 0;
+    combiner->simple_aggregates = NULL;
+
+    return combiner;
+}
+
+/*
+ * parse_row_count
+ *
+ * Parse out the row count that ends a command status response (for example
+ * the "12" in "INSERT 0 12") and convert it to integer.
+ *
+ * message   command status text
+ * len       length of message, INCLUDING the terminating \0
+ * rowcount  receives the parsed value; 0 when there are no trailing digits
+ *
+ * Returns the number of trailing decimal digits found, 0 if there are none.
+ */
+static int
+parse_row_count(const char *message, size_t len, int *rowcount)
+{
+    size_t      end;
+    size_t      start;
+    size_t      pos;
+
+    *rowcount = 0;
+
+    /* nothing to parse; also keeps "len - 1" below from wrapping around */
+    if (message == NULL || len == 0)
+        return 0;
+
+    /* skip \0 string terminator */
+    end = len - 1;
+
+    /* walk backwards to the first character of the trailing digit run */
+    start = end;
+    while (start > 0 && message[start - 1] >= '0' && message[start - 1] <= '9')
+        start--;
+
+    /*
+     * Accumulate from the most significant digit.  Accumulating while
+     * scanning backwards would mirror the number ("12" would become 21).
+     */
+    for (pos = start; pos < end; pos++)
+        *rowcount = *rowcount * 10 + (message[pos] - '0');
+
+    return (int) (end - start);
+}
+
+/*
+ * parse_aggregate_value
+ *
+ * Extract the aggregate element result from a DataRow message body and
+ * return it as an integer.
+ *
+ * The body is read as: 2 bytes that are skipped, a 4-byte length in network
+ * byte order, then that many bytes of value text.  Only this first value is
+ * examined.
+ *
+ * simple_agg  aggregate being combined; its data_len is set to the length
+ *             of the value just parsed (0 for a NULL value)
+ * msg_body    message body, starting right after the message length word
+ * len         number of bytes available in msg_body
+ *
+ * Returns the value converted with atol(); 0 for a NULL value.  Raises an
+ * error when the advertised length does not fit into the message.
+ */
+static unsigned long
+parse_aggregate_value(SimpleAgg * simple_agg, char *msg_body, size_t len)
+{
+    const unsigned char *raw = (const unsigned char *) msg_body;
+    int32       value_len;
+    unsigned long value;
+    char       *valstr;
+
+    Assert(len >= 7);
+
+    /* PGXCTODO - handle pos (position) */
+    /* PGXCTODO - handle other types like TEXT */
+
+    /* the header is 2 skipped bytes plus the 4-byte length */
+    if (len < 6)
+        ereport(ERROR,
+                (errcode(ERRCODE_PROTOCOL_VIOLATION),
+                 errmsg("data row message is too short")));
+
+    /*
+     * Skip first 2 bytes, then decode the length byte by byte: it arrives
+     * in network byte order, so a plain memcpy() only gives the right
+     * number on big-endian hosts.  The length is decoded for every row
+     * because values coming from different nodes differ in length.
+     */
+    value_len = (int32) (((uint32) raw[2] << 24) |
+                         ((uint32) raw[3] << 16) |
+                         ((uint32) raw[4] << 8) |
+                         (uint32) raw[5]);
+
+    /* a negative length marks a NULL value: nothing to convert */
+    if (value_len <= 0)
+    {
+        simple_agg->data_len = 0;
+        return 0;
+    }
+
+    /* never read past the end of the message */
+    if ((size_t) value_len > len - 6)
+        ereport(ERROR,
+                (errcode(ERRCODE_PROTOCOL_VIOLATION),
+                 errmsg("data row value length exceeds message length")));
+
+    simple_agg->data_len = value_len;
+
+    /* the value is not \0 terminated in the message, so copy and terminate */
+    valstr = (char *) palloc(value_len + 1);
+    memcpy(valstr, msg_body + 6, value_len);
+    valstr[value_len] = '\0';
+
+    value = (unsigned long) atol(valstr);
+    pfree(valstr);
+
+    return value;
+}
+
+
+/*
+ * process_aggregate_element
+ *
+ * Fold the result received from one node into the running value of every
+ * aggregate on the list.  Only AGG_TYPE_MAX is handled; any other type
+ * raises an error.  Each aggregate's response_count is read here but not
+ * advanced.  Nothing is returned.
+ */
+static void
+process_aggregate_element(List *simple_aggregates, char *msg_body, size_t len)
+{
+    ListCell   *cell;
+
+    foreach(cell, simple_aggregates)
+    {
+        SimpleAgg  *agg = (SimpleAgg *) lfirst(cell);
+        unsigned long node_value;
+
+        /* PGXCTODO may need to support numeric, too. */
+        node_value = parse_aggregate_value(agg, msg_body, len);
+
+        if (agg->agg_type == AGG_TYPE_MAX)
+        {
+            /*
+             * The first response is taken as is, later ones only when
+             * they are larger.
+             */
+            /* PGXCTODO - type checking */
+            if (agg->response_count == 0 || node_value > agg->ulong_value)
+                agg->ulong_value = node_value;
+        }
+        else
+        {
+            ereport(ERROR,
+                    (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                     errmsg("Unknown aggregate type: %d",
+                            agg->agg_type)));
+        }
+    }
+}
+
+
+/*
+ * Pass a message on to the client, but only when the combiner's
+ * destination is a remote client.
+ */
+static void
+combiner_forward(ResponseCombiner combiner, char msg_type, char *msg_body, size_t len)
+{
+    if (combiner->dest == DestRemote
+        || combiner->dest == DestRemoteExecute)
+        pq_putmessage(msg_type, msg_body, len);
+}
+
+/*
+ * Shared handling of RowDescription, CopyInResponse and CopyOutResponse:
+ * the first such message fixes the request type, a message that does not
+ * match the established type is rejected, and only the first copy is
+ * forwarded.  Returns 0 on success, EOF on inconsistent responses.
+ */
+static int
+combiner_description(ResponseCombiner combiner, int expected_type,
+                     char msg_type, char *msg_body, size_t len)
+{
+    if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+        combiner->request_type = expected_type;
+    if (combiner->request_type != expected_type)
+    {
+        /* Inconsistent responses */
+        return EOF;
+    }
+    /* Proxy first */
+    if (combiner->description_count++ == 0)
+        combiner_forward(combiner, msg_type, msg_body, len);
+    return 0;
+}
+
+/*
+ * CombineResponse
+ *
+ * Handle response message and update combiner's state.
+ * This function contains main combiner logic.
+ *
+ * combiner  combiner state to update
+ * msg_type  protocol message type byte
+ * msg_body  message body; for CommandComplete it may be modified in place
+ * len       length of msg_body in bytes
+ *
+ * Returns 0 when the message was accepted, EOF for an unexpected message,
+ * for responses that are inconsistent with earlier ones, or when a combined
+ * command tag does not fit its buffer.
+ */
+int
+CombineResponse(ResponseCombiner combiner, char msg_type, char *msg_body, size_t len)
+{
+    int         rowcount;
+    int         digits = 0;
+
+    switch (msg_type)
+    {
+        case 'C':               /* CommandComplete */
+            /*
+             * If we did not receive description we are having rowcount or OK
+             * response
+             */
+            if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+                combiner->request_type = REQUEST_TYPE_COMMAND;
+            /* Extract rowcount */
+            if (combiner->combine_type != COMBINE_TYPE_NONE)
+            {
+                digits = parse_row_count(msg_body, len, &rowcount);
+                if (digits > 0)
+                    combiner->row_count += rowcount;
+                else
+                    combiner->combine_type = COMBINE_TYPE_NONE;
+            }
+            /* Only the last node's completion produces output */
+            if (++combiner->command_complete_count == combiner->node_count)
+            {
+                if (combiner->combine_type == COMBINE_TYPE_NONE)
+                {
+                    combiner_forward(combiner, msg_type, msg_body, len);
+                }
+                else if (combiner->dest == DestRemote
+                         || combiner->dest == DestRemoteExecute)
+                {
+                    char        command_complete_buffer[256];
+                    int         written;
+
+                    rowcount = combiner->combine_type == COMBINE_TYPE_SUM ?
+                        combiner->row_count :
+                        combiner->row_count / combiner->node_count;
+                    /*
+                     * Truncate msg_body to get base string.  combine_type
+                     * is still set, so this message ended in digits.
+                     */
+                    msg_body[len - digits - 1] = '\0';
+                    /* bounded formatting: never write past the buffer */
+                    written = snprintf(command_complete_buffer,
+                                       sizeof(command_complete_buffer),
+                                       "%s%d", msg_body, rowcount);
+                    if (written < 0
+                        || written >= (int) sizeof(command_complete_buffer))
+                        return EOF;
+                    /* + 1 sends the terminating \0 as well */
+                    pq_putmessage(msg_type, command_complete_buffer, written + 1);
+                }
+            }
+            break;
+        case 'T':               /* RowDescription */
+            if (combiner_description(combiner, REQUEST_TYPE_QUERY,
+                                     msg_type, msg_body, len))
+                return EOF;
+            break;
+        case 'G':               /* CopyInResponse */
+            if (combiner_description(combiner, REQUEST_TYPE_COPY_IN,
+                                     msg_type, msg_body, len))
+                return EOF;
+            break;
+        case 'H':               /* CopyOutResponse */
+            if (combiner_description(combiner, REQUEST_TYPE_COPY_OUT,
+                                     msg_type, msg_body, len))
+                return EOF;
+            break;
+        case 'D':               /* DataRow */
+            if (combiner->simple_aggregates == NULL)
+            {
+                combiner_forward(combiner, msg_type, msg_body, len);
+            }
+            else
+            {
+                SimpleAgg  *simple_agg = (SimpleAgg *) linitial(combiner->simple_aggregates);
+
+                /* Handle aggregates */
+                /* Process single node result */
+                process_aggregate_element(
+                                          combiner->simple_aggregates,
+                                          msg_body, len);
+
+                /* count this response whatever the destination is */
+                simple_agg->response_count++;
+
+                /*
+                 * See if we are done with all nodes. Only then do we send one
+                 * DataRow result, and only to a remote client, like every
+                 * other message handled here.
+                 */
+                if (simple_agg->response_count == combiner->node_count
+                    && (combiner->dest == DestRemote
+                        || combiner->dest == DestRemoteExecute))
+                {
+                    /* room for any long value, sign and terminator */
+                    char        longstr[32];
+                    int         longlen;
+                    StringInfoData data_buffer;
+
+                    /*
+                     * Format as long: pg_ltoa() takes a 32-bit argument and
+                     * would truncate larger values.
+                     */
+                    longlen = snprintf(longstr, sizeof(longstr), "%ld",
+                                       (long) simple_agg->ulong_value);
+
+                    pq_beginmessage(&data_buffer, 'D');
+                    /* repeat the first two bytes of the node's row */
+                    pq_sendbyte(&data_buffer, msg_body[0]);
+                    pq_sendbyte(&data_buffer, msg_body[1]);
+                    pq_sendint(&data_buffer, longlen, 4);
+                    pq_sendtext(&data_buffer, longstr, longlen);
+                    /* sends the message and releases the buffer */
+                    pq_endmessage(&data_buffer);
+                }
+            }
+            break;
+        case 'E':               /* ErrorResponse */
+        case 'A':               /* NotificationResponse */
+        case 'N':               /* NoticeResponse */
+            /* Always proxy */
+            combiner_forward(combiner, msg_type, msg_body, len);
+            break;
+        case 'I':               /* EmptyQuery */
+        default:
+            /* Unexpected message */
+            return EOF;
+    }
+    return 0;
+}
+
+/*
+ * validate_combiner
+ *
+ * Decide from the combiner state whether the command completed
+ * successfully: every node must have reported completion and, unless the
+ * request was a plain command, every node must have sent its description.
+ */
+static bool
+validate_combiner(ResponseCombiner combiner)
+{
+    bool        all_completed;
+    bool        descriptions_ok;
+
+    /* Check all nodes completed */
+    all_completed =
+        (combiner->command_complete_count == combiner->node_count);
+
+    /* Check count of description responses */
+    descriptions_ok =
+        (combiner->request_type == REQUEST_TYPE_COMMAND
+         || combiner->description_count == combiner->node_count);
+
+    /* Add other checks here as needed */
+
+    return all_completed && descriptions_ok;
+}
+
+/*
+ * ValidateAndCloseCombiner
+ *
+ * Check the combiner state, then free the combiner.  Returns the result of
+ * the check; the combiner must not be used afterwards.
+ */
+bool
+ValidateAndCloseCombiner(ResponseCombiner combiner)
+{
+    bool        result;
+
+    /* the verdict has to be taken before the storage goes away */
+    result = validate_combiner(combiner);
+    pfree(combiner);
+
+    return result;
+}
+
+/*
+ * ValidateAndResetCombiner
+ *
+ * Check the combiner state, then put the combiner back into its initial
+ * state so it can be used for the next round of responses.  node_count,
+ * combine_type and dest are left as they are.  Returns the result of the
+ * check.
+ */
+bool
+ValidateAndResetCombiner(ResponseCombiner combiner)
+{
+    bool        result;
+
+    /* validate first: the reset below wipes what is being checked */
+    result = validate_combiner(combiner);
+
+    combiner->request_type = REQUEST_TYPE_NOT_DEFINED;
+    combiner->simple_aggregates = NULL;
+    combiner->description_count = 0;
+    combiner->command_complete_count = 0;
+    combiner->row_count = 0;
+
+    return result;
+}
+
+/*
+ * AssignCombinerAggregates
+ *
+ * Attach a list of simple aggregates to the combiner.  The list pointer is
+ * stored as given, it is not copied.
+ */
+void
+AssignCombinerAggregates(ResponseCombiner combiner, List *simple_aggregates)
+{
+    combiner->simple_aggregates = simple_aggregates;
+}
diff --git a/src/backend/pgxc/pool/datanode.c b/src/backend/pgxc/pool/datanode.c
new file mode 100644
index 0000000000..9b3d40a785
--- /dev/null
+++ b/src/backend/pgxc/pool/datanode.c
@@ -0,0 +1,1701 @@
+/*-------------------------------------------------------------------------
+ *
+ * datanode.c
+ *
+ * Functions for the coordinator communicating with the data nodes
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include <sys/select.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include "pgxc/poolmgr.h"
+#include "access/gtm.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "postgres.h"
+#include "utils/snapmgr.h"
+#include "gtm/gtm_c.h"
+#include "pgxc/datanode.h"
+#include "../interfaces/libpq/libpq-fe.h"
+#include "utils/elog.h"
+#include "utils/memutils.h"
+
+
+/* Value of DataNodeHandle.sock while the handle has no open socket */
+#define NO_SOCKET -1
+
+/* Sizes the connection arrays in DataNodeCommit/DataNodeRollback; 0 means no connections */
+static int node_count = 0;
+/* Array of NumDataNodes handles, allocated once by InitMultinodeExecutor */
+static DataNodeHandle *handles = NULL;
+/* Cleared by DataNodeBegin, set again when DataNodeCommit finishes */
+static bool autocommit = true;
+/* Array of NumDataNodes entries, allocated once by clear_write_node_list */
+static DataNodeHandle **write_node_list = NULL;
+/* Number of entries used in write_node_list; more than one triggers 2PC on commit */
+static int write_node_count = 0;
+
+static DataNodeHandle **get_handles(List *nodelist);
+static int get_transaction_nodes(DataNodeHandle ** connections);
+static void release_handles(void);
+
+static void data_node_init(DataNodeHandle * handle, int sock);
+static void data_node_free(DataNodeHandle * handle);
+
+static int data_node_begin(int conn_count, DataNodeHandle ** connections, ResponseCombiner combiner, GlobalTransactionId gxid);
+static int data_node_commit(int conn_count, DataNodeHandle ** connections, ResponseCombiner combiner);
+static int data_node_rollback(int conn_count, DataNodeHandle ** connections, ResponseCombiner combiner);
+
+static int ensure_in_buffer_capacity(size_t bytes_needed, DataNodeHandle * handle);
+static int ensure_out_buffer_capacity(size_t bytes_needed, DataNodeHandle * handle);
+
+static int data_node_send_query(DataNodeHandle * handle, const char *query);
+static int data_node_send_gxid(DataNodeHandle * handle, GlobalTransactionId gxid);
+static int data_node_send_snapshot(DataNodeHandle * handle, Snapshot snapshot);
+
+static void add_error_message(DataNodeHandle * handle, const char *message);
+
+static int data_node_read_data(DataNodeHandle * conn);
+static int handle_response(DataNodeHandle * conn, ResponseCombiner combiner, bool inErrorState);
+
+static int get_int(DataNodeHandle * conn, size_t len, int *out);
+static int get_char(DataNodeHandle * conn, char *out);
+
+static void clear_write_node_list();
+
+/*
+ * Transactions with more statements than this are all counted in one
+ * final "N+" histogram bucket.
+ */
+#define MAX_STATEMENTS_PER_TRAN 10
+
+/* Variables to collect statistics */
+static int total_transactions = 0;
+static int total_statements = 0;
+static int total_autocommit = 0;
+static int nonautocommit_2pc = 0;
+static int autocommit_2pc = 0;
+/* Statements counted since the last stat_transaction() call */
+static int current_tran_statements = 0;
+/* Histogram with MAX_STATEMENTS_PER_TRAN + 1 buckets, allocated on first use */
+static int *statements_per_transaction = NULL;
+/* Histogram with NumDataNodes buckets (index = node count - 1), allocated on first use */
+static int *nodes_per_transaction = NULL;
+
+/*
+ * Statistics collection: account for one executed statement, both in the
+ * grand total and in the count of the transaction in progress.
+ */
+static void
+stat_statement()
+{
+    current_tran_statements += 1;
+    total_statements += 1;
+}
+
+/*
+ * To collect statistics: count a transaction
+ *
+ * node_count is the number of nodes that took part in the transaction; it
+ * is recorded in the nodes histogram only when it lies in 1..NumDataNodes.
+ *
+ * The histograms are allocated on first use.  If that allocation fails the
+ * histogram is simply not updated: statistics must never take the backend
+ * down.
+ */
+static void
+stat_transaction(int node_count)
+{
+    total_transactions++;
+    if (autocommit)
+        total_autocommit++;
+
+    /* calloc() hands back zeroed counters */
+    if (!statements_per_transaction)
+        statements_per_transaction = (int *) calloc(MAX_STATEMENTS_PER_TRAN + 1, sizeof(int));
+    if (statements_per_transaction)
+    {
+        /* everything above the limit shares the last bucket */
+        if (current_tran_statements > MAX_STATEMENTS_PER_TRAN)
+            statements_per_transaction[MAX_STATEMENTS_PER_TRAN]++;
+        else
+            statements_per_transaction[current_tran_statements]++;
+    }
+    current_tran_statements = 0;
+
+    if (node_count > 0 && node_count <= NumDataNodes)
+    {
+        if (!nodes_per_transaction)
+            nodes_per_transaction = (int *) calloc(NumDataNodes, sizeof(int));
+        if (nodes_per_transaction)
+            nodes_per_transaction[node_count - 1]++;
+    }
+}
+
+
+/*
+ * To collect statistics: count a two-phase commit on nodes, separately for
+ * autocommit and non-autocommit transactions.
+ */
+static void
+stat_2pc()
+{
+    if (!autocommit)
+    {
+        nonautocommit_2pc++;
+        return;
+    }
+    autocommit_2pc++;
+}
+
+
+/*
+ * Output collected statistics to the log
+ *
+ * The totals are always written.  The per-transaction histograms are
+ * written only when at least one transaction was counted (the percentages
+ * divide by the transaction count) and only for histograms that have
+ * actually been allocated.
+ */
+static void
+stat_log()
+{
+    elog(DEBUG1, "Total Transactions: %d Total Statements: %d", total_transactions, total_statements);
+    elog(DEBUG1, "Autocommit: %d 2PC for Autocommit: %d 2PC for non-Autocommit: %d",
+         total_autocommit, autocommit_2pc, nonautocommit_2pc);
+
+    if (total_transactions == 0)
+        return;
+
+    if (statements_per_transaction)
+    {
+        int         i;
+
+        for (i = 0; i < MAX_STATEMENTS_PER_TRAN; i++)
+            elog(DEBUG1, "%d Statements per Transaction: %d (%d%%)",
+                 i, statements_per_transaction[i], statements_per_transaction[i] * 100 / total_transactions);
+
+        /*
+         * The overflow bucket lives in the same array, so it must be
+         * reported under the same NULL guard.
+         */
+        elog(DEBUG1, "%d+ Statements per Transaction: %d (%d%%)",
+             MAX_STATEMENTS_PER_TRAN, statements_per_transaction[MAX_STATEMENTS_PER_TRAN], statements_per_transaction[MAX_STATEMENTS_PER_TRAN] * 100 / total_transactions);
+    }
+
+    if (nodes_per_transaction)
+    {
+        int         i;
+
+        for (i = 0; i < NumDataNodes; i++)
+            elog(DEBUG1, "%d Nodes per Transaction: %d (%d%%)",
+                 i + 1, nodes_per_transaction[i], nodes_per_transaction[i] * 100 / total_transactions);
+    }
+}
+
+/*
+ * InitMultinodeExecutor
+ *
+ * Allocate and initialize memory to store DataNode handles: one handle per
+ * data node, each without a socket and with a 16kB input and a 16kB output
+ * buffer.  Calling it again once the handles exist does nothing.
+ */
+void
+InitMultinodeExecutor()
+{
+    int         i;
+
+    /* This function could get called multiple times because of sigjmp */
+    if (handles != NULL)
+        return;
+
+    /*
+     * Should be in TopMemoryContext.
+     * Assume the caller takes care of context switching.
+     *
+     * palloc() reports out-of-memory through ereport(ERROR) and never
+     * returns NULL, so its results are not checked.
+     */
+    handles = (DataNodeHandle *) palloc(NumDataNodes * sizeof(DataNodeHandle));
+
+    /* initialize storage then */
+    for (i = 0; i < NumDataNodes; i++)
+    {
+        /*
+         * Socket descriptor is small non-negative integer,
+         * Indicate the handle is not initialized yet
+         */
+        handles[i].sock = NO_SOCKET;
+
+        /* Initialise buffers */
+        handles[i].error = NULL;
+        handles[i].outSize = 16 * 1024;
+        handles[i].outBuffer = (char *) palloc(handles[i].outSize);
+        handles[i].inSize = 16 * 1024;
+        handles[i].inBuffer = (char *) palloc(handles[i].inSize);
+    }
+
+    node_count = 0;
+}
+
+/*
+ * DataNodeConnStr
+ *
+ * Build a "host=... port=... dbname=... user=... password=..." connection
+ * string from its parts.
+ *
+ * Returns a palloc'd copy of the string, or NULL when formatting fails or
+ * the result does not fit into the 256 byte work buffer.
+ */
+char *
+DataNodeConnStr(char *host, char *port, char *dbname,
+                char *user, char *password)
+{
+    char        scratch[256];
+    char       *result = NULL;
+    int         written;
+
+    /* Build up connection string */
+    written = snprintf(scratch, sizeof(scratch),
+                       "host=%s port=%s dbname=%s user=%s password=%s",
+                       host, port, dbname, user, password);
+
+    /* Only hand out a result that was formatted completely */
+    if (written > 0 && written < sizeof(scratch))
+    {
+        result = (char *) palloc(written + 1);
+        strcpy(result, scratch);
+    }
+
+    return result;
+}
+
+
+/*
+ * DataNodeConnect
+ *
+ * Open a connection to a Data Node described by a connection string.  The
+ * work is delegated to libpq; the PGconn it returns is handed back as a
+ * NODE_CONNECTION.
+ */
+NODE_CONNECTION *
+DataNodeConnect(char *connstr)
+{
+    return (NODE_CONNECTION *) PQconnectdb(connstr);
+}
+
+
+/*
+ * DataNodeClose
+ *
+ * Close the specified connection by passing it to libpq's PQfinish().
+ */
+void
+DataNodeClose(NODE_CONNECTION * conn)
+{
+    PGconn     *pgconn = (PGconn *) conn;
+
+    PQfinish(pgconn);
+}
+
+
+/*
+ * DataNodeConnected
+ *
+ * Check whether a connection is active.  Returns 1 when the connection
+ * exists and libpq reports CONNECTION_OK for it, 0 otherwise.
+ */
+int
+DataNodeConnected(NODE_CONNECTION * conn)
+{
+    PGconn     *pgconn = (PGconn *) conn;
+
+    if (pgconn == NULL)
+        return 0;
+
+    /*
+     * Simple check; a more comprehensive one would also verify that the
+     * connection is ready for query.
+     */
+    if (PQstatus(pgconn) == CONNECTION_OK)
+        return 1;
+
+    return 0;
+}
+
+
+
+/*
+ * data_node_free
+ *
+ * Close the socket of a handle (this process' copy) and mark the handle as
+ * having no socket.
+ *
+ * Note that we do not free the handle and its members. This will be
+ * taken care of when the transaction ends, when TopTransactionContext
+ * is destroyed in xact.c.
+ */
+static void
+data_node_free(DataNodeHandle * handle)
+{
+    int         fd = handle->sock;
+
+    handle->sock = NO_SOCKET;
+    close(fd);
+}
+
+
+/*
+ * data_node_init
+ *
+ * Prepare a handle for talking to a Data Node over the supplied socket
+ * descriptor: remember the socket, mark the connection idle and outside a
+ * transaction ('I'), clear the error and empty both I/O buffers.  The
+ * buffers themselves are kept.
+ */
+static void
+data_node_init(DataNodeHandle * handle, int sock)
+{
+    /* connection identity and state */
+    handle->sock = sock;
+    handle->state = DN_CONNECTION_STATE_IDLE;
+    handle->transaction_status = 'I';
+    handle->error = NULL;
+
+    /* nothing queued for sending, nothing received yet */
+    handle->outEnd = 0;
+    handle->inStart = handle->inEnd = handle->inCursor = 0;
+}
+
+
+/*
+ * Handle responses from the Data node connections
+ *
+ * Waits with select() on all given connections, reads whatever arrives and
+ * feeds it to handle_response() until every connection has either finished
+ * (handle_response() returned 0) or been dropped because of an error.
+ *
+ * conn_count   number of entries in connections
+ * connections  handles to read from; the array itself is not modified
+ * timeout      passed to select(); NULL waits without limit
+ * combiner     combiner handed to handle_response()
+ *
+ * Returns 0 if all connections completed without error, EOF if any
+ * connection was unusable, failed to read, reported an error or timed out.
+ * select() failures other than EINTR/EAGAIN are raised with ereport(ERROR).
+ */
+static int
+data_node_receive_responses(int conn_count, DataNodeHandle ** connections,
+ struct timeval * timeout, ResponseCombiner combiner)
+{
+ int result = 0;
+ /* NOTE(review): retry_count is assigned below but never read */
+ int retry_count;
+ /* NOTE(review): timed_out is set on timeout but never read */
+ bool timed_out = false;
+ bool inErrorState = false;
+
+ /* number of connections still being tracked in to_receive */
+ int count = conn_count;
+ DataNodeHandle *to_receive[conn_count];
+
+ /* make a copy of the pointers to the connections */
+ memcpy(to_receive, connections, conn_count * sizeof(DataNodeHandle *));
+
+ /*
+ * Read results.
+ * Note we try and read from data node connections even if there is an error on one,
+ * so as to avoid reading incorrect results on the next statement.
+ * It might be better to just destroy these connections and tell the pool manager.
+ */
+ while (count > 0)
+ {
+ int i,
+ res_select,
+ nfds = 0;
+ fd_set readfds;
+
+ FD_ZERO(&readfds);
+ for (i = 0; i < count; i++)
+ {
+ /*
+ * note if a connection has error
+ * NOTE(review): 1024 is presumably FD_SETSIZE; a negative sock
+ * (NO_SOCKET) is not filtered out before FD_SET below - confirm
+ */
+ if (!to_receive[i]
+ || to_receive[i]->state == DN_CONNECTION_STATE_ERROR
+ || to_receive[i]->sock >= 1024)
+ {
+ result = EOF;
+
+ /* Handling is done, do not track this connection */
+ count--;
+
+ /* Move last connection in its place */
+ if (i < count)
+ {
+ to_receive[i] = to_receive[count];
+ /* stay on the current position */
+ i--;
+ }
+ continue;
+ }
+
+ /* prepare select params */
+ if (nfds < to_receive[i]->sock)
+ nfds = to_receive[i]->sock;
+
+ FD_SET (to_receive[i]->sock, &readfds);
+ }
+
+ /* Make sure we still have valid connections */
+ if (count == 0)
+ break;
+
+ retry_count = 0;
+retry:
+ res_select = select(nfds + 1, &readfds, NULL, NULL, timeout);
+ if (res_select < 0)
+ {
+ /* error - retry if EINTR or EAGAIN */
+ if (errno == EINTR || errno == EAGAIN)
+ goto retry;
+
+ /*
+ * PGXCTODO - we may want to close the connections and notify the
+ * pooler that these are invalid.
+ */
+ if (errno == EBADF)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_CONNECTION_FAILURE),
+ errmsg("select() bad file descriptor set")));
+ /* not reached: ereport(ERROR) does not return */
+ return EOF;
+ }
+ ereport(ERROR,
+ (errcode(ERRCODE_CONNECTION_FAILURE),
+ errmsg("select() error: %d", errno)));
+ /* not reached: ereport(ERROR) does not return */
+ return EOF;
+ }
+
+ if (res_select == 0)
+ {
+ /*
+ * Handle timeout
+ * NOTE(review): the outer loop is not left here, so with a non-NULL
+ * timeout it keeps calling select() while connections remain tracked
+ */
+ result = EOF;
+ timed_out = true;
+ }
+
+ /* read data */
+ for (i = 0; i < count; i++)
+ {
+ DataNodeHandle *conn = to_receive[i];
+
+ if (FD_ISSET(conn->sock, &readfds))
+ {
+ int read_status = data_node_read_data(conn);
+
+ if (read_status == EOF || read_status < 0)
+ {
+ /* read failed: stop tracking this connection */
+ count--;
+ /* Move last connection in place */
+ if (i < count)
+ {
+ to_receive[i] = to_receive[count];
+ /* stay on the current position */
+ i--;
+ }
+
+ inErrorState = true;
+ result = EOF;
+ continue;
+ }
+ }
+
+ /* process buffered data, including data left from an earlier pass */
+ if (conn->inStart < conn->inEnd)
+ {
+ if (handle_response(conn, combiner, inErrorState) == 0)
+ {
+ /* Handling is done, do not track this connection */
+ count--;
+ /* Move last connection in place */
+ if (i < count)
+ {
+ to_receive[i] = to_receive[count];
+ /* stay on the current position */
+ i--;
+ }
+ }
+
+ /*
+ * See if we flagged an error on connection. Note, if
+ * handle_response was not 0 above, an error occurred, we
+ * still need to consume the ReadyForQuery message
+ */
+ if (conn->state == DN_CONNECTION_STATE_ERROR)
+ {
+ inErrorState = true;
+ result = EOF;
+ }
+ }
+ }
+ }
+
+ return result;
+}
+
+/*
+ * Read up incoming messages from the Data node connection
+ *
+ * Compacts the input buffer, enlarges it when less than 8kB is free and
+ * appends whatever recv() delivers.
+ *
+ * Returns 1 when data was read; EOF when the socket is invalid or recv()
+ * returned 0; -1 on a receive error or when no buffer space could be
+ * obtained; and 0 or 1 (whether an earlier recv() in this call got data)
+ * when recv() reports EAGAIN/EWOULDBLOCK.
+ */
+static int
+data_node_read_data(DataNodeHandle * conn)
+{
+ /* becomes 1 once a recv() in this call has delivered data */
+ int someread = 0;
+ int nread;
+
+ if (conn->sock < 0)
+ {
+ add_error_message(conn, "bad socket");
+ return EOF;
+ }
+
+ /* Left-justify any data in the buffer to make room */
+ if (conn->inStart < conn->inEnd)
+ {
+ if (conn->inStart > 0)
+ {
+ memmove(conn->inBuffer, conn->inBuffer + conn->inStart,
+ conn->inEnd - conn->inStart);
+ conn->inEnd -= conn->inStart;
+ conn->inCursor -= conn->inStart;
+ conn->inStart = 0;
+ }
+ }
+ else
+ {
+ /* buffer is logically empty, reset it */
+ conn->inStart = conn->inCursor = conn->inEnd = 0;
+ }
+
+ /*
+ * If the buffer is fairly full, enlarge it. We need to be able to enlarge
+ * the buffer in case a single message exceeds the initial buffer size. We
+ * enlarge before filling the buffer entirely so as to avoid asking the
+ * kernel for a partial packet. The magic constant here should be large
+ * enough for a TCP packet or Unix pipe bufferload. 8K is the usual pipe
+ * buffer size, so...
+ */
+ if (conn->inSize - conn->inEnd < 8192)
+ {
+ if (ensure_in_buffer_capacity(conn->inEnd + (size_t) 8192, conn) != 0)
+ {
+ /*
+ * We don't insist that the enlarge worked, but we need some room
+ */
+ if (conn->inSize - conn->inEnd < 100)
+ {
+ add_error_message(conn, "can not allocate buffer");
+ return -1;
+ }
+ }
+ }
+
+retry:
+ nread = recv(conn->sock, conn->inBuffer + conn->inEnd,
+ conn->inSize - conn->inEnd, 0);
+
+ if (nread < 0)
+ {
+ elog(DEBUG1, "dnrd errno = %d", errno);
+ if (errno == EINTR)
+ goto retry;
+ /* Some systems return EAGAIN/EWOULDBLOCK for no data */
+#ifdef EAGAIN
+ if (errno == EAGAIN)
+ return someread;
+#endif
+#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN))
+ if (errno == EWOULDBLOCK)
+ return someread;
+#endif
+ /* We might get ECONNRESET here if using TCP and backend died */
+#ifdef ECONNRESET
+ if (errno == ECONNRESET)
+ {
+ /*
+ * The peer reset the connection, so it is gone: record the
+ * error, flag the handle and close our end of the socket.
+ */
+ add_error_message(conn,
+ "data node closed the connection unexpectedly\n"
+ "\tThis probably means the data node terminated abnormally\n"
+ "\tbefore or while processing the request.\n");
+ conn->state = DN_CONNECTION_STATE_ERROR; /* No more connection to
+ * backend */
+ closesocket(conn->sock);
+ conn->sock = NO_SOCKET;
+
+ return -1;
+ }
+#endif
+ add_error_message(conn, "could not receive data from server");
+ return -1;
+
+ }
+
+ if (nread > 0)
+ {
+ conn->inEnd += nread;
+
+ /*
+ * Hack to deal with the fact that some kernels will only give us back
+ * 1 packet per recv() call, even if we asked for more and there is
+ * more available. If it looks like we are reading a long message,
+ * loop back to recv() again immediately, until we run out of data or
+ * buffer space. Without this, the block-and-restart behavior of
+ * libpq's higher levels leads to O(N^2) performance on long messages.
+ *
+ * Since we left-justified the data above, conn->inEnd gives the
+ * amount of data already read in the current message. We consider
+ * the message "long" once we have acquired 32k ...
+ */
+ if (conn->inEnd > 32768 &&
+ (conn->inSize - conn->inEnd) >= 8192)
+ {
+ someread = 1;
+ goto retry;
+ }
+ return 1;
+ }
+
+ /* recv() returned 0: reported to the caller as EOF */
+ if (nread == 0)
+ {
+ elog(DEBUG1, "nread returned 0");
+ return EOF;
+ }
+
+ /*
+ * NOTE(review): every value of nread has returned above, so the two
+ * returns below cannot be reached.
+ */
+ if (someread)
+ return 1; /* got a zero read after successful tries */
+
+ return 0;
+}
+
+/*
+ * get_char
+ *
+ * Fetch the character at the cursor of the connection's input buffer and
+ * move the cursor past it.  Returns 0 and stores the character in *out, or
+ * returns EOF (leaving *out and the cursor untouched) when the cursor is
+ * already at the end of the buffered data.
+ */
+static int
+get_char(DataNodeHandle * conn, char *out)
+{
+    if (conn->inCursor >= conn->inEnd)
+        return EOF;
+
+    *out = conn->inBuffer[conn->inCursor];
+    conn->inCursor++;
+    return 0;
+}
+
+/*
+ * get_int
+ *
+ * Read a 2 or 4 byte integer in network byte order from the cursor of the
+ * connection's input buffer, convert it to host order and move the cursor
+ * past it.
+ *
+ * Returns 0 and stores the value in *out.  Returns EOF when fewer than len
+ * bytes are buffered, or, with an error message recorded on the
+ * connection, when len is neither 2 nor 4.
+ */
+static int
+get_int(DataNodeHandle * conn, size_t len, int *out)
+{
+    const char *src;
+
+    /* not enough data buffered yet */
+    if (conn->inCursor + len > conn->inEnd)
+        return EOF;
+
+    src = conn->inBuffer + conn->inCursor;
+
+    if (len == 2)
+    {
+        unsigned short raw16;
+
+        memcpy(&raw16, src, 2);
+        conn->inCursor += 2;
+        *out = (int) ntohs(raw16);
+        return 0;
+    }
+
+    if (len == 4)
+    {
+        unsigned int raw32;
+
+        memcpy(&raw32, src, 4);
+        conn->inCursor += 4;
+        *out = (int) ntohl(raw32);
+        return 0;
+    }
+
+    add_error_message(conn, "not supported int size");
+    return EOF;
+}
+
+/*
+ * Read next message from the connection and update the combiner accordingly
+ * If we are in an error state we just consume the messages, and do not proxy
+ * Long term, we should look into cancelling executing statements
+ * and closing the connections.
+ *
+ * Messages are processed one after another for as long as complete ones are
+ * buffered.
+ *
+ * Returns 0 once ReadyForQuery has been consumed and no ErrorResponse was
+ * seen during this call. Returns EOF when more data is needed to complete
+ * a message, when ReadyForQuery follows an ErrorResponse seen in this call
+ * (the connection is then flagged DN_CONNECTION_STATE_ERROR), or when an
+ * unexpected message type arrives.
+ */
+static int
+handle_response(DataNodeHandle * conn, ResponseCombiner combiner, bool inErrorState)
+{
+ char msg_type;
+ int msg_len;
+ /* set when an ErrorResponse is seen; acted upon at ReadyForQuery */
+ bool connError = false;
+
+ for (;;)
+ {
+ /* try to read the message, return if not enough data */
+ conn->inCursor = conn->inStart;
+ /* a message header is 1 type byte plus a 4 byte length */
+ if (conn->inEnd - conn->inCursor < 5)
+ return EOF;
+
+ if (get_char(conn, &msg_type))
+ return EOF;
+
+ if (get_int(conn, 4, &msg_len))
+ return EOF;
+
+ /*
+ * the length word counts itself, what remains is the body
+ * NOTE(review): a length below 4 makes msg_len negative and is not
+ * rejected here - confirm whether that can happen
+ */
+ msg_len -= 4;
+
+ if (conn->inEnd - conn->inCursor < msg_len)
+ {
+ /*
+ * body not complete yet: make room for it and wait for more data
+ * NOTE(review): the result of ensure_in_buffer_capacity is ignored
+ */
+ ensure_in_buffer_capacity(conn->inCursor + (size_t) msg_len, conn);
+ return EOF;
+ }
+
+ /* TODO handle other possible responses */
+ /* NOTE(review): the result of CombineResponse is ignored in every case below */
+ switch (msg_type)
+ {
+ case 'C': /* CommandComplete */
+ /* no need to parse, just move cursor */
+ conn->inCursor += msg_len;
+ conn->state = DN_CONNECTION_STATE_COMPLETED;
+ if (!inErrorState)
+ CombineResponse(combiner, msg_type,
+ conn->inBuffer + conn->inStart + 5,
+ conn->inCursor - conn->inStart - 5);
+
+ break;
+ case 'T': /* RowDescription */
+ case 'G': /* CopyInResponse */
+ case 'H': /* CopyOutResponse */
+ case 'D': /* DataRow */
+ /* no need to parse, just move cursor */
+ conn->inCursor += msg_len;
+ if (!inErrorState)
+ CombineResponse(combiner, msg_type,
+ conn->inBuffer + conn->inStart + 5,
+ conn->inCursor - conn->inStart - 5);
+ break;
+ case 'E': /* ErrorResponse */
+ /* no need to parse, just move cursor */
+ conn->inCursor += msg_len;
+ if (!inErrorState)
+ CombineResponse(combiner, msg_type,
+ conn->inBuffer + conn->inStart + 5,
+ conn->inCursor - conn->inStart - 5);
+ conn->inStart = conn->inCursor;
+ /* from here on the remaining messages are consumed without proxying */
+ connError = inErrorState = true;
+ /* conn->state = DN_CONNECTION_STATE_ERROR; */
+
+ /*
+ * Do not return with an error, we still need to consume Z,
+ * ready-for-query
+ */
+ break;
+ case 'A': /* NotificationResponse */
+ case 'N': /* NoticeResponse */
+ conn->inCursor += msg_len;
+
+ /*
+ * Ignore these to prevent multiple messages, one from each
+ * node. Coordinator will send one for DDL anyway
+ */
+ break;
+ case 'Z': /* ReadyForQuery */
+ /* NOTE(review): the result of get_char is not checked here */
+ get_char(conn, &conn->transaction_status);
+ conn->state = DN_CONNECTION_STATE_IDLE;
+ conn->inStart = conn->inCursor;
+ /* Now it is ok to flag the connection as having an error */
+ if (connError)
+ {
+ conn->state = DN_CONNECTION_STATE_ERROR;
+ return EOF;
+ }
+ return 0;
+ case 'I': /* EmptyQuery */
+ default:
+ /* sync lost? */
+ conn->state = DN_CONNECTION_STATE_ERROR;
+ /* inErrorState is a by-value parameter; this store is not visible to the caller */
+ inErrorState = true;
+ return EOF;
+ }
+ /* message consumed, the next one starts at the cursor */
+ conn->inStart = conn->inCursor;
+
+ }
+ /* not reached: the loop above only ends through a return */
+ return EOF;
+}
+
+
+/*
+ * data_node_begin
+ *
+ * Send BEGIN to each of the given Data node connections, preceded by the
+ * global transaction id when that id is valid, then collect the responses
+ * of all of them.
+ *
+ * Returns 0 on success, EOF as soon as a send fails or when receiving the
+ * responses fails.
+ */
+static int
+data_node_begin(int conn_count, DataNodeHandle ** connections, ResponseCombiner combiner, GlobalTransactionId gxid)
+{
+    int         idx = 0;
+
+    /* Send BEGIN */
+    while (idx < conn_count)
+    {
+        DataNodeHandle *conn = connections[idx];
+
+        if (GlobalTransactionIdIsValid(gxid) && data_node_send_gxid(conn, gxid))
+            return EOF;
+
+        if (data_node_send_query(conn, "BEGIN"))
+            return EOF;
+
+        idx++;
+    }
+
+    /* Receive responses, waiting without a time limit */
+    if (data_node_receive_responses(conn_count, connections, NULL, combiner) != 0)
+        return EOF;
+
+    /* Verify status? */
+
+    return 0;
+}
+
+
+/*
+ * Clears the write node list
+ *
+ * The list storage (one slot per data node) is allocated with malloc() on
+ * the first call and kept afterwards; clearing only resets the counter.
+ * Raises an out-of-memory error if the storage cannot be allocated, rather
+ * than leaving a NULL list behind.
+ */
+static void
+clear_write_node_list()
+{
+    /* we just malloc once and use counter */
+    if (write_node_list == NULL)
+    {
+        write_node_list = (DataNodeHandle **) malloc(NumDataNodes * sizeof(DataNodeHandle *));
+        if (write_node_list == NULL)
+            ereport(ERROR,
+                    (errcode(ERRCODE_OUT_OF_MEMORY),
+                     errmsg("out of memory")));
+    }
+    write_node_count = 0;
+}
+
+
+/*
+ * DataNodeBegin
+ *
+ * Leave autocommit mode, so that all subsequent statements belong to the
+ * same transaction, and start that transaction with an empty list of
+ * written nodes.
+ */
+void
+DataNodeBegin(void)
+{
+    /* autocommit stays off until DataNodeCommit turns it back on */
+    autocommit = false;
+
+    clear_write_node_list();
+}
+
+
+/*
+ * DataNodeCommit
+ *
+ * Commit current transaction, use two-phase commit if necessary.
+ *
+ * dest is the destination given to the response combiner.
+ *
+ * Returns 0 on success, including when there is nothing to commit, and EOF
+ * when the commit fails or the responses do not validate.  On success the
+ * handles are released unless PersistentConnections is set, autocommit is
+ * switched back on and the write node list is cleared; on failure none of
+ * that is done.
+ */
+int
+DataNodeCommit(CommandDest dest)
+{
+    int         res;
+    int         tran_count;
+
+    /*
+     * The array is declared before node_count is tested, and a
+     * variable-length array needs at least one element, so never size it
+     * with zero.
+     */
+    DataNodeHandle *connections[node_count > 0 ? node_count : 1];
+    ResponseCombiner combiner;
+
+    /* Quick check to make sure we have connections */
+    if (node_count == 0)
+        goto finish;
+
+    /* gather connections to commit */
+    tran_count = get_transaction_nodes(connections);
+
+    /*
+     * If we do not have open transactions we have nothing to commit, just
+     * report success
+     */
+    if (tran_count == 0)
+        goto finish;
+
+    combiner = CreateResponseCombiner(tran_count,
+                                      COMBINE_TYPE_NONE, dest);
+    res = data_node_commit(tran_count, connections, combiner);
+    /* the combiner is validated and freed whatever the commit result was */
+    if (!ValidateAndCloseCombiner(combiner) || res)
+        return EOF;
+
+finish:
+    /* In autocommit mode statistics is collected in DataNodeExec */
+    if (!autocommit)
+        stat_transaction(node_count);
+    if (!PersistentConnections)
+        release_handles();
+    autocommit = true;
+    clear_write_node_list();
+    return 0;
+}
+
+
+/*
+ * Send COMMIT or PREPARE/COMMIT PREPARED down to the Data nodes and handle responses
+ *
+ * When more than one node was written to (write_node_count > 1) the commit
+ * is done in two phases: PREPARE TRANSACTION is sent to all connections and
+ * their responses are collected, then a new transaction id is obtained from
+ * GTM and sent to all connections, followed by COMMIT PREPARED. Otherwise a
+ * plain COMMIT is sent to all connections.
+ *
+ * Returns 0 on success and EOF on any send, receive or validation failure.
+ * A failure during the PREPARE phase returns immediately.
+ */
+static int
+data_node_commit(int conn_count, DataNodeHandle ** connections, ResponseCombiner combiner)
+{
+ int i;
+ /* NULL: wait for the responses without a time limit */
+ struct timeval *timeout = NULL;
+ /* holds the command text sent to the nodes */
+ char buffer[256];
+ GlobalTransactionId gxid = InvalidGlobalTransactionId;
+ int result = 0;
+
+
+ /* can set this to false to disable temporarily */
+ /* bool do2PC = conn_count > 1; */
+
+ /*
+ * Only use 2PC if more than one node was written to. Otherwise, just send
+ * COMMIT to all
+ */
+ bool do2PC = write_node_count > 1;
+
+ /* Extra XID for Two Phase Commit */
+ GlobalTransactionId two_phase_xid = 0;
+
+ if (do2PC)
+ {
+ stat_2pc();
+
+ /*
+ * Formally we should be using GetCurrentGlobalTransactionIdIfAny() here,
+ * but since we need 2pc, we surely have sent down a command and got
+ * gxid for it. Hence GetCurrentGlobalTransactionId() just returns
+ * already allocated gxid
+ */
+/* #ifdef PGXC_COORD */
+ gxid = GetCurrentGlobalTransactionId();
+/* #endif */
+
+ /*
+ * NOTE(review): gxid is formatted with %d; if GlobalTransactionId is
+ * an unsigned type, large ids print as negative numbers. PREPARE and
+ * COMMIT PREPARED below use the same format, so the names still match.
+ */
+ sprintf(buffer, "PREPARE TRANSACTION 'T%d'", gxid);
+ /* Send PREPARE */
+ for (i = 0; i < conn_count; i++)
+ {
+ if (data_node_send_query(connections[i], buffer))
+ return EOF;
+ }
+
+ /* Receive responses */
+ if (data_node_receive_responses(conn_count, connections, timeout, combiner))
+ return EOF;
+
+ /* Reset combiner */
+ if (!ValidateAndResetCombiner(combiner))
+ return EOF;
+ }
+
+ if (!do2PC)
+ strcpy(buffer, "COMMIT");
+ else
+ {
+ sprintf(buffer, "COMMIT PREPARED 'T%d'", gxid);
+
+ /* We need to use a new xid, the data nodes have reset */
+ two_phase_xid = BeginTranGTM();
+ for (i = 0; i < conn_count; i++)
+ {
+ if (data_node_send_gxid(connections[i], two_phase_xid))
+ {
+ add_error_message(connections[i], "Can not send request");
+ result = EOF;
+ goto finish;
+ }
+ }
+ }
+
+ /* Send COMMIT */
+ for (i = 0; i < conn_count; i++)
+ {
+ if (data_node_send_query(connections[i], buffer))
+ {
+ result = EOF;
+ goto finish;
+ }
+ }
+
+ /* Receive responses */
+ if (data_node_receive_responses(conn_count, connections, timeout, combiner))
+ result = EOF;
+
+finish:
+ /* the extra xid is committed at GTM whether or not the sends above succeeded */
+ if (do2PC)
+ CommitTranGTM((GlobalTransactionId) two_phase_xid);
+
+ return result;
+}
+
+
+/*
+ * Roll back the current transaction on every data node that has one open.
+ *
+ * dest is passed to the response combiner created for the rollback.
+ *
+ * Returns 0 on success, including when there is nothing to roll back, and
+ * EOF if data_node_rollback() fails or the combined response does not
+ * validate.  Unlike DataNodeCommit(), the session cleanup at the end runs
+ * on failure as well.
+ */
+int
+DataNodeRollback(CommandDest dest)
+{
+    int         res = 0;
+
+    /*
+     * The connection array is sized by node_count, so it is declared only
+     * after node_count is known to be positive: a variable length array with
+     * zero elements is undefined behavior in C.
+     */
+    if (node_count > 0)
+    {
+        DataNodeHandle *connections[node_count];
+        int         tran_count;
+
+        /* gather connections to rollback */
+        tran_count = get_transaction_nodes(connections);
+
+        /* With no open transactions there is nothing to send */
+        if (tran_count > 0)
+        {
+            ResponseCombiner combiner;
+            int         i;
+
+            combiner = CreateResponseCombiner(tran_count,
+                                              COMBINE_TYPE_NONE, dest);
+            res = data_node_rollback(tran_count, connections, combiner);
+
+            /*
+             * Assume connection got cleaned up. Reset so we can reuse
+             * without error.
+             */
+            for (i = 0; i < tran_count; i++)
+            {
+                connections[i]->transaction_status = 'I';
+                connections[i]->state = DN_CONNECTION_STATE_IDLE;
+            }
+
+            if (!ValidateAndCloseCombiner(combiner) || res)
+                res = EOF;
+        }
+    }
+
+    /* In autocommit mode statistics is collected in DataNodeExec */
+    if (!autocommit)
+        stat_transaction(node_count);
+    if (!PersistentConnections)
+        release_handles();
+    autocommit = true;
+    clear_write_node_list();
+    return res;
+}
+
+
+/*
+ * Hand every data node connection back to the pool manager and free the
+ * per-handle resources.  Does nothing when no connection is held
+ * (node_count == 0); resets node_count to 0 afterwards.
+ */
+static void
+release_handles(void)
+{
+    int         idx = 0;
+
+    if (node_count == 0)
+        return;
+
+    PoolManagerReleaseConnections();
+
+    /* Walk the whole handle array; only handles with a socket need freeing */
+    while (idx < NumDataNodes)
+    {
+        DataNodeHandle *current = &handles[idx];
+
+        if (current->sock != NO_SOCKET)
+            data_node_free(current);
+        idx++;
+    }
+
+    node_count = 0;
+}
+
+
+/*
+ * Send ROLLBACK command down to the Data nodes and handle responses.
+ *
+ * conn_count/connections - the connections to roll back
+ * combiner               - collects the responses
+ *
+ * ROLLBACK is attempted on every connection even if an earlier send fails,
+ * so that as many nodes as possible are rolled back.
+ *
+ * Returns 0 on success, EOF if any send failed or the responses could not
+ * be received.
+ */
+static int
+data_node_rollback(int conn_count, DataNodeHandle ** connections, ResponseCombiner combiner)
+{
+    int         i;
+    struct timeval *timeout = NULL;
+    int         result = 0;
+
+    /* Send ROLLBACK to every connection, remembering any failure */
+    for (i = 0; i < conn_count; i++)
+    {
+        if (data_node_send_query(connections[i], "ROLLBACK"))
+            result = EOF;
+    }
+
+    /* Receive responses */
+    if (data_node_receive_responses(conn_count, connections, timeout, combiner))
+        return EOF;
+
+    /* Report a send failure instead of silently dropping it */
+    return result;
+}
+
+
+/*
+ * Run one statement on a set of Data nodes, combine the responses and send
+ * the result to the client.
+ *
+ * query             - statement text sent to each node
+ * nodelist          - node numbers to run on; empty/NIL means all nodes
+ * dest              - destination given to the response combiner
+ * snapshot          - sent to each node before the query when not NULL
+ * force_autocommit  - when true no explicit transaction is started
+ * simple_aggregates - handed to the combiner via AssignCombinerAggregates()
+ * is_read_only      - when false, the target nodes are recorded as write
+ *                     nodes of the current transaction
+ *
+ * Returns 0 on success and EOF on any failure.
+ */
+int
+DataNodeExec(const char *query, List *nodelist, CommandDest dest, Snapshot snapshot,
+             bool force_autocommit, List *simple_aggregates, bool is_read_only)
+{
+    int         i;
+    int         j;
+    int         conn_count = (list_length(nodelist) != 0) ? list_length(nodelist) : NumDataNodes;
+    struct timeval *timeout = NULL; /* wait forever */
+    ResponseCombiner combiner;
+    int         res;
+    int         newCount = 0;
+    bool        need_tran;
+    bool        succeeded;
+    GlobalTransactionId gxid = InvalidGlobalTransactionId;
+    DataNodeHandle *newConnections[conn_count];
+    DataNodeHandle **connections;
+
+    if (conn_count == 0)
+        return EOF;
+
+    connections = get_handles(nodelist);
+    if (connections == NULL)
+        return EOF;
+
+    /* An explicit transaction is needed unless the caller forbids it */
+    need_tran = !force_autocommit && (!autocommit || conn_count > 1);
+
+    elog(DEBUG1, "autocommit = %s, conn_count = %d, need_tran = %s", autocommit ? "true" : "false", conn_count, need_tran ? "true" : "false");
+
+    stat_statement();
+    if (autocommit)
+        stat_transaction(conn_count);
+
+    /* We normally clear for transactions, but if autocommit, clear here, too */
+    if (autocommit)
+        clear_write_node_list();
+
+    /*
+     * Record nodes that are written to for the first time in this
+     * transaction, both in the transaction-wide list and in the list of
+     * connections that are new for this statement.
+     */
+    if (!is_read_only && write_node_count < NumDataNodes)
+    {
+        for (i = 0; i < conn_count; i++)
+        {
+            bool        known = false;
+
+            for (j = 0; j < write_node_count; j++)
+            {
+                if (write_node_list[j] == connections[i])
+                {
+                    known = true;
+                    break;
+                }
+            }
+            if (known)
+                continue;
+
+            /* Add to transaction wide-list */
+            write_node_list[write_node_count++] = connections[i];
+            /* Add to current statement list */
+            newConnections[newCount++] = connections[i];
+        }
+    }
+
+    gxid = GetCurrentGlobalTransactionId();
+    if (!GlobalTransactionIdIsValid(gxid))
+        goto fail;
+
+    if (newCount > 0 && need_tran)
+    {
+        combiner = CreateResponseCombiner(newCount, COMBINE_TYPE_NONE, DestNone);
+
+        /* Start transaction on connections where it is not started */
+        res = data_node_begin(newCount, newConnections, combiner, gxid);
+        if (!ValidateAndCloseCombiner(combiner) || res)
+            goto fail;
+    }
+
+    /* Send gxid (if not already sent by BEGIN), snapshot and query */
+    for (i = 0; i < conn_count; i++)
+    {
+        DataNodeHandle *conn = connections[i];
+
+        if ((!need_tran && data_node_send_gxid(conn, gxid)) ||
+            (snapshot && data_node_send_snapshot(conn, snapshot)) ||
+            data_node_send_query(conn, query) != 0)
+        {
+            add_error_message(conn, "Can not send request");
+            goto fail;
+        }
+    }
+
+    combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_SUM, dest);
+    AssignCombinerAggregates(combiner, simple_aggregates);
+
+    /* Receive responses */
+    res = data_node_receive_responses(conn_count, connections, timeout, combiner);
+    succeeded = ValidateAndCloseCombiner(combiner) && !res;
+
+    /* In autocommit mode finish the statement's transaction right away */
+    if (autocommit)
+    {
+        if (!need_tran)
+        {
+            if (!PersistentConnections)
+                release_handles();
+        }
+        else if (succeeded)
+            DataNodeCommit(DestNone);   /* PGXCTODO - call CommitTransaction()
+                                         * instead? */
+        else
+            DataNodeRollback(DestNone);
+    }
+
+    /* Verify status? */
+    pfree(connections);
+    return succeeded ? 0 : EOF;
+
+fail:
+    pfree(connections);
+    return EOF;
+}
+
+
+/*
+ * Ensure specified amount of data can fit to the incoming buffer and
+ * increase it if necessary.
+ *
+ * bytes_needed - required total size of handle->inBuffer
+ * handle       - connection whose inBuffer/inSize are grown
+ *
+ * The buffer is first grown by doubling; if that cannot reach the required
+ * size it is grown in 8K steps.  Both growth loops stop before the int size
+ * would overflow, instead of relying on signed wraparound, which is
+ * undefined behavior in C.
+ *
+ * Returns 0 if the buffer is (now) large enough, EOF otherwise.
+ */
+static int
+ensure_in_buffer_capacity(size_t bytes_needed, DataNodeHandle * handle)
+{
+    /* largest int value, computed locally so no extra header is needed */
+    const int   max_size = (int) (~0U >> 1);
+    int         newsize = handle->inSize;
+    char       *newbuf;
+
+    if (bytes_needed <= (size_t) newsize)
+        return 0;
+
+    /* First strategy: keep doubling the current size */
+    while (newsize > 0 && bytes_needed > (size_t) newsize)
+    {
+        if (newsize > max_size / 2)
+        {
+            /* doubling would overflow, give up on this strategy */
+            newsize = 0;
+            break;
+        }
+        newsize *= 2;
+    }
+
+    if (newsize > 0 && bytes_needed <= (size_t) newsize)
+    {
+        newbuf = repalloc(handle->inBuffer, newsize);
+        if (newbuf)
+        {
+            /* repalloc succeeded */
+            handle->inBuffer = newbuf;
+            handle->inSize = newsize;
+            return 0;
+        }
+    }
+
+    /* Second strategy: grow the current size in 8K steps */
+    newsize = handle->inSize;
+    do
+    {
+        if (newsize > max_size - 8192)
+        {
+            /* another step would overflow, the request cannot be met */
+            newsize = 0;
+            break;
+        }
+        newsize += 8192;
+    } while (newsize > 0 && bytes_needed > (size_t) newsize);
+
+    if (newsize > 0 && bytes_needed <= (size_t) newsize)
+    {
+        newbuf = repalloc(handle->inBuffer, newsize);
+        if (newbuf)
+        {
+            /* repalloc succeeded */
+            handle->inBuffer = newbuf;
+            handle->inSize = newsize;
+            return 0;
+        }
+    }
+
+    return EOF;
+}
+
+
+/*
+ * Ensure specified amount of data can fit to the outgoing buffer and
+ * increase it if necessary.
+ *
+ * bytes_needed - required total size of handle->outBuffer
+ * handle       - connection whose outBuffer/outSize are grown
+ *
+ * The buffer is first grown by doubling; if that cannot reach the required
+ * size it is grown in 8K steps.  Both growth loops stop before the int size
+ * would overflow, instead of relying on signed wraparound, which is
+ * undefined behavior in C.
+ *
+ * Returns 0 if the buffer is (now) large enough, EOF otherwise.
+ */
+static int
+ensure_out_buffer_capacity(size_t bytes_needed, DataNodeHandle * handle)
+{
+    /* largest int value, computed locally so no extra header is needed */
+    const int   max_size = (int) (~0U >> 1);
+    int         newsize = handle->outSize;
+    char       *newbuf;
+
+    if (bytes_needed <= (size_t) newsize)
+        return 0;
+
+    /* First strategy: keep doubling the current size */
+    while (newsize > 0 && bytes_needed > (size_t) newsize)
+    {
+        if (newsize > max_size / 2)
+        {
+            /* doubling would overflow, give up on this strategy */
+            newsize = 0;
+            break;
+        }
+        newsize *= 2;
+    }
+
+    if (newsize > 0 && bytes_needed <= (size_t) newsize)
+    {
+        newbuf = repalloc(handle->outBuffer, newsize);
+        if (newbuf)
+        {
+            /* repalloc succeeded */
+            handle->outBuffer = newbuf;
+            handle->outSize = newsize;
+            return 0;
+        }
+    }
+
+    /* Second strategy: grow the current size in 8K steps */
+    newsize = handle->outSize;
+    do
+    {
+        if (newsize > max_size - 8192)
+        {
+            /* another step would overflow, the request cannot be met */
+            newsize = 0;
+            break;
+        }
+        newsize += 8192;
+    } while (newsize > 0 && bytes_needed > (size_t) newsize);
+
+    if (newsize > 0 && bytes_needed <= (size_t) newsize)
+    {
+        newbuf = repalloc(handle->outBuffer, newsize);
+        if (newbuf)
+        {
+            /* repalloc succeeded */
+            handle->outBuffer = newbuf;
+            handle->outSize = newsize;
+            return 0;
+        }
+    }
+
+    return EOF;
+}
+
+
+/*
+ * Send specified amount of data from the outgoing buffer over the connection
+ *
+ * handle - connection whose outBuffer holds the data to send
+ * len    - number of bytes, counted from the start of outBuffer, to try to
+ *          send
+ *
+ * Returns 0 if all len bytes were sent, 1 if some of them are still pending
+ * and -1 on a send error.  On error outEnd is reset to 0, which discards
+ * the buffered data.  Otherwise the bytes that were not sent are moved to
+ * the front of outBuffer and outEnd is set to their count.
+ */
+static int
+send_some(DataNodeHandle * handle, int len)
+{
+ char *ptr = handle->outBuffer;
+ int remaining = handle->outEnd; /* bytes left in the buffer, reduced as data is sent */
+ int result = 0;
+
+ /* while there's still data to send */
+ while (len > 0)
+ {
+ int sent;
+
+#ifndef WIN32
+ sent = send(handle->sock, ptr, len, 0);
+#else
+ /*
+ * Windows can fail on large sends, per KB article Q201213. The failure-point
+ * appears to be different in different versions of Windows, but 64k should
+ * always be safe.
+ */
+ sent = send(handle->sock, ptr, Min(len, 65536), 0);
+#endif
+
+ if (sent < 0)
+ {
+ /*
+ * Anything except EAGAIN/EWOULDBLOCK/EINTR is trouble. If it's
+ * EPIPE or ECONNRESET, assume we've lost the backend connection
+ * permanently.
+ */
+ switch (errno)
+ {
+#ifdef EAGAIN
+ case EAGAIN:
+ break; /* leaves the switch only; the len > 0 check below then returns 1 */
+#endif
+#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN))
+ case EWOULDBLOCK:
+ break; /* same handling as EAGAIN */
+#endif
+ case EINTR:
+ continue; /* interrupted: retry the send() */
+
+ case EPIPE:
+#ifdef ECONNRESET
+ case ECONNRESET:
+#endif
+ add_error_message(handle, "server closed the connection unexpectedly\n"
+ "\tThis probably means the server terminated abnormally\n"
+ "\tbefore or while processing the request.\n");
+
+ /*
+ * We used to close the socket here, but that's a bad idea
+ * since there might be unread data waiting (typically, a
+ * NOTICE message from the backend telling us it's
+ * committing hara-kiri...). Leave the socket open until
+ * pqReadData finds no more data can be read. But abandon
+ * attempt to send data.
+ */
+ handle->outEnd = 0;
+ return -1;
+
+ default:
+ add_error_message(handle, "could not send data to server");
+ /* We don't assume it's a fatal error... */
+ handle->outEnd = 0;
+ return -1;
+ }
+ }
+ else
+ {
+ ptr += sent;
+ len -= sent;
+ remaining -= sent;
+ }
+
+ if (len > 0)
+ {
+ /*
+ * We did not send it all
+ * return 1 to indicate that data is still pending.
+ * No further send() is attempted by this call.
+ */
+ result = 1;
+ break;
+ }
+ }
+
+ /* shift the remaining contents of the buffer */
+ if (remaining > 0)
+ memmove(handle->outBuffer, ptr, remaining);
+ handle->outEnd = remaining;
+
+ return result;
+}
+
+
+/*
+ * Queue a 'Q' (query) message holding the given statement on the
+ * connection and push the outgoing buffer to the Data node.
+ *
+ * Returns 0 on success, in which case the connection state becomes
+ * DN_CONNECTION_STATE_BUSY, and EOF if the buffer cannot be enlarged or
+ * sending fails.
+ *
+ * NOTE(review): a return of 1 from send_some() (data still pending) is
+ * treated as success here - confirm the pending bytes get flushed elsewhere.
+ */
+static int
+data_node_send_query(DataNodeHandle * handle, const char *query)
+{
+    /* statement text including its terminating NUL */
+    int         textLen = strlen(query) + 1;
+
+    /* length word itself plus the text */
+    int         bodyLen = 4 + textLen;
+    int         netLen;
+    char       *out;
+
+    /* one extra byte for the message type */
+    if (ensure_out_buffer_capacity(handle->outEnd + 1 + bodyLen, handle) != 0)
+    {
+        add_error_message(handle, "out of memory");
+        return EOF;
+    }
+
+    /* the buffer may have moved, so take the write position only now */
+    out = handle->outBuffer + handle->outEnd;
+    *out++ = 'Q';
+    netLen = htonl(bodyLen);
+    memcpy(out, &netLen, 4);
+    out += 4;
+    memcpy(out, query, textLen);
+    handle->outEnd += 1 + bodyLen;
+
+    /* We need response right away, so send immediately */
+    if (send_some(handle, handle->outEnd) < 0)
+        return EOF;
+
+    handle->state = DN_CONNECTION_STATE_BUSY;
+
+    return 0;
+}
+
+
+/*
+ * Queue a 'g' message carrying the global transaction id on the
+ * connection.  The message is only appended to the outgoing buffer; it is
+ * not sent by this function.
+ *
+ * Returns 0 on success and EOF if the buffer cannot be enlarged.
+ */
+static int
+data_node_send_gxid(DataNodeHandle * handle, GlobalTransactionId gxid)
+{
+    /* 4-byte length word plus 4-byte gxid */
+    int         bodyLen = 8;
+    int         netval;
+    char       *out;
+
+    /* one extra byte for the message type */
+    if (ensure_out_buffer_capacity(handle->outEnd + 1 + bodyLen, handle) != 0)
+    {
+        add_error_message(handle, "out of memory");
+        return EOF;
+    }
+
+    /* the buffer may have moved, so take the write position only now */
+    out = handle->outBuffer + handle->outEnd;
+    *out++ = 'g';
+
+    netval = htonl(bodyLen);
+    memcpy(out, &netval, 4);
+    out += 4;
+
+    netval = htonl(gxid);
+    memcpy(out, &netval, 4);
+
+    handle->outEnd += 1 + bodyLen;
+
+    return 0;
+}
+
+
+/*
+ * Queue an 's' message carrying the snapshot on the connection.  The
+ * message holds, as 4-byte network-order values: the length word, xmin,
+ * xmax, recent_global_xmin, xcnt and then the xcnt entries of xip.  It is
+ * only appended to the outgoing buffer; it is not sent by this function.
+ *
+ * Returns 0 on success and EOF if the buffer cannot be enlarged.
+ */
+static int
+data_node_send_snapshot(DataNodeHandle * handle, Snapshot snapshot)
+{
+    /* length word plus the four fixed fields */
+    int         bodyLen = 20;
+    int         netval;
+    int         i;
+    char       *out;
+
+    /* plus one 4-byte entry per in-progress xid */
+    if (snapshot->xcnt > 0)
+        bodyLen += snapshot->xcnt * 4;
+
+    /* one extra byte for the message type */
+    if (ensure_out_buffer_capacity(handle->outEnd + 1 + bodyLen, handle) != 0)
+    {
+        add_error_message(handle, "out of memory");
+        return EOF;
+    }
+
+    /* the buffer may have moved, so take the write position only now */
+    out = handle->outBuffer + handle->outEnd;
+    *out++ = 's';
+
+    netval = htonl(bodyLen);
+    memcpy(out, &netval, 4);
+    out += 4;
+
+    netval = htonl(snapshot->xmin);
+    memcpy(out, &netval, 4);
+    out += 4;
+
+    netval = htonl(snapshot->xmax);
+    memcpy(out, &netval, 4);
+    out += 4;
+
+    netval = htonl(snapshot->recent_global_xmin);
+    memcpy(out, &netval, 4);
+    out += 4;
+
+    netval = htonl(snapshot->xcnt);
+    memcpy(out, &netval, 4);
+    out += 4;
+
+    for (i = 0; i < snapshot->xcnt; i++)
+    {
+        netval = htonl(snapshot->xip[i]);
+        memcpy(out, &netval, 4);
+        out += 4;
+    }
+
+    handle->outEnd += 1 + bodyLen;
+
+    return 0;
+}
+
+/*
+ * Put the connection into the error state and remember the message so it
+ * can be reported to the client later.  Only the first message is kept: if
+ * one is already stored the new one is dropped.
+ */
+static void
+add_error_message(DataNodeHandle * handle, const char *message)
+{
+    handle->state = DN_CONNECTION_STATE_ERROR;
+    handle->transaction_status = 'E';
+
+    /* PGXCTODO append to an already stored message instead of dropping */
+    if (!handle->error)
+        handle->error = pstrdup(message);
+}
+
+/*
+ * For the specified list return an array of DataNodeHandles, acquiring
+ * connections from the pool for the handles that do not have a socket yet.
+ *
+ * The length of the returned array is the same as that of nodelist.
+ * Special case is empty or NIL nodelist: in this case all NumDataNodes
+ * handles are returned.
+ *
+ * Node numbers in nodelist are 1-based; a number outside 1..NumDataNodes
+ * raises an ERROR.  Returns NULL if the pool manager cannot provide the
+ * missing connections.  The returned array should be pfree'd when no
+ * longer needed.
+ */
+static DataNodeHandle **
+get_handles(List *nodelist)
+{
+    DataNodeHandle **result;
+    ListCell   *node_list_item;
+    List       *allocate = NIL;
+
+    /* index of the result array */
+    int         i = 0;
+
+    /*
+     * The array does not have to be zeroed - on success all items are set to
+     * correct pointers, on error the array is freed.  palloc() reports out of
+     * memory itself rather than returning NULL, so its result is not checked.
+     */
+    if (list_length(nodelist) == 0)
+    {
+        /* If node list is empty execute request on all nodes */
+        result = (DataNodeHandle **) palloc(NumDataNodes * sizeof(DataNodeHandle *));
+
+        for (i = 0; i < NumDataNodes; i++)
+        {
+            result[i] = &handles[i];
+            if (handles[i].sock == NO_SOCKET)
+                allocate = lappend_int(allocate, i + 1);
+        }
+    }
+    else
+    {
+        result = (DataNodeHandle **) palloc(list_length(nodelist) * sizeof(DataNodeHandle *));
+
+        foreach(node_list_item, nodelist)
+        {
+            int         node = lfirst_int(node_list_item);
+
+            if (node > NumDataNodes || node <= 0)
+                elog(ERROR, "Node number: %d passed is not a known node", node);
+            result[i++] = &handles[node - 1];
+            if (handles[node - 1].sock == NO_SOCKET)
+                allocate = lappend_int(allocate, node);
+        }
+    }
+
+    /* Ask the pool manager for every connection that is still missing */
+    if (allocate)
+    {
+        int         j = 0;
+        int        *fds = PoolManagerGetConnections(allocate);
+
+        if (!fds)
+        {
+            pfree(result);
+            list_free(allocate);
+            return NULL;
+        }
+        foreach(node_list_item, allocate)
+        {
+            int         node = lfirst_int(node_list_item);
+            int         fdsock = fds[j++];
+
+            data_node_init(&handles[node - 1], fdsock);
+            node_count++;
+        }
+        pfree(fds);
+        list_free(allocate);
+    }
+
+    return result;
+}
+
+
+/*
+ * Collect the handles that take part in the current transaction, so that
+ * commit or rollback can be run on them.
+ *
+ * A handle is collected when it has a socket and its transaction status is
+ * not 'I'.  Transaction is not started on nodes when read-only statement is
+ * executed on it, so those nodes need neither commit nor rollback.
+ *
+ * connections must point to an array able to store at least node_count
+ * pointers to a DataNodeHandle structure.  The function returns the number
+ * of pointers written to it; remaining items, if any, are left unchanged.
+ */
+static int
+get_transaction_nodes(DataNodeHandle ** connections)
+{
+    int         found = 0;
+    int         idx;
+
+    /* Without any acquired connection there is nothing to look at */
+    if (!node_count)
+        return 0;
+
+    for (idx = 0; idx < NumDataNodes; idx++)
+    {
+        DataNodeHandle *candidate = &handles[idx];
+
+        if (candidate->sock == NO_SOCKET || candidate->transaction_status == 'I')
+            continue;
+
+        connections[found++] = candidate;
+    }
+
+    return found;
+}
+
+
+/*
+ * Called when the backend is ending: rolls back an open transaction on the
+ * Data nodes and on GTM, releases the data node connections, closes the GTM
+ * connection and logs the collected statistics.
+ *
+ * Neither code nor arg is used.
+ */
+void
+DataNodeCleanAndRelease(int code, Datum arg)
+{
+    if (IsTransactionState())
+    {
+        GlobalTransactionId open_xid;
+
+        /* Rollback on Data Nodes first */
+        DataNodeRollback(DestNone);
+
+        /* Then rollback on GTM if transaction id opened. */
+        open_xid = (GlobalTransactionId) GetCurrentTransactionIdIfAny();
+        RollbackTranGTM(open_xid);
+    }
+
+    /* Give the data node connections back */
+    release_handles();
+
+    /* Drop the connection with GTM */
+    CloseGTM();
+
+    /* Write the collected statistics to the log */
+    stat_log();
+}
diff --git a/src/backend/pgxc/pool/poolcomm.c b/src/backend/pgxc/pool/poolcomm.c
new file mode 100644
index 0000000000..03b785f954
--- /dev/null
+++ b/src/backend/pgxc/pool/poolcomm.c
@@ -0,0 +1,614 @@
+/*-------------------------------------------------------------------------
+ *
+ * poolcomm.c
+ *
+ * Communication functions between the pool manager and session
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+#include <sys/stat.h>
+#include <sys/un.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stddef.h>
+#include "c.h"
+#include "pgxc/poolcomm.h"
+#include "utils/elog.h"
+#include "miscadmin.h"
+
+static int pool_recvbuf(PoolPort * port);
+static int pool_discardbytes(PoolPort * port, size_t len);
+
+#ifdef HAVE_UNIX_SOCKETS /* the pooler is reached over a Unix-domain socket only */
+
+#define POOLER_UNIXSOCK_PATH(path, port, sockdir) \
+ snprintf(path, sizeof(path), "%s/.s.PGPOOL.%d", \
+ ((sockdir) && *(sockdir) != '\0') ? (sockdir) : \
+ DEFAULT_PGSOCKET_DIR, \
+ (port)) /* writes "<dir>/.s.PGPOOL.<port>"; path must be an array, sizeof(path) bounds the write */
+
+static char sock_path[MAXPGPATH]; /* socket path, filled by POOLER_UNIXSOCK_PATH in Lock_AF_UNIX and pool_connect */
+
+static int Lock_AF_UNIX(unsigned short port, const char *unixSocketName);
+#endif
+
+/*
+ * Open server socket on specified port to accept connection from sessions.
+ *
+ * port and unixSocketName (the socket directory, may be NULL or empty)
+ * determine the path of the Unix-domain socket.
+ *
+ * Returns the listening socket descriptor, or -1 on failure.  A socket
+ * that was created but could not be bound or put into listening state is
+ * closed again, and a socket path that does not fit into sun_path is
+ * rejected instead of overflowing it.
+ */
+int
+pool_listen(unsigned short port, const char *unixSocketName)
+{
+#ifdef HAVE_UNIX_SOCKETS
+    int         fd;
+    int         len;
+    struct sockaddr_un unix_addr;
+
+    if (Lock_AF_UNIX(port, unixSocketName) < 0)
+        return -1;
+
+    /* sock_path holds up to MAXPGPATH bytes, sun_path is much smaller */
+    if (strlen(sock_path) >= sizeof(unix_addr.sun_path))
+        return -1;
+
+    /* create a Unix domain stream socket */
+    if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
+        return -1;
+
+    /* fill in socket address structure */
+    memset(&unix_addr, 0, sizeof(unix_addr));
+    unix_addr.sun_family = AF_UNIX;
+    strcpy(unix_addr.sun_path, sock_path);
+    len = sizeof(unix_addr.sun_family) +
+        strlen(unix_addr.sun_path) + 1;
+
+    /* bind the name to the descriptor and tell kernel we're a server */
+    if (bind(fd, (struct sockaddr *) & unix_addr, len) < 0 ||
+        listen(fd, 5) < 0)
+    {
+        /* do not leak the descriptor; keep errno of the failed call */
+        int         save_errno = errno;
+
+        close(fd);
+        errno = save_errno;
+        return -1;
+    }
+
+    return fd;
+#else
+    /* TODO support for non-unix platform */
+    ereport(FATAL,
+            (errcode(ERRCODE_INTERNAL_ERROR),
+             errmsg("pool manager only supports UNIX socket")));
+    return -1;
+#endif
+}
+
+#ifdef HAVE_UNIX_SOCKETS
+/*
+ * Build the pooler socket path for the given port and socket directory
+ * into sock_path, create the socket lock file for it and remove a socket
+ * file left at that path.  Always returns 0.
+ */
+static int
+Lock_AF_UNIX(unsigned short port, const char *unixSocketName)
+{
+    /* compose the path first, everything below works on sock_path */
+    POOLER_UNIXSOCK_PATH(sock_path, port, unixSocketName);
+
+    /* take the lock file belonging to the socket */
+    CreateSocketLockFile(sock_path, true);
+
+    /* get rid of an existing socket file; the result is not checked */
+    unlink(sock_path);
+
+    return 0;
+}
+#endif
+
+/*
+ * Connect to pooler listening on specified port.
+ *
+ * port and unixSocketName (the socket directory, may be NULL or empty)
+ * determine the path of the Unix-domain socket.
+ *
+ * Returns the connected socket descriptor, or -1 on failure.  A socket
+ * that was created but could not be connected is closed again, and a
+ * socket path that does not fit into sun_path is rejected instead of
+ * overflowing it.
+ */
+int
+pool_connect(unsigned short port, const char *unixSocketName)
+{
+#ifdef HAVE_UNIX_SOCKETS
+    int         fd;
+    int         len;
+    struct sockaddr_un unix_addr;
+
+    /* work out the server's socket path */
+    POOLER_UNIXSOCK_PATH(sock_path, port, unixSocketName);
+
+    /* sock_path holds up to MAXPGPATH bytes, sun_path is much smaller */
+    if (strlen(sock_path) >= sizeof(unix_addr.sun_path))
+        return -1;
+
+    /* create a Unix domain stream socket */
+    if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
+        return -1;
+
+    /* fill socket address structure w/server's addr */
+    memset(&unix_addr, 0, sizeof(unix_addr));
+    unix_addr.sun_family = AF_UNIX;
+    strcpy(unix_addr.sun_path, sock_path);
+    len = sizeof(unix_addr.sun_family) +
+        strlen(unix_addr.sun_path) + 1;
+
+    if (connect(fd, (struct sockaddr *) & unix_addr, len) < 0)
+    {
+        /* do not leak the descriptor; keep errno of the failed call */
+        int         save_errno = errno;
+
+        close(fd);
+        errno = save_errno;
+        return -1;
+    }
+
+    return fd;
+#else
+    /* TODO support for non-unix platform */
+    ereport(FATAL,
+            (errcode(ERRCODE_INTERNAL_ERROR),
+             errmsg("pool manager only supports UNIX socket")));
+    return -1;
+#endif
+}
+
+
+/*
+ * Return the next byte of the receive buffer, reading from the connection
+ * first for as long as the buffer holds no unread data.
+ *
+ * Returns the byte as a non-negative value, or EOF if receiving fails.
+ */
+int
+pool_getbyte(PoolPort * port)
+{
+    for (;;)
+    {
+        /* stop refilling as soon as there is something to hand out */
+        if (port->RecvPointer < port->RecvLength)
+            break;
+
+        if (pool_recvbuf(port))
+            return EOF;         /* Failed to recv data */
+    }
+
+    return (unsigned char) port->RecvBuffer[port->RecvPointer++];
+}
+
+
+/*
+ * Return the next byte of the receive buffer without reading from the
+ * connection.
+ *
+ * Returns the byte as a non-negative value, or EOF if the buffer holds no
+ * unread data.
+ */
+int
+pool_pollbyte(PoolPort * port)
+{
+    if (port->RecvPointer < port->RecvLength)
+        return (unsigned char) port->RecvBuffer[port->RecvPointer++];
+
+    /* Empty buffer */
+    return EOF;
+}
+
+
+/*
+ * Read one pooler protocol message body into s.
+ *
+ * The message starts with a 4-byte network-order length word that counts
+ * itself.  s is reset first; on return it holds the body (length word
+ * excluded) followed by a terminating NUL.
+ *
+ * maxlen, when positive, is the largest accepted value of the length word.
+ *
+ * Returns 0 on success.  A missing or invalid length word, or an incomplete
+ * body, is reported with ereport(ERROR).
+ */
+int
+pool_getmessage(PoolPort * port, StringInfo s, int maxlen)
+{
+    int32       msglen;
+
+    resetStringInfo(s);
+
+    /* Read message length word */
+    if (pool_getbytes(port, (char *) &msglen, 4) == EOF)
+    {
+        ereport(ERROR,
+                (errcode(ERRCODE_PROTOCOL_VIOLATION),
+                 errmsg("unexpected EOF within message length word")));
+        return EOF;
+    }
+
+    msglen = ntohl(msglen);
+
+    /* The length must cover itself and respect the caller's limit */
+    if (msglen < 4 || (maxlen > 0 && msglen > maxlen))
+    {
+        ereport(ERROR,
+                (errcode(ERRCODE_PROTOCOL_VIOLATION),
+                 errmsg("invalid message length")));
+        return EOF;
+    }
+
+    /* discount length itself */
+    msglen -= 4;
+
+    /* An empty body leaves s as reset above */
+    if (msglen <= 0)
+        return 0;
+
+    /*
+     * Make room for the body.  If that fails (ridiculously large message)
+     * the body is skipped on the connection before the error is re-thrown.
+     */
+    PG_TRY();
+    {
+        enlargeStringInfo(s, msglen);
+    }
+    PG_CATCH();
+    {
+        if (pool_discardbytes(port, msglen) == EOF)
+            ereport(ERROR,
+                    (errcode(ERRCODE_PROTOCOL_VIOLATION),
+                     errmsg("incomplete message from client")));
+        PG_RE_THROW();
+    }
+    PG_END_TRY();
+
+    /* And grab the message */
+    if (pool_getbytes(port, s->data, msglen) == EOF)
+    {
+        ereport(ERROR,
+                (errcode(ERRCODE_PROTOCOL_VIOLATION),
+                 errmsg("incomplete message from client")));
+        return EOF;
+    }
+
+    s->len = msglen;
+    /* Place a trailing null per StringInfo convention */
+    s->data[msglen] = '\0';
+
+    return 0;
+}
+
+
+/* --------------------------------
+ * pool_getbytes - copy exactly len bytes from the connection into s
+ *
+ * Data is taken from the receive buffer, which is refilled from the
+ * connection whenever it runs empty.
+ *
+ * returns 0 if OK, EOF if trouble
+ * --------------------------------
+ */
+int
+pool_getbytes(PoolPort * port, char *s, size_t len)
+{
+    size_t      amount;
+
+    for (; len > 0; s += amount, len -= amount)
+    {
+        /* If nothing in buffer, then recv some */
+        while (port->RecvPointer >= port->RecvLength)
+        {
+            if (pool_recvbuf(port))
+                return EOF;     /* Failed to recv data */
+        }
+
+        /* take what is buffered, but never more than still wanted */
+        amount = port->RecvLength - port->RecvPointer;
+        if (amount > len)
+            amount = len;
+
+        memcpy(s, port->RecvBuffer + port->RecvPointer, amount);
+        port->RecvPointer += amount;
+    }
+
+    return 0;
+}
+
+
+/* --------------------------------
+ * pool_discardbytes - skip exactly len bytes coming from the connection
+ *
+ * Works like pool_getbytes but the data is dropped instead of copied.
+ *
+ * returns 0 if OK, EOF if trouble
+ * --------------------------------
+ */
+static int
+pool_discardbytes(PoolPort * port, size_t len)
+{
+    size_t      amount;
+
+    for (; len > 0; len -= amount)
+    {
+        /* If nothing in buffer, then recv some */
+        while (port->RecvPointer >= port->RecvLength)
+        {
+            if (pool_recvbuf(port))
+                return EOF;     /* Failed to recv data */
+        }
+
+        /* skip what is buffered, but never more than still to be dropped */
+        amount = port->RecvLength - port->RecvPointer;
+        if (amount > len)
+            amount = len;
+
+        port->RecvPointer += amount;
+    }
+
+    return 0;
+}
+
+
+/* --------------------------------
+ * pool_recvbuf - load some bytes into the input buffer
+ *
+ * Unread data is first moved to the start of the buffer, then a single
+ * successful recv() appends to it.  The recv() is repeated only when it
+ * was interrupted (EINTR).
+ *
+ * returns 0 if OK, EOF if trouble
+ * --------------------------------
+ */
+static int
+pool_recvbuf(PoolPort * port)
+{
+    int         nread;
+
+    if (port->RecvPointer > 0)
+    {
+        if (port->RecvLength <= port->RecvPointer)
+        {
+            /* everything was consumed, start over with an empty buffer */
+            port->RecvLength = 0;
+        }
+        else
+        {
+            /* still some unread data, left-justify it in the buffer */
+            memmove(port->RecvBuffer, port->RecvBuffer + port->RecvPointer,
+                    port->RecvLength - port->RecvPointer);
+            port->RecvLength -= port->RecvPointer;
+        }
+        port->RecvPointer = 0;
+    }
+
+    /* Can fill buffer from RecvLength and upwards; retry if interrupted */
+    do
+    {
+        nread = recv(Socket(*port), port->RecvBuffer + port->RecvLength,
+                     POOL_BUFFER_SIZE - port->RecvLength, 0);
+    } while (nread < 0 && errno == EINTR);
+
+    if (nread < 0)
+    {
+        /*
+         * Report broken connection
+         */
+        ereport(LOG,
+                (errcode_for_socket_access(),
+                 errmsg("could not receive data from client: %m")));
+        return EOF;
+    }
+
+    /*
+     * EOF detected. We used to write a log message here, but it's better to
+     * expect the ultimate caller to do that.
+     */
+    if (nread == 0)
+        return EOF;
+
+    /* nread contains number of bytes read, so just incr length */
+    port->RecvLength += nread;
+    return 0;
+}
+
+
+/*
+ * Append len bytes starting at s to the connection's send buffer, flushing
+ * the buffer to the connection each time it is full.
+ *
+ * Returns 0 on success and EOF if flushing fails.
+ */
+int
+pool_putbytes(PoolPort * port, const char *s, size_t len)
+{
+    size_t      amount;
+
+    for (; len > 0; s += amount, len -= amount)
+    {
+        /* If buffer is full, then flush it out */
+        if (port->SendPointer >= POOL_BUFFER_SIZE && pool_flush(port))
+            return EOF;
+
+        /* copy as much as fits, but never more than is left to put */
+        amount = POOL_BUFFER_SIZE - port->SendPointer;
+        if (amount > len)
+            amount = len;
+
+        memcpy(port->SendBuffer + port->SendPointer, s, amount);
+        port->SendPointer += amount;
+    }
+
+    return 0;
+}
+
+
+/* --------------------------------
+ * pool_flush - flush pending output
+ *
+ * Sends the content of the send buffer over the connection and empties
+ * the buffer.  A send interrupted by a signal (EINTR) is retried.
+ *
+ * On any other send failure the buffered data is dropped.  The failure is
+ * raised with ereport(ERROR) unless the same errno was already reported
+ * by the previous failing call, in which case EOF is returned.
+ *
+ * returns 0 if OK, EOF if trouble
+ * --------------------------------
+ */
+int
+pool_flush(PoolPort * port)
+{
+    static int  last_reported_send_errno = 0;
+
+    char       *bufptr = port->SendBuffer;
+    char       *bufend = port->SendBuffer + port->SendPointer;
+
+    while (bufptr < bufend)
+    {
+        int         r;
+
+        r = send(Socket(*port), bufptr, bufend - bufptr, 0);
+
+        if (r <= 0)
+        {
+            if (errno == EINTR)
+                continue;       /* Ok if we were interrupted */
+
+            /*
+             * We drop the buffered data anyway so that processing can
+             * continue, even though we'll probably quit soon.  This has to
+             * happen before the error is raised: ereport(ERROR) does not
+             * return to this function, so a reset placed after it would be
+             * skipped and stale data would stay in the buffer.
+             */
+            port->SendPointer = 0;
+
+            if (errno != last_reported_send_errno)
+            {
+                last_reported_send_errno = errno;
+                ereport(ERROR,
+                        (errcode_for_socket_access(),
+                         errmsg("could not send data to client: %m")));
+            }
+
+            return EOF;
+        }
+
+        last_reported_send_errno = 0;   /* reset after any successful send */
+        bufptr += r;
+    }
+
+    port->SendPointer = 0;
+    return 0;
+}
+
+
+/*
+ * Put the pooler protocol message into the connection buffer.
+ *
+ * The message consists of the one-byte msgtype, a 4-byte network-order
+ * length word (payload length plus the 4 bytes of the word itself) and the
+ * len bytes of payload at s.  The message is buffered with pool_putbytes();
+ * this function does not flush it.
+ *
+ * Returns 0 on success and EOF if buffering fails.
+ */
+int
+pool_putmessage(PoolPort * port, char msgtype, const char *s, size_t len)
+{
+    /* exactly 4 bytes go on the wire, so use the fixed-width type */
+    uint32      n32;
+
+    if (pool_putbytes(port, &msgtype, 1))
+        return EOF;
+
+    n32 = htonl((uint32) (len + 4));
+    if (pool_putbytes(port, (char *) &n32, 4))
+        return EOF;
+
+    if (pool_putbytes(port, s, len))
+        return EOF;
+
+    return 0;
+}
+
+/* fd message on the wire: 1-byte code ('f') + 4-byte length word (value 8) + 4-byte connection count */
+#define SEND_MSG_BUFFER_SIZE 9
+
+
+/*
+ * Build up a message carrying file descriptors and send them over specified
+ * connection.
+ *
+ * The regular data of the message is the code 'f', a length word of 8 and
+ * count, the latter two as 4-byte network-order values.  When count is
+ * positive the count descriptors in fds travel as SCM_RIGHTS ancillary
+ * data; otherwise the message carries no ancillary data.
+ *
+ * Returns 0 on success, EOF if memory for the ancillary data cannot be
+ * allocated or the message is not sent completely.
+ */
+int
+pool_sendfds(PoolPort * port, int *fds, int count)
+{
+    struct iovec iov[1];
+    struct msghdr msg;
+    char        buf[SEND_MSG_BUFFER_SIZE];
+    uint32      n32;
+    struct cmsghdr *cmptr = NULL;
+    int         sent;
+
+    buf[0] = 'f';
+    n32 = htonl((uint32) 8);
+    memcpy(buf + 1, &n32, 4);
+    n32 = htonl((uint32) count);
+    memcpy(buf + 5, &n32, 4);
+
+    /* start from a fully defined header: no name, no control, no flags */
+    memset(&msg, 0, sizeof(msg));
+    iov[0].iov_base = buf;
+    iov[0].iov_len = SEND_MSG_BUFFER_SIZE;
+    msg.msg_iov = iov;
+    msg.msg_iovlen = 1;
+
+    if (count > 0)
+    {
+        size_t      payload = count * sizeof(int);
+
+        /*
+         * CMSG_SPACE/CMSG_LEN account for the alignment padding the
+         * platform requires; the buffer is zeroed so padding bytes are
+         * defined.
+         */
+        if ((cmptr = calloc(1, CMSG_SPACE(payload))) == NULL)
+            return EOF;
+        cmptr->cmsg_level = SOL_SOCKET;
+        cmptr->cmsg_type = SCM_RIGHTS;
+        cmptr->cmsg_len = CMSG_LEN(payload);
+        msg.msg_control = (caddr_t) cmptr;
+        msg.msg_controllen = CMSG_SPACE(payload);
+        /* the fds to pass */
+        memcpy(CMSG_DATA(cmptr), fds, payload);
+    }
+
+    sent = sendmsg(Socket(*port), &msg, 0);
+
+    /* the control buffer is no longer needed, whatever the outcome */
+    free(cmptr);
+
+    if (sent != SEND_MSG_BUFFER_SIZE)
+        return EOF;
+
+    return 0;
+}
+
+
+/*
+ * Read a message from the specified connection carrying file descriptors.
+ *
+ * count is the number of descriptors expected; on success they are stored
+ * in fds and 0 is returned.
+ *
+ * Returns EOF if the connection was closed, the control buffer cannot be
+ * allocated, or the pooler answered with a connection count of 0 (it could
+ * not fulfill the request).  A receive error or a malformed message is
+ * raised with ereport(ERROR); the control buffer is released before the
+ * error is raised, because ereport(ERROR) does not return to this function
+ * and the buffer comes from malloc().
+ */
+int
+pool_recvfds(PoolPort * port, int *fds, int count)
+{
+    int         r;
+    uint32      n32;
+    char        buf[SEND_MSG_BUFFER_SIZE];
+    struct iovec iov[1];
+    struct msghdr msg;
+    size_t      payload = (count > 0) ? count * sizeof(int) : 0;
+    struct cmsghdr *cmptr;
+    struct cmsghdr *cmsg;
+
+    /* CMSG_SPACE accounts for the alignment padding the platform requires */
+    cmptr = calloc(1, CMSG_SPACE(payload));
+    if (cmptr == NULL)
+        return EOF;
+
+    /* start from a fully defined header: no name, no flags */
+    memset(&msg, 0, sizeof(msg));
+    iov[0].iov_base = buf;
+    iov[0].iov_len = SEND_MSG_BUFFER_SIZE;
+    msg.msg_iov = iov;
+    msg.msg_iovlen = 1;
+    msg.msg_control = (caddr_t) cmptr;
+    msg.msg_controllen = CMSG_SPACE(payload);
+
+    r = recvmsg(Socket(*port), &msg, 0);
+    if (r < 0)
+    {
+        /* keep errno for %m across the free() */
+        int         save_errno = errno;
+
+        free(cmptr);
+        errno = save_errno;
+
+        /*
+         * Report broken connection
+         */
+        ereport(ERROR,
+                (errcode_for_socket_access(),
+                 errmsg("could not receive data from client: %m")));
+        return EOF;
+    }
+    else if (r == 0)
+    {
+        goto failure;
+    }
+    else if (r != SEND_MSG_BUFFER_SIZE)
+    {
+        free(cmptr);
+        ereport(ERROR,
+                (errcode(ERRCODE_PROTOCOL_VIOLATION),
+                 errmsg("incomplete message from client")));
+        return EOF;
+    }
+
+    /* Verify response */
+    if (buf[0] != 'f')
+    {
+        free(cmptr);
+        ereport(ERROR,
+                (errcode(ERRCODE_PROTOCOL_VIOLATION),
+                 errmsg("unexpected message code")));
+        return EOF;
+    }
+
+    memcpy(&n32, buf + 1, 4);
+    n32 = ntohl(n32);
+    if (n32 != 8)
+    {
+        free(cmptr);
+        ereport(ERROR,
+                (errcode(ERRCODE_PROTOCOL_VIOLATION),
+                 errmsg("invalid message size")));
+        return EOF;
+    }
+
+    /*
+     * If connection count is 0 it means pool does not have connections
+     * to fulfill request. Otherwise number of returned connections
+     * should be equal to requested count. If it is not the case consider
+     * this a protocol violation. (Probably connection went out of sync)
+     */
+    memcpy(&n32, buf + 5, 4);
+    n32 = ntohl(n32);
+    if (n32 == 0)
+    {
+        ereport(LOG,
+                (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+                 errmsg("failed to acquire connections")));
+        goto failure;
+    }
+
+    if (count < 0 || n32 != (uint32) count)
+    {
+        free(cmptr);
+        ereport(ERROR,
+                (errcode(ERRCODE_PROTOCOL_VIOLATION),
+                 errmsg("unexpected connection count")));
+        return EOF;
+    }
+
+    /* Copy descriptors only if they really arrived as ancillary data */
+    cmsg = CMSG_FIRSTHDR(&msg);
+    if (cmsg == NULL ||
+        cmsg->cmsg_level != SOL_SOCKET ||
+        cmsg->cmsg_type != SCM_RIGHTS ||
+        cmsg->cmsg_len < CMSG_LEN(payload))
+    {
+        free(cmptr);
+        ereport(ERROR,
+                (errcode(ERRCODE_PROTOCOL_VIOLATION),
+                 errmsg("file descriptors missing from message")));
+        return EOF;
+    }
+
+    memcpy(fds, CMSG_DATA(cmsg), payload);
+    free(cmptr);
+    return 0;
+
+failure:
+    free(cmptr);
+    return EOF;
+}
diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c
new file mode 100644
index 0000000000..02e5ddd5cd
--- /dev/null
+++ b/src/backend/pgxc/pool/poolmgr.c
@@ -0,0 +1,1403 @@
+/*-------------------------------------------------------------------------
+ *
+ * poolmgr.c
+ *
+ * Connection pool manager handles connections to DataNodes
+ *
+ * The pooler runs as a separate process and is forked off from a
+ * coordinator postmaster. If the coordinator needs a connection from a
+ * data node, it asks for one from the pooler, which maintains separate
+ * pools for each data node. A group of connections can be requested in
+ * a single request, and the pooler returns a list of file descriptors
+ * to use for the connections.
+ *
+ * Note the current implementation does not yet shrink the pool over time
+ * as connections are idle. Also, it does not queue requests; if a
+ * connection is unavailable, it will simply fail. This should be implemented
+ * one day, although there is a chance for deadlocks. For now, limiting
+ * connections should be done between the application and coordinator.
+ * Still, this is useful to avoid having to re-establish connections to the
+ * data nodes all the time for multiple coordinator backend sessions.
+ *
+ * The term "agent" here refers to a session manager, one for each backend
+ * coordinator connection to the pooler. It will contain a list of connections
+ * allocated to a session, at most one per data node.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <signal.h>
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "pgxc/poolmgr.h"
+#include "utils/builtins.h"
+#include "utils/memutils.h"
+#include "lib/stringinfo.h"
+#include "libpq/pqformat.h"
+#include "pgxc/locator.h"
+#include "../interfaces/libpq/libpq-fe.h"
+#include "postmaster/postmaster.h" /* For UnixSocketDir */
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+
+/*
+ * Configuration options; the initializers below are the built-in defaults.
+ * NumDataNodes sizes the per-node arrays in this file; MinPoolSize and
+ * MaxPoolSize bound each node pool in grow_pool(); PoolerPort is passed to
+ * pool_listen() and pool_connect().
+ */
+int NumDataNodes = 2;
+int MinPoolSize = 1;
+int MaxPoolSize = 100;
+int PoolerPort = 6667;
+
+bool PersistentConnections = false; /* not referenced in this file */
+
+/*
+ * The memory context; created in PoolManagerInit() and deleted in
+ * PoolManagerDestroy()
+ */
+static MemoryContext PoolerMemoryContext = NULL;
+
+/*
+ * Connection info: comma-separated lists that PoolManagerInit() splits
+ * into one value per data node
+ */
+char *DataNodeHosts = NULL;
+char *DataNodePorts = NULL;
+char *DataNodeUsers = NULL;
+char *DataNodePwds = NULL;
+
+/*
+ * Connection info list: NumDataNodes entries, filled by PoolManagerInit()
+ * and read by grow_pool() when building a connection string
+ */
+static DataNodeConnectionInfo *connectionInfos;
+
+/* Pool to all the databases (linked list, newest entry at the head) */
+static DatabasePool *databasePools = NULL;
+
+/*
+ * PoolAgents: poolAgents has room for MaxConnections pointers, of which
+ * the first agentCount are in use
+ */
+static int agentCount = 0;
+static PoolAgent **poolAgents;
+
+static PoolHandle *Handle = NULL; /* session's handle, saved by PoolManagerConnect() */
+
+static int server_fd = -1; /* listening socket, opened in PoolerLoop() */
+
+static void agent_init(PoolAgent * agent, const char *database, List *nodes);
+static void agent_destroy(PoolAgent * agent);
+static void agent_create(void);
+static void agent_handle_input(PoolAgent * agent, StringInfo s);
+static DatabasePool *create_database_pool(const char *database, List *nodes);
+static void insert_database_pool(DatabasePool * pool);
+static int destroy_database_pool(const char *database);
+static DatabasePool *find_database_pool(const char *database);
+static DatabasePool *remove_database_pool(const char *database);
+static int *agent_acquire_connections(PoolAgent * agent, List *nodelist);
+static DataNodePoolSlot *acquire_connection(DatabasePool * dbPool, int node);
+static void agent_release_connections(PoolAgent * agent, bool clean);
+static void release_connection(DatabasePool * dbPool, DataNodePoolSlot * slot, int index, bool clean);
+static void destroy_slot(DataNodePoolSlot * slot);
+static void grow_pool(DatabasePool * dbPool, int index);
+static void destroy_node_pool(DataNodePool * node_pool);
+static void PoolerLoop(void);
+
+/* Signal handlers (installed in PoolManagerInit) */
+static void pooler_die(SIGNAL_ARGS);
+static void pooler_quickdie(SIGNAL_ARGS);
+
+/*
+ * Check status of connection; used by acquire_connection(), which treats
+ * 0 as "no data pending", a negative value as an error and a positive
+ * value as unexpected data.
+ * NOTE(review): declared by hand here, presumably because the included
+ * libpq-fe.h does not declare it -- confirm.
+ */
+extern int pqReadReady(PGconn * conn);
+
+/*
+ * Flags set by interrupt handlers for later service in the main loop.
+ * shutdown_requested is set by pooler_die() and tested by PoolerLoop()
+ * after each select() call.
+ */
+static volatile sig_atomic_t shutdown_requested = false;
+
+
+/*
+ * Split one comma-separated data node setting into per-node values.
+ *
+ * value     raw setting string; NULL (setting never assigned) is reported
+ *           the same way as a malformed list
+ * guc_name  name of the setting, used only in the error message
+ * result    array of NumDataNodes entries to fill; every entry receives
+ *           its own palloc'd copy in the current memory context
+ *
+ * A list holding exactly one entry is replicated to every data node and
+ * entries beyond NumDataNodes are ignored.  A missing, malformed or empty
+ * list, or a list with more than one but fewer than NumDataNodes entries,
+ * is reported as FATAL.
+ */
+static void
+parse_node_setting(const char *value, const char *guc_name, char **result)
+{
+    char       *rawstring;
+    List       *elemlist = NIL;
+    ListCell   *l;
+    int         count = 0;
+
+    if (value == NULL)
+        ereport(FATAL,
+                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                 errmsg("invalid list syntax for \"%s\"", guc_name)));
+
+    /* SplitIdentifierString scribbles on its input, so work on a copy */
+    rawstring = pstrdup(value);
+
+    if (!SplitIdentifierString(rawstring, ',', &elemlist))
+        ereport(FATAL,
+                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                 errmsg("invalid list syntax for \"%s\"", guc_name)));
+
+    foreach(l, elemlist)
+    {
+        result[count] = pstrdup((char *) lfirst(l));
+
+        /* Ignore extra entries, if any */
+        if (++count >= NumDataNodes)
+            break;
+    }
+    list_free(elemlist);
+    pfree(rawstring);
+
+    if (count == 1)
+    {
+        /* A single value applies to every data node */
+        for (; count < NumDataNodes; count++)
+            result[count] = pstrdup(result[0]);
+    }
+    else if (count == 0 || count < NumDataNodes)
+    {
+        /* Empty list, or neither one value nor one value per node */
+        ereport(FATAL,
+                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                 errmsg("invalid list syntax for \"%s\"", guc_name)));
+    }
+}
+
+
+/*
+ * Initialize internal structures and run the pooler main loop
+ *
+ * Creates the pooler memory context, installs the signal handlers,
+ * allocates the agent array and the per-node connection info, parses the
+ * data_node_hosts / data_node_ports / data_node_users /
+ * data_node_passwords settings and then enters PoolerLoop().
+ *
+ * palloc and pstrdup report out-of-memory themselves, so their results
+ * are not checked for NULL here.
+ *
+ * Returns 0 if PoolerLoop() returns (it does so only when the listening
+ * socket cannot be created).
+ */
+int
+PoolManagerInit(void)
+{
+    char      **values;
+    int         i;
+    MemoryContext old_context;
+
+    elog(DEBUG1, "Pooler process is started: %d", getpid());
+
+    /*
+     * Set up memory context for the pooler
+     */
+    PoolerMemoryContext = AllocSetContextCreate(TopMemoryContext,
+                                                "PoolerMemoryContext",
+                                                ALLOCSET_DEFAULT_MINSIZE,
+                                                ALLOCSET_DEFAULT_INITSIZE,
+                                                ALLOCSET_DEFAULT_MAXSIZE);
+
+    /*
+     * If possible, make this process a group leader, so that the postmaster
+     * can signal any child processes too.  (pool manager probably never has
+     * any child processes, but for consistency we make all postmaster child
+     * processes do this.)
+     */
+#ifdef HAVE_SETSID
+    if (setsid() < 0)
+        elog(FATAL, "setsid() failed: %m");
+#endif
+
+    /*
+     * Properly accept or ignore signals the postmaster might send us
+     */
+    pqsignal(SIGINT, pooler_die);
+    pqsignal(SIGTERM, pooler_die);
+    pqsignal(SIGQUIT, pooler_quickdie);
+    pqsignal(SIGHUP, SIG_IGN);
+    /* TODO other signal handlers */
+
+    /* We allow SIGQUIT (quickdie) at all times */
+#ifdef HAVE_SIGPROCMASK
+    sigdelset(&BlockSig, SIGQUIT);
+#else
+    BlockSig &= ~(sigmask(SIGQUIT));
+#endif
+
+    /*
+     * Unblock signals (they were blocked when the postmaster forked us)
+     */
+    PG_SETMASK(&UnBlockSig);
+
+    /*
+     * Allocate pooler structures in the Pooler context.  The context stays
+     * current while PoolerLoop() runs, so agents and pools created there
+     * are allocated in it as well.
+     */
+    old_context = MemoryContextSwitchTo(PoolerMemoryContext);
+
+    poolAgents = (PoolAgent **) palloc(MaxConnections * sizeof(PoolAgent *));
+    connectionInfos = (DataNodeConnectionInfo *) palloc(NumDataNodes * sizeof(DataNodeConnectionInfo));
+
+    /* Scratch array reused for each of the four settings */
+    values = (char **) palloc(NumDataNodes * sizeof(char *));
+
+    parse_node_setting(DataNodeHosts, "data_node_hosts", values);
+    for (i = 0; i < NumDataNodes; i++)
+        connectionInfos[i].host = values[i];
+
+    parse_node_setting(DataNodePorts, "data_node_ports", values);
+    for (i = 0; i < NumDataNodes; i++)
+        connectionInfos[i].port = values[i];
+
+    parse_node_setting(DataNodeUsers, "data_node_users", values);
+    for (i = 0; i < NumDataNodes; i++)
+        connectionInfos[i].uname = values[i];
+
+    parse_node_setting(DataNodePwds, "data_node_passwords", values);
+    for (i = 0; i < NumDataNodes; i++)
+        connectionInfos[i].password = values[i];
+
+    pfree(values);
+
+    PoolerLoop();
+
+    /* Do not leave the pooler context current for the caller */
+    MemoryContextSwitchTo(old_context);
+    return 0;
+}
+
+
+/*
+ * Destroy internal structures
+ *
+ * Deletes the pooler memory context if it exists and clears the pointer
+ * to it.  Always returns 0.
+ */
+int
+PoolManagerDestroy(void)
+{
+    if (PoolerMemoryContext != NULL)
+    {
+        MemoryContextDelete(PoolerMemoryContext);
+        PoolerMemoryContext = NULL;
+    }
+
+    return 0;
+}
+
+
+/*
+ * Get handle to pool manager
+ * Invoked from Postmaster's main loop just before forking off new session
+ * Returned PoolHandle structure will be inherited by session process
+ *
+ * Connects to the pooler with pool_connect() and wraps the socket in a
+ * malloc'd PoolHandle (release it with free(), as PoolManagerCloseHandle
+ * does).  Failures are reported with ereport(ERROR).
+ * NOTE(review): ereport(ERROR) normally does not return, so the
+ * "return NULL" statements are presumably only a fallback -- confirm.
+ */
+PoolHandle *
+GetPoolManagerHandle(void)
+{
+    PoolHandle *handle;
+    int         fdsock;
+
+    /* Connect to the pooler */
+    fdsock = pool_connect(PoolerPort, UnixSocketDir);
+    if (fdsock < 0)
+    {
+        int         saved_errno = errno;
+
+        ereport(ERROR,
+                (errcode(ERRCODE_CONNECTION_FAILURE),
+                 errmsg("failed to connect to pool manager: %m")));
+        errno = saved_errno;
+        return NULL;
+    }
+
+    /*
+     * Allocate handle
+     *
+     * XXX we may change malloc here to palloc but first ensure
+     * the CurrentMemoryContext is properly set.
+     * The handle allocated just before new session is forked off and
+     * inherited by the session process. It should remain valid for all
+     * the session lifetime.
+     */
+    handle = (PoolHandle *) malloc(sizeof(PoolHandle));
+    if (!handle)
+    {
+        /* Do not leak the socket we have just connected */
+        close(fdsock);
+        ereport(ERROR,
+                (errcode(ERRCODE_OUT_OF_MEMORY),
+                 errmsg("out of memory")));
+        return NULL;
+    }
+
+    handle->port.fdsock = fdsock;
+    handle->port.RecvLength = 0;
+    handle->port.RecvPointer = 0;
+    handle->port.SendPointer = 0;
+
+    return handle;
+}
+
+
+/*
+ * Close handle
+ *
+ * Closes the handle's socket and releases the handle itself with free().
+ */
+void
+PoolManagerCloseHandle(PoolHandle * handle)
+{
+    int         sock = Socket(handle->port);
+
+    close(sock);
+    free(handle);
+}
+
+
+/*
+ * Create agent
+ *
+ * Accepts one pending connection on server_fd and registers a new
+ * PoolAgent for it at the end of poolAgents.  The connection is closed
+ * and the event logged when the agent array (MaxConnections entries,
+ * allocated in PoolManagerInit) is already full.
+ */
+static void
+agent_create(void)
+{
+    int         new_fd;
+    PoolAgent  *agent;
+
+    new_fd = accept(server_fd, NULL, NULL);
+    if (new_fd < 0)
+    {
+        int         saved_errno = errno;
+
+        ereport(LOG,
+                (errcode(ERRCODE_CONNECTION_FAILURE),
+                 errmsg("pool manager failed to accept connection: %m")));
+        errno = saved_errno;
+        return;
+    }
+
+    /* Never write past the end of the agent array */
+    if (agentCount >= MaxConnections)
+    {
+        close(new_fd);
+        ereport(LOG,
+                (errcode(ERRCODE_TOO_MANY_CONNECTIONS),
+                 errmsg("pool manager rejected connection: too many agents")));
+        return;
+    }
+
+    /* Allocate agent; palloc reports out-of-memory itself */
+    agent = (PoolAgent *) palloc(sizeof(PoolAgent));
+
+    agent->port.fdsock = new_fd;
+    agent->port.RecvLength = 0;
+    agent->port.RecvPointer = 0;
+    agent->port.SendPointer = 0;
+    agent->pool = NULL;
+    agent->connections = NULL;
+
+    /* Append new agent to the list */
+    poolAgents[agentCount++] = agent;
+}
+
+
+/*
+ * Associate session with specified database and respective connection pool
+ * Invoked from Session process
+ *
+ * handle    handle obtained from GetPoolManagerHandle(); it is saved in
+ *           the file-level Handle for the later calls that take no handle
+ * database  database name, sent to the pooler in a 'c' message
+ * nodes     not used by this function
+ */
+void
+PoolManagerConnect(PoolHandle * handle, const char *database, List *nodes)
+{
+    /* Check the argument; the saved Handle is not set until below */
+    Assert(handle);
+    Assert(database);
+
+    /* Save the handle */
+    Handle = handle;
+
+    /* Send database name followed by \0 terminator */
+    pool_putmessage(&handle->port, 'c', database, strlen(database) + 1);
+    pool_flush(&handle->port);
+}
+
+
+/*
+ * Init PoolAgent
+ *
+ * Binds the agent to the pool of the given database.  Connections still
+ * held from a previous binding are released as not clean first.  The pool
+ * is looked up by name and created from the node list when missing.
+ */
+static void
+agent_init(PoolAgent * agent, const char *database, List *nodes)
+{
+    DatabasePool *existing;
+
+    Assert(agent);
+    Assert(database);
+    Assert(list_length(nodes) > 0);
+
+    /* Drop whatever the agent still holds from an earlier connect */
+    if (agent->pool != NULL)
+        agent_release_connections(agent, false);
+
+    /* Use the existing pool for this database when there is one */
+    existing = find_database_pool(database);
+    agent->pool = existing;
+    if (existing != NULL)
+        return;
+
+    /* Otherwise build a new one */
+    agent->pool = create_database_pool(database, nodes);
+}
+
+
+/*
+ * Destroy PoolAgent
+ *
+ * Closes the agent's socket and releases its connections as not clean.
+ * If the agent is registered in poolAgents its memory is freed and the
+ * last registered agent is moved into the vacated array position.
+ */
+static void
+agent_destroy(PoolAgent * agent)
+{
+    int         pos;
+
+    Assert(agent);
+
+    close(Socket(agent->port));
+
+    /* Discard connections if any remaining */
+    if (agent->pool)
+        agent_release_connections(agent, false);
+
+    /* Locate the agent; at most one match is expected */
+    for (pos = 0; pos < agentCount; pos++)
+    {
+        if (poolAgents[pos] == agent)
+            break;
+    }
+
+    /* Not registered: nothing to free or compact */
+    if (pos == agentCount)
+        return;
+
+    /* Free memory */
+    if (agent->connections)
+        pfree(agent->connections);
+    pfree(agent);
+
+    /* Shrink the list and move the last agent into the freed position */
+    agentCount--;
+    if (pos < agentCount)
+        poolAgents[pos] = poolAgents[agentCount];
+}
+
+
+/*
+ * Release handle to pool manager
+ *
+ * Sends a 'd' (disconnect) message on the handle, closes its socket and
+ * frees it.  The handle comes from malloc() in GetPoolManagerHandle(), so
+ * it is released with free().  If it is also the handle saved by
+ * PoolManagerConnect(), that saved pointer is cleared so it cannot be
+ * used after the free.
+ */
+void
+PoolManagerDisconnect(PoolHandle * handle)
+{
+    Assert(handle);
+
+    /* Talk to the pooler through the handle being released */
+    pool_putmessage(&handle->port, 'd', NULL, 0);
+    pool_flush(&handle->port);
+
+    close(Socket(handle->port));
+
+    if (Handle == handle)
+        Handle = NULL;
+
+    free(handle);
+}
+
+
+/*
+ * Get pooled connections
+ *
+ * Sends a 'g' message holding the node count followed by the node numbers
+ * of nodelist (all in network byte order) over the saved Handle, then
+ * receives one file descriptor per requested node.
+ *
+ * Returns a palloc'd array with list_length(nodelist) descriptors, or
+ * NULL when pool_recvfds() reports failure.
+ */
+int *
+PoolManagerGetConnections(List *nodelist)
+{
+    int         count = list_length(nodelist);
+    int         request[count + 1];
+    int         pos = 0;
+    ListCell   *cell;
+    int        *fds;
+
+    Assert(Handle);
+    Assert(list_length(nodelist) > 0);
+
+    /* Prepare and send message to pool manager */
+    request[pos++] = htonl(count);
+    foreach(cell, nodelist)
+    {
+        request[pos++] = htonl(lfirst_int(cell));
+    }
+    pool_putmessage(&Handle->port, 'g', (char *) request, sizeof(int) * (count + 1));
+    pool_flush(&Handle->port);
+
+    /* Receive response */
+    fds = (int *) palloc(sizeof(int) * count);
+    if (fds == NULL)
+    {
+        ereport(ERROR,
+                (errcode(ERRCODE_OUT_OF_MEMORY),
+                 errmsg("out of memory")));
+    }
+    if (pool_recvfds(&Handle->port, fds, count) != 0)
+    {
+        pfree(fds);
+        return NULL;
+    }
+
+    return fds;
+}
+
+
+/*
+ * Handle messages to agent
+ *
+ * Reads and processes every message currently available on the agent's
+ * connection:
+ *   'c'  connect to the pool of the named database
+ *   'd'  disconnect; the agent is destroyed
+ *   'g'  acquire connections to the listed nodes and send their
+ *        descriptors back
+ *   'r'  release the agent's connections back to the pool as clean
+ * Any other byte (EOF or protocol violation) destroys the agent.
+ *
+ * The agent must not be touched after agent_destroy(), so both paths that
+ * destroy it return immediately.
+ */
+static void
+agent_handle_input(PoolAgent * agent, StringInfo s)
+{
+    int         qtype;
+    const char *database;
+    int         nodecount;
+    List       *nodelist;
+    int        *fds;
+    int         i;
+
+    qtype = pool_getbyte(&agent->port);
+
+    /*
+     * We can have multiple messages, so handle them all
+     */
+    for (;;)
+    {
+        switch (qtype)
+        {
+            case 'c':           /* CONNECT */
+                pool_getmessage(&agent->port, s, 0);
+                database = pq_getmsgstring(s);
+                agent_init(agent, database, GetAllNodes());
+                pq_getmsgend(s);
+                break;
+            case 'd':           /* DISCONNECT */
+                pool_getmessage(&agent->port, s, 4);
+                agent_destroy(agent);
+                pq_getmsgend(s);
+                /* the agent is gone, do not poll its port again */
+                return;
+            case 'g':           /* GET CONNECTIONS */
+                pool_getmessage(&agent->port, s, 4 * NumDataNodes + 8);
+                nodecount = pq_getmsgint(s, 4);
+                /* start from an empty list for every request */
+                nodelist = NIL;
+                for (i = 0; i < nodecount; i++)
+                {
+                    nodelist = lappend_int(nodelist, pq_getmsgint(s, 4));
+                }
+                pq_getmsgend(s);
+
+                /*
+                 * In case of error agent_acquire_connections will log
+                 * the error and return NULL.  A request naming no nodes
+                 * is answered like a failure.
+                 */
+                fds = NULL;
+                if (nodelist != NIL)
+                    fds = agent_acquire_connections(agent, nodelist);
+                list_free(nodelist);
+                pool_sendfds(&agent->port, fds, fds ? nodecount : 0);
+                if (fds)
+                    pfree(fds);
+                break;
+            case 'r':           /* RELEASE CONNECTIONS */
+                pool_getmessage(&agent->port, s, 4);
+                pq_getmsgend(s);
+                agent_release_connections(agent, true);
+                break;
+            default:            /* EOF or protocol violation */
+                agent_destroy(agent);
+                return;
+        }
+
+        /* avoid reading from connection */
+        if ((qtype = pool_pollbyte(&agent->port)) == EOF)
+            break;
+    }
+}
+
+
+/*
+ * Acquire connections for an agent
+ *
+ * agent     agent of the requesting session
+ * nodelist  integer list of 1-based data node numbers
+ *
+ * For every listed node the agent keeps at most one connection: a
+ * connection it already holds is reused, otherwise one is taken from the
+ * agent's database pool.
+ *
+ * Returns a palloc'd array with one socket descriptor per list entry, in
+ * list order, or NULL on failure.  Failures are logged: the agent has no
+ * pool yet, a node number is outside 1..NumDataNodes, or the pool cannot
+ * supply a connection.  Connections acquired before a failure stay
+ * assigned to the agent.
+ */
+static int *
+agent_acquire_connections(PoolAgent * agent, List *nodelist)
+{
+    int         i;
+    int        *result;
+    ListCell   *nodelist_item;
+
+    Assert(agent);
+    Assert(nodelist);
+
+    /* A request may arrive before the session has sent its connect message */
+    if (agent->pool == NULL)
+    {
+        ereport(LOG,
+                (errcode(ERRCODE_PROTOCOL_VIOLATION),
+                 errmsg("pool agent is not connected to a database")));
+        return NULL;
+    }
+
+    /* Node numbers index per-node arrays below, so validate them first */
+    foreach(nodelist_item, nodelist)
+    {
+        int         node = lfirst_int(nodelist_item);
+
+        if (node < 1 || node > NumDataNodes)
+        {
+            ereport(LOG,
+                    (errcode(ERRCODE_PROTOCOL_VIOLATION),
+                     errmsg("invalid data node number %d", node)));
+            return NULL;
+        }
+    }
+
+    /* Allocate memory; palloc reports out-of-memory itself */
+    result = (int *) palloc(list_length(nodelist) * sizeof(int));
+
+    /* initialize connection if it is not initialized yet */
+    if (!agent->connections)
+    {
+        agent->connections = (DataNodePoolSlot **) palloc(NumDataNodes * sizeof(DataNodePoolSlot *));
+
+        for (i = 0; i < NumDataNodes; i++)
+            agent->connections[i] = NULL;
+    }
+
+    /* Initialize result */
+    i = 0;
+    foreach(nodelist_item, nodelist)
+    {
+        int         node = lfirst_int(nodelist_item);
+
+        /* Acquire from the pool if none */
+        if (agent->connections[node - 1] == NULL)
+        {
+            DataNodePoolSlot *slot = acquire_connection(agent->pool, node);
+
+            /* Handle failure */
+            if (slot == NULL)
+            {
+                pfree(result);
+                return NULL;
+            }
+
+            /* Store in the descriptor */
+            agent->connections[node - 1] = slot;
+        }
+
+        result[i++] = PQsocket((PGconn *) agent->connections[node - 1]->conn);
+    }
+
+    return result;
+}
+
+
+/*
+ * Return connections back to the pool
+ *
+ * Sends an 'r' (release) message without payload over the handle saved
+ * by PoolManagerConnect().
+ */
+void
+PoolManagerReleaseConnections()
+{
+    PoolHandle *session = Handle;
+
+    Assert(session);
+
+    pool_putmessage(&session->port, 'r', NULL, 0);
+    pool_flush(&session->port);
+}
+
+
+/*
+ * Release connections
+ *
+ * Hands every connection held by the agent to release_connection(),
+ * passing the clean flag through, and clears the agent's entry for it.
+ * Does nothing when the agent has no connection array yet.
+ */
+static void
+agent_release_connections(PoolAgent * agent, bool clean)
+{
+    int         node;
+
+    if (agent->connections == NULL)
+        return;
+
+    /* Walk the per-node entries */
+    for (node = 0; node < NumDataNodes; node++)
+    {
+        DataNodePoolSlot *held = agent->connections[node];
+
+        if (held != NULL)
+            release_connection(agent->pool, held, node, clean);
+
+        agent->connections[node] = NULL;
+    }
+}
+
+
+/*
+ * Create a pool for a database and insert it into the list
+ *
+ * database  database name; the pool keeps its own copy
+ * nodes     non-empty integer list of 1-based data node numbers whose
+ *           node pools are populated right away via grow_pool()
+ *
+ * Returns the pool already registered for the database if there is one,
+ * otherwise the newly created pool.  palloc and pstrdup report
+ * out-of-memory themselves, so their results are not checked for NULL.
+ */
+static DatabasePool *
+create_database_pool(const char *database, List *nodes)
+{
+    DatabasePool *databasePool;
+    int         i;
+    ListCell   *l;
+
+    Assert(nodes && nodes->length > 0);
+
+    /* check if exist */
+    databasePool = find_database_pool(database);
+    if (databasePool)
+        return databasePool;
+
+    /* Allocate memory */
+    databasePool = (DatabasePool *) palloc(sizeof(DatabasePool));
+
+    /* Copy the database name */
+    databasePool->database = pstrdup(database);
+
+    /* Init next reference */
+    databasePool->next = NULL;
+
+    /* Init data node pools: one pointer per data node, all empty */
+    databasePool->nodePools = (DataNodePool **) palloc(NumDataNodes * sizeof(DataNodePool *));
+    for (i = 0; i < NumDataNodes; i++)
+        databasePool->nodePools[i] = NULL;
+
+    foreach(l, nodes)
+    {
+        int         nodeid = lfirst_int(l);
+
+        /* node numbers are 1-based, the array is 0-based */
+        grow_pool(databasePool, nodeid - 1);
+    }
+
+    /* Insert into the list */
+    insert_database_pool(databasePool);
+
+    return databasePool;
+}
+
+
+/*
+ * Destroy the pool and free memory
+ *
+ * Unlinks the pool of the named database from the list, destroys its
+ * node pools and frees the pool.  Returns 1 if a pool was found and
+ * destroyed, 0 if there is no pool for that database.
+ */
+static int
+destroy_database_pool(const char *database)
+{
+    DatabasePool *victim;
+    int         node;
+
+    /* Delete from the list */
+    victim = remove_database_pool(database);
+    if (victim == NULL)
+        return 0;
+
+    if (victim->nodePools != NULL)
+    {
+        for (node = 0; node < NumDataNodes; node++)
+        {
+            if (victim->nodePools[node] != NULL)
+                destroy_node_pool(victim->nodePools[node]);
+        }
+        pfree(victim->nodePools);
+    }
+
+    /* free allocated memory */
+    pfree(victim->database);
+    pfree(victim);
+
+    return 1;
+}
+
+
+/*
+ * Insert new database pool to the list
+ *
+ * The pool becomes the new head; its next pointer refers to the previous
+ * head, which is NULL when the list was empty.
+ */
+static void
+insert_database_pool(DatabasePool * databasePool)
+{
+    Assert(databasePool);
+
+    /* Chain the current list (possibly empty) behind the new entry */
+    databasePool->next = databasePools;
+
+    /* Update head pointer */
+    databasePools = databasePool;
+}
+
+
+/*
+ * Find pool for specified database in the list
+ *
+ * Returns the first pool whose database name equals the given one, or
+ * NULL if there is none.
+ */
+static DatabasePool *
+find_database_pool(const char *database)
+{
+    DatabasePool *cur;
+
+    /* Scan the list */
+    for (cur = databasePools; cur != NULL; cur = cur->next)
+    {
+        if (strcmp(database, cur->database) == 0)
+            return cur;
+    }
+
+    return NULL;
+}
+
+
+/*
+ * Remove pool for specified database from the list
+ *
+ * Unlinks the first pool whose database name matches and returns it with
+ * its next pointer cleared; returns NULL if no pool matches.  The pool
+ * itself is not freed.
+ */
+static DatabasePool *
+remove_database_pool(const char *database)
+{
+    DatabasePool **link = &databasePools;
+
+    /* Walk the links so head and interior entries are handled alike */
+    while (*link != NULL)
+    {
+        DatabasePool *cur = *link;
+
+        if (strcmp(database, cur->database) == 0)
+        {
+            /* Remove entry from chain or update head */
+            *link = cur->next;
+            cur->next = NULL;
+            return cur;
+        }
+        link = &cur->next;
+    }
+
+    return NULL;
+}
+
+/*
+ * Acquire connection
+ *
+ * dbPool  database pool to take the connection from
+ * node    1-based data node number (the node pool is nodePools[node - 1])
+ *
+ * Grows the node pool when it does not exist or has no free slot, then
+ * takes free slots until one passes the pqReadReady() check.  A slot that
+ * fails the check or has unexpected data pending is destroyed and the
+ * pool is asked to grow again.
+ *
+ * Returns the acquired slot, or NULL (after logging) when no usable
+ * connection is available.
+ */
+static DataNodePoolSlot *
+acquire_connection(DatabasePool * dbPool, int node)
+{
+    DataNodePool *nodePool;
+    DataNodePoolSlot *slot;
+
+    Assert(dbPool);
+    /* node is 1-based */
+    Assert(0 < node && node <= NumDataNodes);
+
+    /* Find referenced node pool */
+    nodePool = dbPool->nodePools[node - 1];
+    if (nodePool == NULL || nodePool->freeSize == 0)
+    {
+        grow_pool(dbPool, node - 1);
+        nodePool = dbPool->nodePools[node - 1];
+    }
+
+    /* Check available connections */
+    while (nodePool && nodePool->freeSize > 0)
+    {
+        int         poll_result;
+
+        slot = nodePool->slot[--(nodePool->freeSize)];
+
+        /* Make sure connection is ok; retry if the check was interrupted */
+        do
+        {
+            poll_result = pqReadReady(slot->conn);
+        } while (poll_result < 0 && (errno == EAGAIN || errno == EINTR));
+
+        if (poll_result == 0)
+        {
+            /* ok, no data */
+            return slot;
+        }
+
+        if (poll_result < 0)
+            elog(WARNING, "Error in checking connection, errno = %d", errno);
+        else
+            elog(WARNING, "Unexpected data on connection, cleaning.");
+
+        /* The slot is freed here and must never reach the caller */
+        destroy_slot(slot);
+        /* Decrement current max pool size */
+        (nodePool->size)--;
+        /* Ensure we are not below minimum size */
+        grow_pool(dbPool, node - 1);
+    }
+
+    /* report problem */
+    ereport(LOG,
+            (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+             errmsg("connection pool is empty")));
+    return NULL;
+}
+
+
+/*
+ * Release connection from specified pool and slot
+ *
+ * dbPool  database pool the slot belongs to
+ * slot    connection slot being given back
+ * index   0-based data node index of the slot
+ * clean   true: put the slot back on the free list;
+ *         false: close the connection and let the pool regrow
+ */
+static void
+release_connection(DatabasePool * dbPool, DataNodePoolSlot * slot, int index, bool clean)
+{
+    DataNodePool *pool;
+
+    Assert(dbPool);
+    Assert(slot);
+    Assert(0 <= index && index < NumDataNodes);
+
+    /* Find referenced node pool */
+    pool = dbPool->nodePools[index];
+    if (!pool)
+    {
+        /* report problem */
+        ereport(ERROR,
+                (errcode(ERRCODE_INTERNAL_ERROR),
+                 errmsg("database does not use node %d", (index + 1))));
+        return;
+    }
+
+    if (!clean)
+    {
+        /* Discard: close the connection and account for the lost slot */
+        elog(DEBUG1, "Cleaning up connection from pool %s, closing", pool->connstr);
+        destroy_slot(slot);
+        pool->size--;
+        /* Ensure we are not below minimum size */
+        grow_pool(dbPool, index);
+        return;
+    }
+
+    /* Return: append the slot to the free part of the array */
+    pool->slot[pool->freeSize] = slot;
+    pool->freeSize++;
+}
+
+
+/*
+ * Increase database pool size
+ *
+ * dbPool  database pool to grow
+ * index   0-based data node index
+ *
+ * Creates the node pool on first use, then opens connections while the
+ * pool is below MinPoolSize or has no free slot.  The pool never exceeds
+ * MaxPoolSize, which is the capacity of the slot array.  Growth stops
+ * (and is logged) at the first connection that fails.
+ *
+ * palloc reports out-of-memory itself, so its result is not checked.
+ */
+static void
+grow_pool(DatabasePool * dbPool, int index)
+{
+    DataNodePool *nodePool;
+
+    Assert(dbPool);
+    Assert(0 <= index && index < NumDataNodes);
+
+    /* Find referenced node pool */
+    nodePool = dbPool->nodePools[index];
+    if (!nodePool)
+    {
+        /* Allocate new DBNode Pool */
+        nodePool = (DataNodePool *) palloc(sizeof(DataNodePool));
+
+        /* initialize it */
+        nodePool->connstr = DataNodeConnStr(
+                                            connectionInfos[index].host,
+                                            connectionInfos[index].port,
+                                            dbPool->database,
+                                            connectionInfos[index].uname,
+                                            connectionInfos[index].password);
+
+        if (!nodePool->connstr)
+        {
+            pfree(nodePool);
+            ereport(ERROR,
+                    (errcode(ERRCODE_OUT_OF_MEMORY),
+                     errmsg("out of memory")));
+        }
+
+        nodePool->slot = (DataNodePoolSlot **) palloc(MaxPoolSize * sizeof(DataNodePoolSlot *));
+        memset(nodePool->slot, 0, MaxPoolSize * sizeof(DataNodePoolSlot *));
+        nodePool->freeSize = 0;
+        nodePool->size = 0;
+
+        /* and insert into the array */
+        dbPool->nodePools[index] = nodePool;
+    }
+
+    /* MaxPoolSize is a hard limit even if MinPoolSize is set above it */
+    while (nodePool->size < MaxPoolSize &&
+           (nodePool->size < MinPoolSize || nodePool->freeSize == 0))
+    {
+        DataNodePoolSlot *slot;
+
+        /* Allocate new slot */
+        slot = (DataNodePoolSlot *) palloc(sizeof(DataNodePoolSlot));
+
+        /* Establish connection */
+        slot->conn = DataNodeConnect(nodePool->connstr);
+        if (!DataNodeConnected(slot->conn))
+        {
+            destroy_slot(slot);
+            ereport(LOG,
+                    (errcode(ERRCODE_CONNECTION_FAILURE),
+                     errmsg("failed to connect to data node")));
+            break;
+        }
+
+        /* Insert at the end of the pool */
+        nodePool->slot[(nodePool->freeSize)++] = slot;
+
+        /* Increase count of pool size */
+        (nodePool->size)++;
+        elog(DEBUG1, "Pooler: increased pool size to %d for pool %s",
+             nodePool->size,
+             nodePool->connstr);
+    }
+}
+
+
+/*
+ * Destroy pool slot
+ *
+ * Closes the slot's data node connection, then frees the slot.
+ */
+static void
+destroy_slot(DataNodePoolSlot * slot)
+{
+    /* The connection is closed first; it is reached through the slot */
+    DataNodeClose(slot->conn);
+
+    pfree(slot);
+}
+
+
+/*
+ * Destroy node pool
+ *
+ * Closes the free connections of the pool and frees the connection
+ * string, the slot array and the pool structure itself.  The caller must
+ * not use node_pool afterwards.
+ *
+ * At this point all agents using connections from this pool should
+ * already be closed.  If they are not, the connections assigned to them
+ * remain open and keep consuming data node resources.  This is believed
+ * not to happen because pools are only destroyed on coordinator
+ * shutdown, but be careful when changing things.
+ */
+static void
+destroy_node_pool(DataNodePool * node_pool)
+{
+    int         i;
+
+    /* size counts all connections, freeSize only those not handed out */
+    elog(DEBUG1, "About to destroy node pool %s, current size is %d, %d connections are in use",
+         node_pool->connstr ? node_pool->connstr : "(null)",
+         node_pool->size,
+         node_pool->size - node_pool->freeSize);
+
+    if (node_pool->connstr)
+        pfree(node_pool->connstr);
+
+    if (node_pool->slot)
+    {
+        for (i = 0; i < node_pool->freeSize; i++)
+            destroy_slot(node_pool->slot[i]);
+        pfree(node_pool->slot);
+    }
+
+    /* Release the pool structure itself */
+    pfree(node_pool);
+}
+
+
+/*
+ * Main handling loop
+ *
+ * Opens the listening socket and then waits in select() for new
+ * connections and for input from the registered agents.  Returns only if
+ * the listening socket cannot be created; otherwise the process exits
+ * from inside the loop, either because the postmaster died or because a
+ * shutdown was requested.
+ */
+static void
+PoolerLoop(void)
+{
+    StringInfoData input_message;
+
+    server_fd = pool_listen(PoolerPort, UnixSocketDir);
+    if (server_fd == -1)
+    {
+        /* Do not give up silently */
+        ereport(LOG,
+                (errcode(ERRCODE_CONNECTION_FAILURE),
+                 errmsg("pool manager failed to listen for connections")));
+        return;
+    }
+    initStringInfo(&input_message);
+    for (;;)
+    {
+        int         nfds;
+        fd_set      rfds;
+        int         retval;
+        int         i;
+
+        /*
+         * Emergency bailout if postmaster has died.  This is to avoid the
+         * necessity for manual cleanup of all postmaster children.
+         */
+        if (!PostmasterIsAlive(true))
+            exit(1);
+
+        /* watch for incoming connections */
+        FD_ZERO(&rfds);
+        FD_SET(server_fd, &rfds);
+
+        nfds = server_fd;
+
+        /* watch for incoming messages */
+        for (i = 0; i < agentCount; i++)
+        {
+            PoolAgent  *agent = poolAgents[i];
+            int         sockfd = Socket(agent->port);
+
+            FD_SET(sockfd, &rfds);
+
+            nfds = Max(nfds, sockfd);
+        }
+
+        /* wait for event */
+        retval = select(nfds + 1, &rfds, NULL, NULL, NULL);
+
+        /* The flag is set by pooler_die(); clean up and leave */
+        if (shutdown_requested)
+        {
+            for (i = agentCount - 1; i >= 0; i--)
+            {
+                PoolAgent  *agent = poolAgents[i];
+
+                agent_destroy(agent);
+            }
+            while (databasePools)
+                if (destroy_database_pool(databasePools->database) == 0)
+                    break;
+            close(server_fd);
+            exit(0);
+        }
+        if (retval > 0)
+        {
+            /*
+             * An agent may be removed from the array while it is being
+             * processed; the last agent is then moved into its place.
+             * Scanning downward means the moved agent has already been
+             * visited.
+             */
+            for (i = agentCount - 1; i >= 0; i--)
+            {
+                PoolAgent  *agent = poolAgents[i];
+                int         sockfd = Socket(agent->port);
+
+                if (FD_ISSET(sockfd, &rfds))
+                    agent_handle_input(agent, &input_message);
+            }
+            if (FD_ISSET(server_fd, &rfds))
+                agent_create();
+        }
+    }
+}
+
+
+/*
+ * Signal handler installed for SIGINT and SIGTERM in PoolManagerInit.
+ * It only sets the shutdown_requested flag; PoolerLoop tests the flag
+ * after select() returns and performs the actual cleanup and exit.
+ */
+static void
+pooler_die(SIGNAL_ARGS)
+{
+ shutdown_requested = true;
+}
+
+
+/*
+ * Signal handler installed for SIGQUIT in PoolManagerInit.
+ * Applies the BlockSig signal mask and exits immediately with status 2;
+ * none of the agent or pool cleanup done in PoolerLoop is run.
+ */
+static void
+pooler_quickdie(SIGNAL_ARGS)
+{
+ PG_SETMASK(&BlockSig);
+ exit(2);
+}
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 3dbf36a6cf..0dd252cb62 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -34,6 +34,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
*
* IDENTIFICATION
@@ -102,6 +103,13 @@
#include "libpq/libpq.h"
#include "libpq/pqsignal.h"
#include "miscadmin.h"
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+/* COORD */
+#include "pgxc/locator.h"
+#include "pgxc/poolmgr.h"
+#include "access/gtm.h"
+#endif
#include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "postmaster/fork_process.h"
@@ -204,6 +212,9 @@ char *bonjour_name;
/* PIDs of special child processes; 0 when not running */
static pid_t StartupPID = 0,
+#ifdef PGXC /* PGXC_COORD */
+ PgPoolerPID = 0,
+#endif /* PGXC_COORD */
BgWriterPID = 0,
WalWriterPID = 0,
AutoVacPID = 0,
@@ -442,6 +453,12 @@ static void ShmemBackendArrayAdd(Backend *bn);
static void ShmemBackendArrayRemove(Backend *bn);
#endif /* EXEC_BACKEND */
+#ifdef PGXC /* PGXC_COORD */
+bool isPGXCCoordinator = false;
+bool isPGXCDataNode = false;
+#define StartPoolManager() StartChildProcess(PoolerProcess)
+#endif
+
#define StartupDataBase() StartChildProcess(StartupProcess)
#define StartBackgroundWriter() StartChildProcess(BgWriterProcess)
#define StartWalWriter() StartChildProcess(WalWriterProcess)
@@ -461,6 +478,9 @@ PostmasterMain(int argc, char *argv[])
int status;
char *userDoption = NULL;
int i;
+#ifdef PGXC /* PGXC_COORD */
+ MemoryContext oldcontext;
+#endif
MyProcPid = PostmasterPid = getpid();
@@ -506,7 +526,11 @@ PostmasterMain(int argc, char *argv[])
* tcop/postgres.c (the option sets should not conflict) and with the
* common help() function in main/main.c.
*/
+#ifdef PGXC
+ while ((opt = getopt(argc, argv, "A:B:Cc:D:d:EeFf:h:ijk:lN:nOo:Pp:r:S:sTt:W:X-:")) != -1)
+#else
while ((opt = getopt(argc, argv, "A:B:c:D:d:EeFf:h:ijk:lN:nOo:Pp:r:S:sTt:W:-:")) != -1)
+#endif
{
switch (opt)
{
@@ -517,6 +541,11 @@ PostmasterMain(int argc, char *argv[])
case 'B':
SetConfigOption("shared_buffers", optarg, PGC_POSTMASTER, PGC_S_ARGV);
break;
+#ifdef PGXC
+ case 'C':
+ isPGXCCoordinator = true;
+ break;
+#endif
case 'D':
userDoption = optarg;
@@ -638,6 +667,11 @@ PostmasterMain(int argc, char *argv[])
SetConfigOption("post_auth_delay", optarg, PGC_POSTMASTER, PGC_S_ARGV);
break;
+#ifdef PGXC
+ case 'X':
+ isPGXCDataNode = true;
+ break;
+#endif
case 'c':
case '-':
{
@@ -673,6 +707,14 @@ PostmasterMain(int argc, char *argv[])
}
}
+#ifdef PGXC
+ if (!IS_PGXC_COORDINATOR && !IS_PGXC_DATANODE)
+ {
+ write_stderr("%s: PG-XC: must start as either a Coordinator (-C) or Data Node (-X)\n",
+ progname);
+ ExitPostmaster(1);
+ }
+#endif
/*
* Postmaster accepts no non-option switch arguments.
*/
@@ -1037,6 +1079,20 @@ PostmasterMain(int argc, char *argv[])
Assert(StartupPID != 0);
pmState = PM_STARTUP;
+#ifdef PGXC /* PGXC_COORD */
+ if (IS_PGXC_COORDINATOR)
+ {
+ oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+
+ /*
+ * Initialize the Data Node connection pool
+ */
+ PgPoolerPID = StartPoolManager();
+
+ MemoryContextSwitchTo(oldcontext);
+ }
+#endif
+
status = ServerLoop();
/*
@@ -1393,6 +1449,11 @@ ServerLoop(void)
if (PgStatPID == 0 && pmState == PM_RUN)
PgStatPID = pgstat_start();
+#ifdef PGXC /* PGXC_COORD */
+ /* If we have lost the pooler, try to start a new one */
+ if (IS_PGXC_COORDINATOR && PgPoolerPID == 0 && pmState == PM_RUN)
+ PgPoolerPID = StartPoolManager();
+#endif
/*
* Touch the socket and lock file every 58 minutes, to ensure that
* they are not removed by overzealous /tmp-cleaning tasks. We assume
@@ -1990,6 +2051,10 @@ SIGHUP_handler(SIGNAL_ARGS)
SignalChildren(SIGHUP);
if (StartupPID != 0)
signal_child(StartupPID, SIGHUP);
+#ifdef PGXC /* PGXC_COORD */
+ if (IS_PGXC_COORDINATOR && PgPoolerPID != 0)
+ signal_child(PgPoolerPID, SIGHUP);
+#endif
if (BgWriterPID != 0)
signal_child(BgWriterPID, SIGHUP);
if (WalWriterPID != 0)
@@ -2062,6 +2127,11 @@ pmdie(SIGNAL_ARGS)
/* and the walwriter too */
if (WalWriterPID != 0)
signal_child(WalWriterPID, SIGTERM);
+#ifdef PGXC /* PGXC_COORD */
+ /* and the pool manager too */
+ if (IS_PGXC_COORDINATOR && PgPoolerPID != 0)
+ signal_child(PgPoolerPID, SIGTERM);
+#endif
pmState = PM_WAIT_BACKUP;
}
@@ -2108,6 +2178,11 @@ pmdie(SIGNAL_ARGS)
/* and the walwriter too */
if (WalWriterPID != 0)
signal_child(WalWriterPID, SIGTERM);
+#ifdef PGXC /* PGXC_COORD */
+ /* and the pool manager too */
+ if (IS_PGXC_COORDINATOR && PgPoolerPID != 0)
+ signal_child(PgPoolerPID, SIGTERM);
+#endif
pmState = PM_WAIT_BACKENDS;
}
@@ -2131,6 +2206,10 @@ pmdie(SIGNAL_ARGS)
SignalChildren(SIGQUIT);
if (StartupPID != 0)
signal_child(StartupPID, SIGQUIT);
+#ifdef PGXC /* PGXC_COORD */
+ if (IS_PGXC_COORDINATOR && PgPoolerPID != 0)
+ signal_child(PgPoolerPID, SIGQUIT);
+#endif
if (BgWriterPID != 0)
signal_child(BgWriterPID, SIGQUIT);
if (WalWriterPID != 0)
@@ -2266,6 +2345,10 @@ reaper(SIGNAL_ARGS)
PgArchPID = pgarch_start();
if (PgStatPID == 0)
PgStatPID = pgstat_start();
+#ifdef PGXC /* PGXC_COORD */
+ if (IS_PGXC_COORDINATOR && PgPoolerPID == 0)
+ PgPoolerPID = StartPoolManager();
+#endif
/* at this point we are really open for business */
ereport(LOG,
@@ -2403,6 +2486,21 @@ reaper(SIGNAL_ARGS)
continue;
}
+#ifdef PGXC /* PGXC_COORD */
+ /*
+ * Was it the pool manager? TODO decide how to handle
+ * Probably we should restart the system
+ */
+ if (IS_PGXC_COORDINATOR && pid == PgPoolerPID)
+ {
+ PgPoolerPID = 0;
+ if (!EXIT_STATUS_0(exitstatus))
+ HandleChildCrash(pid, exitstatus,
+ _("pool manager process"));
+ continue;
+ }
+#endif
+
/*
* Else do standard backend child cleanup.
*/
@@ -2594,6 +2692,23 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
signal_child(AutoVacPID, (SendStop ? SIGSTOP : SIGQUIT));
}
+#ifdef PGXC /* PGXC_COORD */
+ /* Take care of the pool manager too */
+ if (IS_PGXC_COORDINATOR)
+ {
+ if (pid == PgPoolerPID)
+ PgPoolerPID = 0;
+ else if (PgPoolerPID != 0 && !FatalError)
+ {
+ ereport(DEBUG2,
+ (errmsg_internal("sending %s to process %d",
+ (SendStop ? "SIGSTOP" : "SIGQUIT"),
+ (int) PgPoolerPID)));
+ signal_child(PgPoolerPID, (SendStop ? SIGSTOP : SIGQUIT));
+ }
+ }
+#endif
+
/*
* Force a power-cycle of the pgarch process too. (This isn't absolutely
* necessary, but it seems like a good idea for robustness, and it
@@ -2724,6 +2839,9 @@ PostmasterStateMachine(void)
*/
if (CountChildren() == 0 &&
StartupPID == 0 &&
+#ifdef PGXC /* PGXC_COORD */
+ PgPoolerPID == 0 &&
+#endif
(BgWriterPID == 0 || !FatalError) &&
WalWriterPID == 0 &&
AutoVacPID == 0)
@@ -2798,6 +2916,9 @@ PostmasterStateMachine(void)
PgArchPID == 0 && PgStatPID == 0)
{
/* These other guys should be dead already */
+#ifdef PGXC /* PGXC_COORD */
+ Assert(PgPoolerPID == 0);
+#endif
Assert(StartupPID == 0);
Assert(BgWriterPID == 0);
Assert(WalWriterPID == 0);
@@ -2942,6 +3063,9 @@ BackendStartup(Port *port)
{
Backend *bn; /* for backend cleanup */
pid_t pid;
+#ifdef PGXC /* PGXC_COORD */
+ PoolHandle *pool_handle;
+#endif
/*
* Create backend data structure. Better before the fork() so we can
@@ -2977,12 +3101,31 @@ BackendStartup(Port *port)
else
bn->child_slot = 0;
+#ifdef PGXC /* PGXC_COORD */
+ if (IS_PGXC_COORDINATOR)
+ {
+ pool_handle = GetPoolManagerHandle();
+ if (pool_handle == NULL)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_IO_ERROR),
+ errmsg("Can not connect to pool manager")));
+ return STATUS_ERROR;
+ }
+ }
+#endif
+
+
#ifdef EXEC_BACKEND
pid = backend_forkexec(port);
#else /* !EXEC_BACKEND */
pid = fork_process();
if (pid == 0) /* child */
{
+ //// FOR DEBUG
+ printf("The session started: %d\n", getpid());
+ //sleep(60);
+ //// FOR DEBUG
free(bn);
/*
@@ -3005,11 +3148,25 @@ BackendStartup(Port *port)
/* Perform additional initialization and client authentication */
BackendInitialize(port);
+#ifdef PGXC /* PGXC_COORD */
+ if (IS_PGXC_COORDINATOR)
+ {
+ /* User is authenticated and dbname is known at this point */
+ PoolManagerConnect(pool_handle, port->database_name, GetAllNodes());
+ InitGTM();
+ }
+#endif
+
/* And run the backend */
proc_exit(BackendRun(port));
}
#endif /* EXEC_BACKEND */
+#ifdef PGXC /* PGXC_COORD */
+ if (IS_PGXC_COORDINATOR)
+ PoolManagerCloseHandle(pool_handle);
+#endif
+
if (pid < 0)
{
/* in parent, fork failed */
@@ -4236,6 +4393,12 @@ StartChildProcess(AuxProcType type)
errno = save_errno;
switch (type)
{
+#ifdef PGXC /* PGXC_COORD */
+ case PoolerProcess:
+ ereport(LOG,
+ (errmsg("could not fork pool manager process: %m")));
+ break;
+#endif
case StartupProcess:
ereport(LOG,
(errmsg("could not fork startup process: %m")));
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index e71b95c826..8ce8be820e 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -20,6 +20,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
*
* IDENTIFICATION
@@ -38,6 +39,12 @@
#include "miscadmin.h"
#include "storage/procarray.h"
#include "utils/snapmgr.h"
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+#include "access/gtm.h"
+/* PGXC_DATANODE */
+#include "postmaster/autovacuum.h"
+#endif
/* Our shared memory area */
@@ -90,6 +97,27 @@ static void DisplayXidCache(void);
#define xc_slow_answer_inc() ((void) 0)
#endif /* XIDCACHE_DEBUG */
+#ifdef PGXC /* PGXC_DATANODE */
+/*
+ * Origin of the global snapshot cached in gxmin/gxmax/gxcnt/gxip below.
+ * NOTE(review): SNAPSHOT_LOCAL is never assigned in the code visible here.
+ */
+typedef enum
+{
+ SNAPSHOT_UNDEFINED, /* Coordinator has not sent snapshot or not yet connected */
+ SNAPSHOT_LOCAL, /* Coordinator has instructed data node to build up snapshot from the local procarray */
+ SNAPSHOT_COORDINATOR, /* Coordinator has sent snapshot data */
+ SNAPSHOT_DIRECT /* Data Node obtained directly from GTM */
+} SnapshotSource;
+
+/* Store / reset the cached global snapshot (definitions at end of file) */
+void SetGlobalSnapshotData(int xmin, int xmax, int xcnt, int *xip);
+void UnsetGlobalSnapshotData(void);
+/* Role-specific helpers called from GetSnapshotData() */
+static bool GetSnapshotDataDataNode(Snapshot snapshot);
+static bool GetSnapshotDataCoordinator(Snapshot snapshot);
+/* Global snapshot data */
+/* Which source filled the cache; SNAPSHOT_UNDEFINED means nothing cached */
+static SnapshotSource snapshot_source = SNAPSHOT_UNDEFINED;
+/*
+ * Cached xmin/xmax of the global snapshot.
+ * NOTE(review): held as int although initialised from InvalidTransactionId
+ * and tested with TransactionIdIsValid(); XIDs are unsigned elsewhere.
+ */
+static int gxmin = InvalidTransactionId;
+static int gxmax = InvalidTransactionId;
+/* Number of entries in gxip */
+static int gxcnt = 0;
+/* malloc'ed array of gxcnt XIDs (copied into snapshot->xip), or NULL; released with free() */
+static int *gxip = NULL;
+#endif
+
/*
* Report shared-memory space needed by CreateSharedProcArray.
@@ -682,6 +710,46 @@ GetSnapshotData(Snapshot snapshot)
int count = 0;
int subcount = 0;
+
+#ifdef PGXC /* PGXC_DATANODE */
+ /*
+ * The typical case is that the coordinator passes down the snapshot to the
+ * data nodes to use, while it itself obtains them from GTM.
+ * The data nodes may however connect directly to GTM themselves to obtain
+ * XID and snapshot information for autovacuum worker threads.
+ */
+ if (IS_PGXC_DATANODE)
+ {
+ if (GetSnapshotDataDataNode(snapshot))
+ return snapshot;
+ /* else fallthrough */
+ } else if (IS_PGXC_COORDINATOR)
+ {
+ if (GetSnapshotDataCoordinator(snapshot))
+ return snapshot;
+ /* else fallthrough */
+ }
+
+ /* If we have no snapshot, we will use a local one.
+ * If we are in normal mode, we output a warning though.
+ * We currently fallback and use a local one at initdb time,
+ * as well as when a new connection occurs.
+ * IsPostmasterEnvironment - checks for initdb
+ * IsNormalProcessingMode() - checks for new connections
+ */
+ if (IS_PGXC_DATANODE && snapshot_source == SNAPSHOT_UNDEFINED
+ && IsPostmasterEnvironment && IsNormalProcessingMode())
+ {
+ elog(WARNING, "Do not have a GTM snapshot available");
+ }
+#endif
+
+ /*
+ * Fall back to the standard routine: calculate the snapshot from the
+ * local proc array if there is no master connection
+ */
+
+
Assert(snapshot != NULL);
/*
@@ -828,6 +896,9 @@ GetSnapshotData(Snapshot snapshot)
snapshot->curcid = GetCurrentCommandId(false);
+#ifdef PGXC
+ elog(DEBUG1, "Local snapshot is built, xmin: %d, xmax: %d, xcnt: %d, RecentGlobalXmin: %d", xmin, xmax, count, globalxmin);
+#endif
/*
* This is a new snapshot, so set both refcounts are zero, and mark it as
* not copied in persistent memory.
@@ -1400,3 +1471,262 @@ DisplayXidCache(void)
}
#endif /* XIDCACHE_DEBUG */
+
+
+#ifdef PGXC
+/*
+ * SetGlobalSnapshotData
+ *
+ * Cache the snapshot that the coordinator sent down and mark the cache as
+ * coordinator-supplied.
+ *
+ * xmin, xmax - snapshot bounds
+ * xcnt       - number of entries in xip
+ * xip        - malloc'ed XID array (may be NULL); this module keeps the
+ *              pointer and releases it with free() when the cache is
+ *              replaced or reset
+ */
+void
+SetGlobalSnapshotData(int xmin, int xmax, int xcnt, int *xip)
+{
+    /* Release the array of the previous snapshot; free(NULL) is a no-op */
+    free(gxip);
+    gxip = xip;
+
+    gxcnt = xcnt;
+    gxmax = xmax;
+    gxmin = xmin;
+    snapshot_source = SNAPSHOT_COORDINATOR;
+
+    elog (DEBUG1, "global snapshot info: gxmin: %d, gxmax: %d, gxcnt: %d", xmin, xmax, xcnt);
+}
+
+/*
+ * UnsetGlobalSnapshotData
+ *
+ * Forget the cached global snapshot: release the XID array and put every
+ * cache variable back to its initial state (SNAPSHOT_UNDEFINED, invalid
+ * xmin/xmax, zero count).
+ */
+void
+UnsetGlobalSnapshotData(void)
+{
+    /* free(NULL) is a no-op, so no guard is needed */
+    free(gxip);
+    gxip = NULL;
+    gxcnt = 0;
+
+    gxmin = gxmax = InvalidTransactionId;
+    snapshot_source = SNAPSHOT_UNDEFINED;
+
+    elog (DEBUG1, "unset snapshot info");
+}
+
+/*
+ * GetSnapshotDataDataNode
+ *
+ * Fill 'snapshot' on a data node from the global snapshot cached in
+ * gxmin/gxmax/gxcnt/gxip.
+ *
+ * The cache is normally filled by SetGlobalSnapshotData() with what the
+ * coordinator passed down.  For autovacuum workers, or when
+ * GetForceXidFromGTM() is set, it is refreshed here directly from GTM.
+ *
+ * Returns true when 'snapshot' was filled and the caller should return it
+ * immediately; false when no valid global snapshot is cached, in which
+ * case the caller falls back to a local snapshot.
+ *
+ * Reports ERROR if GTM returns no snapshot or memory cannot be allocated.
+ */
+static bool
+GetSnapshotDataDataNode(Snapshot snapshot)
+{
+    Assert(IS_PGXC_DATANODE);
+
+    if (IsAutoVacuumWorkerProcess() || GetForceXidFromGTM())
+    {
+        GTM_Snapshot gtm_snapshot;
+        bool canbe_grouped = (!FirstSnapshotSet) || (!IsXactIsoLevelSerializable);
+
+        elog(DEBUG1, "Getting snapshot for autovacuum. Current XID = %u", GetCurrentTransactionId());
+        gtm_snapshot = GetSnapshotGTM(GetCurrentTransactionId(), canbe_grouped);
+
+        if (!gtm_snapshot)
+            ereport(ERROR,
+                    (errcode(ERRCODE_CONNECTION_FAILURE),
+                     errmsg("GTM error, could not obtain snapshot")));
+        else
+        {
+            int new_cnt = gtm_snapshot->sn_xcnt;
+            int *new_xip = NULL;
+
+            /*
+             * Build the new XID array before touching the cache, so that an
+             * out-of-memory ERROR leaves the previous cache intact instead
+             * of half-updated.
+             */
+            if (new_cnt > 0)
+            {
+                new_xip = malloc(new_cnt * sizeof(TransactionId));
+                if (new_xip == NULL)
+                {
+                    ereport(ERROR,
+                            (errcode(ERRCODE_OUT_OF_MEMORY),
+                             errmsg("out of memory")));
+                }
+                memcpy(new_xip, gtm_snapshot->sn_xip, new_cnt * sizeof(TransactionId));
+            }
+
+            /* Everything is allocated; now replace the cached snapshot */
+            free(gxip);
+            gxip = new_xip;
+            gxcnt = new_cnt;
+            gxmin = gtm_snapshot->sn_xmin;
+            gxmax = gtm_snapshot->sn_xmax;
+            RecentGlobalXmin = gtm_snapshot->sn_recent_global_xmin;
+            snapshot_source = SNAPSHOT_DIRECT;
+
+            elog(DEBUG1, "for autovacuum from GTM: xmin = %d, xmax = %d, xcnt = %d, RecGlobXmin = %u",
+                 gxmin, gxmax, gxcnt, RecentGlobalXmin);
+        }
+    }
+
+    if ((snapshot_source == SNAPSHOT_COORDINATOR || snapshot_source == SNAPSHOT_DIRECT)
+        && TransactionIdIsValid(gxmin))
+    {
+        ProcArrayStruct *arrayP = procArray;
+
+        /*
+         * The xip array is allocated once per (static) SnapshotData and then
+         * reused.  The global snapshot can list more XIDs than the local
+         * maxProcs, so the array must hold at least gxcnt entries; sizing it
+         * for maxProcs only would let the memcpy below overrun it.
+         */
+        if (snapshot->xip == NULL)
+        {
+            int capacity = Max(arrayP->maxProcs, gxcnt);
+
+            /*
+             * First call for this snapshot
+             */
+            snapshot->xip = (TransactionId *)
+                malloc(capacity * sizeof(TransactionId));
+            if (snapshot->xip == NULL)
+                ereport(ERROR,
+                        (errcode(ERRCODE_OUT_OF_MEMORY),
+                         errmsg("out of memory")));
+            snapshot->max_xcnt = capacity;
+
+            Assert(snapshot->subxip == NULL);
+            snapshot->subxip = (TransactionId *)
+                malloc(arrayP->maxProcs * PGPROC_MAX_CACHED_SUBXIDS * sizeof(TransactionId));
+            if (snapshot->subxip == NULL)
+                ereport(ERROR,
+                        (errcode(ERRCODE_OUT_OF_MEMORY),
+                         errmsg("out of memory")));
+        }
+        else if (gxcnt > Max(arrayP->maxProcs, snapshot->max_xcnt))
+        {
+            /*
+             * An existing array holds at least maxProcs entries (max_xcnt
+             * may be unset if another path allocated it).  Grow through a
+             * temporary so the old array is not lost if realloc fails.
+             */
+            TransactionId *grown = (TransactionId *)
+                realloc(snapshot->xip, gxcnt * sizeof(TransactionId));
+
+            if (grown == NULL)
+                ereport(ERROR,
+                        (errcode(ERRCODE_OUT_OF_MEMORY),
+                         errmsg("out of memory")));
+            snapshot->xip = grown;
+            snapshot->max_xcnt = gxcnt;
+        }
+
+        snapshot->xmin = gxmin;
+        snapshot->xmax = gxmax;
+        snapshot->xcnt = gxcnt;
+
+        /* gxip is NULL when gxcnt is 0; do not hand NULL to memcpy */
+        if (gxcnt > 0)
+            memcpy(snapshot->xip, gxip, gxcnt * sizeof(TransactionId));
+        snapshot->curcid = GetCurrentCommandId(false);
+
+        if (!TransactionIdIsValid(MyProc->xmin))
+            MyProc->xmin = TransactionXmin = gxmin;
+
+        /*
+         * RecentXmin is updated from the global snapshot here.
+         *
+         * !!TODO: an earlier note reported issues with updating RecentXmin;
+         * the assignment is nevertheless in effect on data nodes.
+         */
+        RecentXmin = gxmin;
+
+        /* PGXCTODO - set this until we handle subtransactions. */
+        snapshot->subxcnt = 0;
+
+        /*
+         * This is a new snapshot, so set both refcounts to zero, and mark it
+         * as not copied in persistent memory.
+         */
+        snapshot->active_count = 0;
+        snapshot->regd_count = 0;
+        snapshot->copied = false;
+
+        return true;
+    }
+    return false;
+}
+
+/*
+ * GetSnapshotDataCoordinator
+ *
+ * Fill 'snapshot' on a coordinator with a snapshot obtained from GTM for
+ * the current global transaction.  It will later be passed down to the
+ * data nodes.
+ *
+ * Returns true when 'snapshot' was filled and the caller should return it
+ * immediately.  Reports ERROR if GTM returns no snapshot or memory cannot
+ * be allocated, so the trailing "return false" is not expected to run.
+ */
+static bool
+GetSnapshotDataCoordinator(Snapshot snapshot)
+{
+    bool canbe_grouped;
+    GTM_Snapshot gtm_snapshot;
+
+    Assert (IS_PGXC_COORDINATOR);
+
+    canbe_grouped = (!FirstSnapshotSet) || (!IsXactIsoLevelSerializable);
+    gtm_snapshot = GetSnapshotGTM(GetCurrentGlobalTransactionId(), canbe_grouped);
+
+    if (!gtm_snapshot)
+        ereport(ERROR,
+                (errcode(ERRCODE_CONNECTION_FAILURE),
+                 errmsg("GTM error, could not obtain snapshot")));
+    else
+    {
+        ProcArrayStruct *arrayP = procArray;
+
+        snapshot->xmin = gtm_snapshot->sn_xmin;
+        snapshot->xmax = gtm_snapshot->sn_xmax;
+        snapshot->recent_global_xmin = gtm_snapshot->sn_recent_global_xmin;
+        snapshot->xcnt = gtm_snapshot->sn_xcnt;
+        elog(DEBUG1, "from GTM: xmin = %u, xmax = %u, xcnt = %u, RecGlobXmin = %u",
+             snapshot->xmin, snapshot->xmax, snapshot->xcnt, snapshot->recent_global_xmin);
+
+        /*
+         * The xip array is allocated once per (static) SnapshotData and then
+         * reused.  It is sized for the larger of maxProcs and the number of
+         * XIDs GTM reported, and grown later if GTM reports more.
+         */
+        if (snapshot->xip == NULL)
+        {
+            /*
+             * First call for this snapshot
+             */
+            snapshot->xip = (TransactionId *)
+                malloc(Max(arrayP->maxProcs, gtm_snapshot->sn_xcnt) * sizeof(TransactionId));
+            if (snapshot->xip == NULL)
+                ereport(ERROR,
+                        (errcode(ERRCODE_OUT_OF_MEMORY),
+                         errmsg("out of memory")));
+            snapshot->max_xcnt = Max(arrayP->maxProcs, gtm_snapshot->sn_xcnt);
+
+            /*
+             * FIXME
+             *
+             * We really don't support subtransaction in PGXC right now, but
+             * when we would, we should fix the allocation below
+             */
+            Assert(snapshot->subxip == NULL);
+            snapshot->subxip = (TransactionId *)
+                malloc(arrayP->maxProcs * PGPROC_MAX_CACHED_SUBXIDS * sizeof(TransactionId));
+
+            if (snapshot->subxip == NULL)
+                ereport(ERROR,
+                        (errcode(ERRCODE_OUT_OF_MEMORY),
+                         errmsg("out of memory")));
+        }
+        else if (snapshot->max_xcnt < gtm_snapshot->sn_xcnt)
+        {
+            /*
+             * Grow through a temporary: assigning realloc's result straight
+             * to snapshot->xip would drop the old array on failure and leave
+             * xip NULL while subxip is still set.  Never go below maxProcs,
+             * so the array cannot shrink if max_xcnt was not yet recorded.
+             */
+            TransactionId *grown = (TransactionId *)
+                realloc(snapshot->xip,
+                        Max(arrayP->maxProcs, gtm_snapshot->sn_xcnt) * sizeof(TransactionId));
+
+            if (grown == NULL)
+                ereport(ERROR,
+                        (errcode(ERRCODE_OUT_OF_MEMORY),
+                         errmsg("out of memory")));
+            snapshot->xip = grown;
+            snapshot->max_xcnt = Max(arrayP->maxProcs, gtm_snapshot->sn_xcnt);
+        }
+
+        /* Nothing to copy for an empty list; avoids memcpy from a NULL source */
+        if (gtm_snapshot->sn_xcnt > 0)
+            memcpy(snapshot->xip, gtm_snapshot->sn_xip, gtm_snapshot->sn_xcnt * sizeof(TransactionId));
+        snapshot->curcid = GetCurrentCommandId(false);
+
+        if (!TransactionIdIsValid(MyProc->xmin))
+            MyProc->xmin = TransactionXmin = snapshot->xmin;
+
+        /*
+         * We should update RecentXmin here. But we have recently seen some
+         * issues with that - so skipping it for the time being.
+         *
+         * !!TODO
+         */
+
+        /* PGXCTODO - set this until we handle subtransactions. */
+        snapshot->subxcnt = 0;
+
+        /*
+         * This is a new snapshot, so set both refcounts to zero, and mark it
+         * as not copied in persistent memory.
+         */
+        snapshot->active_count = 0;
+        snapshot->regd_count = 0;
+        snapshot->copied = false;
+        return true;
+    }
+    return false;
+}
+#endif /* PGXC */
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 43e912f5cf..34b63041d1 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -5,6 +5,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
*
* IDENTIFICATION
@@ -71,7 +72,16 @@
#include "utils/snapmgr.h"
#include "mb/pg_wchar.h"
-
+#ifdef PGXC
+#include "storage/procarray.h"
+#include "pgxc/pgxc.h"
+#include "access/gtm.h"
+/* PGXC_COORD */
+#include "pgxc/planner.h"
+#include "pgxc/datanode.h"
+/* PGXC_DATANODE */
+#include "access/transam.h"
+#endif
extern int optind;
extern char *optarg;
@@ -185,6 +195,27 @@ static void SigHupHandler(SIGNAL_ARGS);
static void log_disconnections(int code, Datum arg);
+#ifdef PGXC /* PGXC_DATANODE */
+static void pgxc_transaction_stmt (Node *parsetree);
+static List * pgxc_execute_direct (Node *parsetree, List *querytree_list, CommandDest dest, bool snapshot_set, bool *exec_on_coord);
+
+/* ----------------------------------------------------------------
+ * PG-XC routines
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * DataNodeShutdown
+ *
+ * Exit callback for data node backends (registered with on_proc_exit in
+ * PostgresMain).  Closes the GTM connection, but only when the process is
+ * an autovacuum worker; both arguments are unused.
+ */
+static void
+DataNodeShutdown (int code, Datum arg)
+{
+    /* Anything other than an autovacuum worker has nothing to close here */
+    if (!IsAutoVacuumWorkerProcess())
+        return;
+
+    CloseGTM();
+}
+#endif
+
/* ----------------------------------------------------------------
* routines to obtain user input
* ----------------------------------------------------------------
@@ -398,6 +429,11 @@ SocketBackend(StringInfo inBuf)
(errcode(ERRCODE_PROTOCOL_VIOLATION),
errmsg("invalid frontend message type %d", qtype)));
break;
+#ifdef PGXC /* PGXC_DATANODE */
+ case 'g':
+ case 's':
+ break;
+#endif
default:
@@ -780,7 +816,6 @@ exec_simple_query(const char *query_string)
bool isTopLevel;
char msec_str[32];
-
/*
* Report query to various monitoring facilities.
*/
@@ -863,6 +898,22 @@ exec_simple_query(const char *query_string)
Portal portal;
DestReceiver *receiver;
int16 format;
+#ifdef PGXC
+ Query_Plan *query_plan;
+ Query_Step *query_step;
+ bool exec_on_coord;
+
+
+ /*
+ * By default we do not want data nodes to contact GTM directly,
+ * it should get this information passed down to it.
+ */
+ if (IS_PGXC_DATANODE)
+ SetForceXidFromGTM(false);
+
+ exec_on_coord = true;
+ query_plan = NULL;
+#endif
/*
* Get the command name for use in status display (it also becomes the
@@ -917,15 +968,53 @@ exec_simple_query(const char *query_string)
querytree_list = pg_analyze_and_rewrite(parsetree, query_string,
NULL, 0);
- plantree_list = pg_plan_queries(querytree_list, 0, NULL);
+#ifdef PGXC /* PGXC_COORD */
+ if (IS_PGXC_COORDINATOR)
+ {
+ if (IsA(parsetree, TransactionStmt))
+ pgxc_transaction_stmt(parsetree);
+
+ else if (IsA(parsetree, ExecDirectStmt))
+ querytree_list = pgxc_execute_direct(parsetree, querytree_list, dest, snapshot_set, &exec_on_coord);
+
+ else
+ {
+ query_plan = GetQueryPlan(parsetree, query_string, querytree_list);
+
+ exec_on_coord = query_plan->exec_loc_type & EXEC_ON_COORD;
+ }
+
+ /* First execute on the coordinator, if involved (DDL), then data nodes */
+ }
+
+ if ((IS_PGXC_COORDINATOR && exec_on_coord) || IS_PGXC_DATANODE)
+#endif
+ plantree_list = pg_plan_queries(querytree_list, 0, NULL);
/* Done with the snapshot used for parsing/planning */
+#ifdef PGXC
+ /* In PG-XC, hold on to it a bit longer */
+#else
if (snapshot_set)
PopActiveSnapshot();
+#endif
/* If we got a cancel signal in analysis or planning, quit */
CHECK_FOR_INTERRUPTS();
+#ifdef PGXC
+ /* PGXC_DATANODE */
+ /* Force getting Xid from GTM if not autovacuum, but a vacuum */
+ if (IS_PGXC_DATANODE && IsA(parsetree, VacuumStmt) && IsPostmasterEnvironment)
+ SetForceXidFromGTM(true);
+
+ /* PGXC_COORD */
+ /* Force getting Xid from GTM if not autovacuum, but a vacuum */
+ /* Skip the Portal stuff on coordinator if command only executes on data nodes */
+ if ((IS_PGXC_COORDINATOR && exec_on_coord) || IS_PGXC_DATANODE)
+ {
+#endif
+
/*
* Create unnamed portal to run the query or queries in. If there
* already is one, silently drop it.
@@ -999,6 +1088,33 @@ exec_simple_query(const char *query_string)
PortalDrop(portal, false);
+#ifdef PGXC
+ }
+
+ /* PGXC_COORD */
+ /* If the coordinator ran ok, now run on the data nodes if planned */
+ if (IS_PGXC_COORDINATOR)
+ {
+ if (query_plan && (query_plan->exec_loc_type & EXEC_ON_DATA_NODES))
+ {
+ query_step = linitial(query_plan->query_step_list);
+
+ DataNodeExec(query_step->sql_statement,
+ query_step->nodelist,
+ dest,
+ snapshot_set ? GetActiveSnapshot() : GetTransactionSnapshot(),
+ query_plan->force_autocommit,
+ query_step->simple_aggregates,
+ IsA(parsetree, SelectStmt));
+ }
+
+ FreeQueryPlan(query_plan);
+ }
+
+ if (snapshot_set)
+ PopActiveSnapshot();
+#endif /* PGXC_COORD */
+
if (IsA(parsetree, TransactionStmt))
{
/*
@@ -1029,6 +1145,11 @@ exec_simple_query(const char *query_string)
*/
CommandCounterIncrement();
}
+#ifdef PGXC /* PGXC_COORD */
+ /* In case of PGXC handling client already received a response */
+ if ((IS_PGXC_COORDINATOR && exec_on_coord) || IS_PGXC_DATANODE)
+ {
+#endif
/*
* Tell client that we're done with this query. Note we emit exactly
@@ -1037,6 +1158,9 @@ exec_simple_query(const char *query_string)
* aborted by error will not send an EndCommand report at all.)
*/
EndCommand(completionTag, dest);
+#ifdef PGXC /* PGXC_COORD */
+ }
+#endif
} /* end loop over parsetrees */
/*
@@ -2868,6 +2992,14 @@ PostgresMain(int argc, char *argv[], const char *username)
sigjmp_buf local_sigjmp_buf;
volatile bool send_ready_for_query = true;
+#ifdef PGXC /* PGXC_DATANODE */
+ /* Snapshot info */
+ int xmin;
+ int xmax;
+ int xcnt;
+ int *xip;
+#endif
+
#define PendingConfigOption(name,val) \
(guc_names = lappend(guc_names, pstrdup(name)), \
guc_values = lappend(guc_values, pstrdup(val)))
@@ -2948,7 +3080,11 @@ PostgresMain(int argc, char *argv[], const char *username)
* postmaster/postmaster.c (the option sets should not conflict) and with
* the common help() function in main/main.c.
*/
+#ifdef PGXC
+ while ((flag = getopt(argc, argv, "A:B:Cc:D:d:EeFf:h:ijk:lN:nOo:Pp:r:S:sTt:v:W:Xy:-:")) != -1)
+#else
while ((flag = getopt(argc, argv, "A:B:c:D:d:EeFf:h:ijk:lN:nOo:Pp:r:S:sTt:v:W:y:-:")) != -1)
+#endif
{
switch (flag)
{
@@ -2960,6 +3096,12 @@ PostgresMain(int argc, char *argv[], const char *username)
SetConfigOption("shared_buffers", optarg, ctx, gucsource);
break;
+#ifdef PGXC
+ case 'C':
+ isPGXCCoordinator = true;
+ break;
+#endif
+
case 'D':
if (secure)
userDoption = optarg;
@@ -3082,7 +3224,11 @@ PostgresMain(int argc, char *argv[], const char *username)
SetConfigOption("post_auth_delay", optarg, ctx, gucsource);
break;
-
+#ifdef PGXC
+ case 'X':
+ isPGXCDataNode = true;
+ break;
+#endif
case 'y':
/*
@@ -3140,6 +3286,24 @@ PostgresMain(int argc, char *argv[], const char *username)
}
}
+#ifdef PGXC
+ /*
+ * Make sure we specified the mode if Coordinator or Data Node.
+ * Allow for the exception of initdb by checking config option
+ */
+ if (!IS_PGXC_COORDINATOR && !IS_PGXC_DATANODE && IsUnderPostmaster)
+ {
+ ereport(FATAL,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("PG-XC: must start as either a Coordinator (-C) or Data Node (-X)\n")));
+ }
+ if (!IsPostmasterEnvironment)
+ {
+ /* Treat it as a data node for initdb to work properly */
+ isPGXCDataNode = true;
+ }
+#endif
+
/*
* Process any additional GUC variable settings passed in startup packet.
* These are handled exactly like command-line variables.
@@ -3511,6 +3675,19 @@ PostgresMain(int argc, char *argv[], const char *username)
if (!ignore_till_sync)
send_ready_for_query = true; /* initially, or after error */
+#ifdef PGXC /* PGXC_COORD */
+ if (IS_PGXC_COORDINATOR)
+ {
+ InitMultinodeExecutor();
+ /* If we exit, first try and clean connections and send to pool */
+ on_proc_exit (DataNodeCleanAndRelease, 0);
+ }
+ if (IS_PGXC_DATANODE)
+ {
+ /* If we exit, first try and clean connection to GTM */
+ on_proc_exit (DataNodeShutdown, 0);
+ }
+#endif
/*
* Non-error queries loop here.
*/
@@ -3560,6 +3737,15 @@ PostgresMain(int argc, char *argv[], const char *username)
}
ReadyForQuery(whereToSendOutput);
+#ifdef PGXC
+ /*
+ * Helps us catch any problems where we did not send down a snapshot
+ * when it was expected.
+ */
+ if (IS_PGXC_DATANODE)
+ UnsetGlobalSnapshotData();
+#endif
+
send_ready_for_query = false;
}
@@ -3832,6 +4018,42 @@ PostgresMain(int argc, char *argv[], const char *username)
* is still sending data.
*/
break;
+#ifdef PGXC /* PGXC_DATANODE */
+ case 'g': /* gxid */
+ {
+ /* Set the GXID we were passed down */
+ TransactionId gxid = (TransactionId) pq_getmsgint(&input_message, 4);
+ elog(DEBUG1, "Received new gxid %u", gxid);
+ SetNextTransactionId(gxid);
+ pq_getmsgend(&input_message);
+ }
+ break;
+
+ case 's': /* snapshot */
+ /* Set the snapshot we were passed down */
+ xmin = pq_getmsgint(&input_message, 4);
+ xmax = pq_getmsgint(&input_message, 4);
+ RecentGlobalXmin = pq_getmsgint(&input_message, 4);
+ xcnt = pq_getmsgint(&input_message, 4);
+ if (xcnt > 0)
+ {
+ int i;
+ xip = malloc(xcnt * 4);
+ if (xip == NULL)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+ for (i = 0; i < xcnt; i++)
+ xip[i] = pq_getmsgint(&input_message, 4);
+ }
+ else
+ xip = NULL;
+ pq_getmsgend(&input_message);
+ SetGlobalSnapshotData(xmin, xmax, xcnt, xip);
+ break;
+#endif /* PGXC */
default:
ereport(FATAL,
@@ -4023,3 +4245,117 @@ log_disconnections(int code, Datum arg)
port->user_name, port->database_name, port->remote_host,
port->remote_port[0] ? " port=" : "", port->remote_port)));
}
+
+
+#ifdef PGXC
+/*
+ * pgxc_transaction_stmt
+ *
+ * Coordinator-side handling of transaction control statements: BEGIN,
+ * COMMIT and ROLLBACK are forwarded to the data node layer.  Any other
+ * node type or transaction statement kind is ignored.
+ */
+void
+pgxc_transaction_stmt (Node *parsetree)
+{
+    TransactionStmt *txn;
+
+    Assert(IS_PGXC_COORDINATOR);
+
+    /* Only transaction statements are of interest */
+    if (!IsA(parsetree, TransactionStmt))
+        return;
+
+    txn = (TransactionStmt *) parsetree;
+
+    if (txn->kind == TRANS_STMT_BEGIN)
+    {
+        /*
+         * No BEGIN is sent down at this point; that happens "on demand"
+         * as data nodes are added
+         */
+        DataNodeBegin();
+    }
+    else if (txn->kind == TRANS_STMT_COMMIT)
+        DataNodeCommit(DestNone);
+    else if (txn->kind == TRANS_STMT_ROLLBACK)
+        DataNodeRollback(DestNone);
+
+    /* All other kinds are ignored for the prototype */
+}
+
+
+/*
+ * pgxc_execute_direct
+ *
+ * Handle EXECUTE DIRECT on the coordinator.
+ *
+ * The inner query is first sent to the data nodes listed in the statement
+ * (if any).  When the statement also targets the coordinator and the data
+ * node execution did not return non-zero, the inner query is parsed and
+ * analyzed so the caller can run it locally.
+ *
+ * Returns the query tree list to execute locally: a freshly built one for
+ * the inner query, or the caller's querytree_list unchanged.
+ * *exec_on_coord is set to whether local execution should take place.
+ */
+List *
+pgxc_execute_direct (Node *parsetree, List *querytree_list, CommandDest dest, bool snapshot_set, bool *exec_on_coord)
+{
+    ExecDirectStmt *stmt = (ExecDirectStmt *) parsetree;
+    bool run_locally = stmt->coordinator;
+    List *targets = NIL;
+    ListCell *lc;
+
+    Assert(IS_PGXC_COORDINATOR);
+    Assert(IsA(parsetree, ExecDirectStmt));
+
+    /* Turn the list of node number Values into a plain integer list */
+    foreach (lc, stmt->nodes)
+        targets = lappend_int(targets, intVal(lfirst(lc)));
+
+    /* Run on the data nodes first; a non-zero result cancels the local run */
+    if (targets != NIL &&
+        DataNodeExec(stmt->query,
+                     targets,
+                     dest,
+                     snapshot_set ? GetActiveSnapshot() : GetTransactionSnapshot(),
+                     FALSE,
+                     FALSE,
+                     FALSE) != 0)
+        run_locally = false;
+
+    if (run_locally)
+    {
+        /*
+         * Parse the inner statement.  The wrapper trees need not be
+         * released, since the message context is deleted later, and no
+         * context switch is needed because the current context already is
+         * the MessageContext.  The inner command is neither logged nor
+         * displayed.
+         */
+        List *inner_trees = pg_parse_query(stmt->query);
+
+        /*
+         * Complex commands (those expanding to multiple parse trees) are
+         * not supported within EXEC DIRECT
+         */
+        if (list_length(inner_trees) != 1)
+        {
+            ereport(ERROR,
+                    (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                     errmsg("Can not execute %s with EXECUTE DIRECT",
+                            stmt->query)));
+        }
+
+        /* Take the single parse tree and build a new query tree from it */
+        parsetree = (Node *) linitial(inner_trees);
+        querytree_list = pg_analyze_and_rewrite(parsetree,
+                                                stmt->query, NULL, 0);
+    }
+
+    *exec_on_coord = run_locally;
+
+    return querytree_list;
+}
+#endif /* PGXC */
diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c
index f51f90f86b..28041c6305 100644
--- a/src/backend/tcop/utility.c
+++ b/src/backend/tcop/utility.c
@@ -2069,6 +2069,10 @@ CreateCommandTag(Node *parsetree)
}
}
break;
+
+ case T_ExecDirectStmt:
+ tag = "EXECUTE DIRECT";
+ break;
default:
elog(WARNING, "unrecognized node type: %d",
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index 775865d569..47ee10e682 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -5,6 +5,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
*
* IDENTIFICATION
@@ -33,6 +34,9 @@
#include "access/heapam.h"
#include "access/reloptions.h"
#include "access/sysattr.h"
+#ifdef PGXC
+#include "access/transam.h"
+#endif
#include "access/xact.h"
#include "catalog/catalog.h"
#include "catalog/index.h"
@@ -54,6 +58,9 @@
#include "optimizer/planmain.h"
#include "optimizer/prep.h"
#include "optimizer/var.h"
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+#endif
#include "rewrite/rewriteDefine.h"
#include "storage/fd.h"
#include "storage/lmgr.h"
@@ -856,6 +863,10 @@ RelationBuildDesc(Oid targetRelId, Relation oldrelation)
else
relation->trigdesc = NULL;
+#ifdef PGXC
+ if (IS_PGXC_COORDINATOR && relation->rd_id >= FirstNormalObjectId)
+ RelationBuildLocator(relation);
+#endif
/*
* if it's an index, initialize index-related information
*/
diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c
index 922c4a626f..5b70df1924 100644
--- a/src/backend/utils/cache/syscache.c
+++ b/src/backend/utils/cache/syscache.c
@@ -5,6 +5,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
*
* IDENTIFICATION
@@ -49,6 +50,9 @@
#include "catalog/pg_ts_template.h"
#include "catalog/pg_type.h"
#include "catalog/pg_user_mapping.h"
+#ifdef PGXC
+#include "catalog/pgxc_class.h"
+#endif
#include "utils/rel.h"
#include "utils/syscache.h"
@@ -524,6 +528,20 @@ static const struct cachedesc cacheinfo[] = {
},
64
},
+#ifdef PGXC
+ {PgxcClassRelationId, /* PGXCCLASSRELID */
+ PgxcClassPgxcRelIdIndexId,
+ Anum_pgxc_class_pcrelid,
+ 1,
+ {
+ ObjectIdAttributeNumber,
+ 0,
+ 0,
+ 0
+ },
+ 1024
+ },
+#endif
{ProcedureRelationId, /* PROCNAMEARGSNSP */
ProcedureNameArgsNspIndexId,
0,
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index 7bdfb67204..7063f6f5f6 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -50,7 +50,6 @@ ProcessingMode Mode = InitProcessing;
/* Note: we rely on this to initialize as zeroes */
static char socketLockFile[MAXPGPATH];
-
/* ----------------------------------------------------------------
* ignoring system indexes support stuff
*
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 210bd6ba6a..c9f0a63418 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -7,6 +7,7 @@
*
*
* Copyright (c) 2000-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
* Written by Peter Eisentraut <[email protected]>.
*
* IDENTIFICATION
@@ -27,6 +28,9 @@
#endif
#include "access/gin.h"
+#ifdef PGXC
+#include "access/gtm.h"
+#endif
#include "access/transam.h"
#include "access/twophase.h"
#include "access/xact.h"
@@ -50,6 +54,11 @@
#include "parser/parse_type.h"
#include "parser/scansup.h"
#include "pgstat.h"
+#ifdef PGXC
+#include "pgxc/locator.h"
+#include "pgxc/planner.h"
+#include "pgxc/poolmgr.h"
+#endif
#include "postmaster/autovacuum.h"
#include "postmaster/bgwriter.h"
#include "postmaster/postmaster.h"
@@ -532,6 +541,12 @@ const char *const config_group_names[] =
gettext_noop("Customized Options"),
/* DEVELOPER_OPTIONS */
gettext_noop("Developer Options"),
+#ifdef PGXC
+ /* DATA_NODES */
+ gettext_noop("Data Nodes and Connection Pooling"),
+ /* GTM */
+ gettext_noop("GTM Connection"),
+#endif
/* help_config wants this array to be null-terminated */
NULL
};
@@ -1220,7 +1235,38 @@ static struct config_bool ConfigureNamesBool[] =
&IgnoreSystemIndexes,
false, NULL, NULL
},
-
+#ifdef PGXC
+ {
+ {"persistent_datanode_connections", PGC_BACKEND, DEVELOPER_OPTIONS,
+ gettext_noop("Session never releases acquired connections."),
+ NULL,
+ GUC_NOT_IN_SAMPLE
+ },
+ &PersistentConnections,
+ false, NULL, NULL
+ },
+ {
+ {"strict_statement_checking", PGC_USERSET, DEVELOPER_OPTIONS,
+ gettext_noop("Forbid statements that are not safe for the cluster"),
+ NULL
+ },
+ &StrictStatementChecking,
+ true, NULL, NULL
+ },
+ {
+ /*
+ * This is temporary work-around until we allow for a merge-sort of
+ * ORDER BY.
+ */
+ {"strict_select_checking", PGC_USERSET, DEVELOPER_OPTIONS,
+ gettext_noop("Forbid if SELECT has ORDER BY"),
+ gettext_noop("and is not safe for the cluster"),
+ GUC_NOT_IN_SAMPLE
+ },
+ &StrictSelectChecking,
+ false, NULL, NULL
+ },
+#endif
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL
@@ -1255,7 +1301,7 @@ static struct config_int ConfigureNamesInt[] =
gettext_noop("This applies to table columns that have not had a "
"column-specific target set via ALTER TABLE SET STATISTICS.")
},
- &default_statistics_target,
+ &default_statistics_target,
100, 1, 10000, NULL, NULL
},
{
@@ -1504,7 +1550,11 @@ static struct config_int ConfigureNamesInt[] =
NULL
},
&max_prepared_xacts,
+#ifdef PGXC
+ 10, 0, INT_MAX / 4, NULL, NULL
+#else
0, 0, INT_MAX / 4, NULL, NULL
+#endif
},
#ifdef LOCK_DEBUG
@@ -1951,7 +2001,63 @@ static struct config_int ConfigureNamesInt[] =
&pgstat_track_activity_query_size,
1024, 100, 102400, NULL, NULL
},
+#ifdef PGXC
+ {
+ {"num_data_nodes", PGC_POSTMASTER, DATA_NODES,
+ gettext_noop("Number of data nodes."),
+ NULL
+ },
+ &NumDataNodes,
+ 2, 1, 65535, NULL, NULL
+ },
+ {
+ {"min_pool_size", PGC_POSTMASTER, DATA_NODES,
+ gettext_noop("Initial pool size."),
+ gettext_noop("If number of active connections decreased below this value, "
+ "new connections are established")
+ },
+ &MinPoolSize,
+ 1, 1, 65535, NULL, NULL
+ },
+
+ {
+ {"max_pool_size", PGC_POSTMASTER, DATA_NODES,
+ gettext_noop("Max pool size."),
+ gettext_noop("If number of active connections reaches this value, "
+ "other connection requests will be refused")
+ },
+ &MaxPoolSize,
+ 100, 1, 65535, NULL, NULL
+ },
+
+ {
+ {"pooler_port", PGC_POSTMASTER, DATA_NODES,
+ gettext_noop("Port of the Pool Manager."),
+ NULL
+ },
+ &PoolerPort,
+ 6667, 1, 65535, NULL, NULL
+ },
+
+ {
+ {"gtm_port", PGC_POSTMASTER, GTM,
+ gettext_noop("Port of GTM."),
+ NULL
+ },
+ &GtmPort,
+ 6666, 1, 65535, NULL, NULL
+ },
+
+ {
+ {"gtm_coordinator_id", PGC_POSTMASTER, GTM,
+ gettext_noop("The Coordinator Identifier."),
+ NULL
+ },
+ &GtmCoordinatorId,
+ 1, 1, INT_MAX, NULL, NULL
+ },
+#endif
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL
@@ -2502,6 +2608,65 @@ static struct config_string ConfigureNamesString[] =
"pg_catalog.simple", assignTSCurrentConfig, NULL
},
+#ifdef PGXC
+ {
+ {"preferred_data_nodes", PGC_POSTMASTER, DATA_NODES,
+ gettext_noop("Preferred data nodes."),
+ gettext_noop("A list of data nodes to read from replicated tables")
+ },
+ &PreferredDataNodes,
+ "", NULL, NULL
+ },
+
+ {
+ {"data_node_hosts", PGC_POSTMASTER, DATA_NODES,
+ gettext_noop("Host names or addresses of data nodes."),
+ gettext_noop("Comma separated list or single value, "
+ "if all data nodes on the same host")
+ },
+ &DataNodeHosts,
+ "localhost", NULL, NULL
+ },
+
+ {
+ {"data_node_ports", PGC_POSTMASTER, DATA_NODES,
+ gettext_noop("Port numbers of data nodes."),
+ gettext_noop("Comma separated list or single value, "
+ "if all data nodes listen on the same port")
+ },
+ &DataNodePorts,
+ "15432,25432", NULL, NULL
+ },
+
+ { /* user name(s) for the data nodes: one shared value or a comma separated list */
+ {"data_node_users", PGC_POSTMASTER, DATA_NODES,
+ gettext_noop("User names of data nodes."),
+ gettext_noop("Comma separated list or single value, "
+ "if user names are the same on all data nodes")
+ },
+ &DataNodeUsers,
+ "postgres", NULL, NULL
+ },
+
+ {
+ {"data_node_passwords", PGC_POSTMASTER, DATA_NODES,
+ gettext_noop("Passwords of data nodes."),
+ gettext_noop("Comma separated list or single value, "
+ "if passwords are the same on all data nodes")
+ },
+ &DataNodePwds,
+ "postgres", NULL, NULL
+ },
+
+ {
+ {"gtm_host", PGC_POSTMASTER, GTM,
+ gettext_noop("Host name or address of GTM"),
+ NULL
+ },
+ &GtmHost,
+ "localhost", NULL, NULL
+ },
+#endif
#ifdef USE_SSL
{
{"ssl_ciphers", PGC_POSTMASTER, CONN_AUTH_SECURITY,
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 3f7b43f0cc..e46670cd91 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -106,7 +106,7 @@
#shared_buffers = 32MB # min 128kB
# (change requires restart)
#temp_buffers = 8MB # min 800kB
-#max_prepared_transactions = 0 # zero disables the feature
+#max_prepared_transactions = 10 # zero disables the feature
# (change requires restart)
# Note: Increasing max_prepared_transactions costs ~600 bytes of shared memory
# per transaction slot, plus lock space (see max_locks_per_transaction).
@@ -490,9 +490,61 @@
#transform_null_equals = off
+#------------------------------------------------------------------------------
+# DATA NODES AND CONNECTION POOLING
+#------------------------------------------------------------------------------
+
+#pooler_port = 6667 # Pool Manager TCP port
+ # (change requires restart)
+#num_data_nodes = 2 # Number of Data Nodes
+ # (change requires restart)
+#preferred_data_nodes = '' # List of preferred Data Nodes to read from
+ # replicated tables. If empty use all the data nodes
+ # (change requires restart)
+#min_pool_size = 1 # Initial pool size
+ # (change requires restart)
+#max_pool_size = 100 # Maximum pool size
+ # (change requires restart)
+#persistent_datanode_connections = off # Set persistent connection mode for pooler
+ # if set to on, connections taken for coordinator
+ # are not put back to pool
+#data_node_hosts = 'localhost' # Host names or addresses of data nodes
+ # (change requires restart)
+#data_node_ports = '15432,25432' # Port numbers of data nodes
+ # (change requires restart)
+#data_node_users = 'postgres' # User names of data nodes
+ # (change requires restart)
+#data_node_passwords = 'postgres' # Passwords of data nodes
+ # (change requires restart)
+# Note: each data_node_... value should be either a single value, if the
+# respective parameter is the same on all nodes, or a comma-separated list
+# with at least num_data_nodes entries, where entry N is the value for node N
+# (nodes are numbered from 1 to num_data_nodes). If the list is longer than
+# num_data_nodes, the extra values are ignored.
#------------------------------------------------------------------------------
+# GTM CONNECTION
+#------------------------------------------------------------------------------
+
+#gtm_host = 'localhost' # Host name or address of GTM
+ # (change requires restart)
+#gtm_port = 6666 # Port of GTM
+ # (change requires restart)
+#gtm_coordinator_id = 1 # Coordinator identifier
+ # (change requires restart)
+
+#------------------------------------------------------------------------------
+# OTHER PG-XC OPTIONS
+#------------------------------------------------------------------------------
+#strict_statement_checking = on # Forbid PG-XC-unsafe SQL
+ # Enabling is useful for development
+#strict_select_checking = off # Temporary; be strict about allowing
+ # multi-node ORDER BY
+
+
+#------------------------------------------------------------------------------
# CUSTOMIZED OPTIONS
#------------------------------------------------------------------------------
#custom_variable_classes = '' # list of custom variable class names
+
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 3fc9b3880e..0677b09660 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -40,6 +40,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
* Portions taken from FreeBSD.
*
* $PostgreSQL: pgsql/src/bin/initdb/initdb.c,v 1.172 2009/06/11 14:49:07 momjian Exp $
@@ -62,6 +63,7 @@
#include "getopt_long.h"
#include "miscadmin.h"
+#include "postgres.h"
/*
* these values are passed in by makefile defines
@@ -3179,14 +3181,34 @@ main(int argc, char *argv[])
strcpy(bin_dir, argv[0]);
get_parent_directory(bin_dir);
- printf(_("\nSuccess. You can now start the database server using:\n\n"
- " %s%s%spostgres%s -D %s%s%s\n"
+
+#ifdef PGXC
+ printf(_("\nSuccess.\n You can now start the database server of the Postgres-XC coordinator using:\n\n"
+ " %s%s%spostgres%s -C -D %s%s%s\n"
"or\n"
- " %s%s%spg_ctl%s -D %s%s%s -l logfile start\n\n"),
+ " %s%s%spg_ctl%s start -D %s%s%s -S coordinator -l logfile\n\n"
+ " You can now start the database server of the Postgres-XC datanode using:\n\n"
+ " %s%s%spostgres%s -X -D %s%s%s\n"
+ "or \n"
+ " %s%s%spg_ctl%s start -D %s%s%s -S datanode -l logfile\n\n"),
+ QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
+ QUOTE_PATH, pg_data_native, QUOTE_PATH,
+ QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
+ QUOTE_PATH, pg_data_native, QUOTE_PATH,
QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
QUOTE_PATH, pg_data_native, QUOTE_PATH,
QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
QUOTE_PATH, pg_data_native, QUOTE_PATH);
+#else /* !PGXC: keep the stock PostgreSQL message, there is no datanode here */
+ printf(_("\nSuccess. You can now start the database server using:\n\n"
+ " %s%s%spostgres%s -D %s%s%s\n"
+ "or\n"
+ " %s%s%spg_ctl%s -D %s%s%s -l logfile start\n\n"),
+ QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
+ QUOTE_PATH, pg_data_native, QUOTE_PATH,
+ QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
+ QUOTE_PATH, pg_data_native, QUOTE_PATH);
+#endif /* PGXC */
return 0;
}
diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c
index 40ede2c1a8..3e06bd4132 100644
--- a/src/bin/pg_ctl/pg_ctl.c
+++ b/src/bin/pg_ctl/pg_ctl.c
@@ -3,6 +3,7 @@
* pg_ctl --- start/stops/restarts the PostgreSQL server
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* $PostgreSQL: pgsql/src/bin/pg_ctl/pg_ctl.c,v 1.111 2009/06/11 14:49:07 momjian Exp $
*
@@ -58,8 +59,8 @@ typedef enum
{
NO_COMMAND = 0,
START_COMMAND,
- STOP_COMMAND,
RESTART_COMMAND,
+ STOP_COMMAND,
RELOAD_COMMAND,
STATUS_COMMAND,
KILL_COMMAND,
@@ -88,6 +89,9 @@ static char *register_username = NULL;
static char *register_password = NULL;
static char *argv0 = NULL;
static bool allow_core_files = false;
+#ifdef PGXC
+static char *pgxcCommand = NULL;
+#endif
static void
write_stderr(const char *fmt,...)
@@ -357,12 +361,23 @@ start_postmaster(void)
* everything to a shell to process them.
*/
if (log_file != NULL)
+#ifdef PGXC
+ snprintf(cmd, MAXPGPATH, SYSTEMQUOTE "\"%s\" %s %s%s < \"%s\" >> \"%s\" 2>&1 &" SYSTEMQUOTE,
+ postgres_path, pgxcCommand, pgdata_opt, post_opts,
+ DEVNULL, log_file);
+#else
snprintf(cmd, MAXPGPATH, SYSTEMQUOTE "\"%s\" %s%s < \"%s\" >> \"%s\" 2>&1 &" SYSTEMQUOTE,
postgres_path, pgdata_opt, post_opts,
DEVNULL, log_file);
+#endif
else
+#ifdef PGXC
+ snprintf(cmd, MAXPGPATH, SYSTEMQUOTE "\"%s\" %s %s%s < \"%s\" 2>&1 &" SYSTEMQUOTE,
+ postgres_path, pgxcCommand, pgdata_opt, post_opts, DEVNULL);
+#else
snprintf(cmd, MAXPGPATH, SYSTEMQUOTE "\"%s\" %s%s < \"%s\" 2>&1 &" SYSTEMQUOTE,
postgres_path, pgdata_opt, post_opts, DEVNULL);
+#endif
return system(cmd);
#else /* WIN32 */
@@ -1520,16 +1535,22 @@ do_help(void)
printf(_("%s is a utility to start, stop, restart, reload configuration files,\n"
"report the status of a PostgreSQL server, or signal a PostgreSQL process.\n\n"), progname);
printf(_("Usage:\n"));
+#ifdef PGXC
+ printf(_(" %s start [-w] [-t SECS] [-S NODE-TYPE] [-D DATADIR] [-s] [-l FILENAME] [-o \"OPTIONS\"]\n"), progname);
+ printf(_(" %s restart [-w] [-t SECS] [-S NODE-TYPE] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n"
+ " [-o \"OPTIONS\"]\n"), progname);
+#else
printf(_(" %s start [-w] [-t SECS] [-D DATADIR] [-s] [-l FILENAME] [-o \"OPTIONS\"]\n"), progname);
- printf(_(" %s stop [-W] [-t SECS] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n"), progname);
printf(_(" %s restart [-w] [-t SECS] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n"
- " [-o \"OPTIONS\"]\n"), progname);
+ " [-o \"OPTIONS\"]\n"), progname);
+#endif
+ printf(_(" %s stop [-W] [-t SECS] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n"), progname);
printf(_(" %s reload [-D DATADIR] [-s]\n"), progname);
printf(_(" %s status [-D DATADIR]\n"), progname);
printf(_(" %s kill SIGNALNAME PID\n"), progname);
#if defined(WIN32) || defined(__CYGWIN__)
printf(_(" %s register [-N SERVICENAME] [-U USERNAME] [-P PASSWORD] [-D DATADIR]\n"
- " [-w] [-t SECS] [-o \"OPTIONS\"]\n"), progname);
+ " [-w] [-t SECS] [-o \"OPTIONS\"]\n"), progname);
printf(_(" %s unregister [-N SERVICENAME]\n"), progname);
#endif
@@ -1537,6 +1558,9 @@ do_help(void)
printf(_(" -D, --pgdata DATADIR location of the database storage area\n"));
printf(_(" -s, --silent only print errors, no informational messages\n"));
printf(_(" -t SECS seconds to wait when using -w option\n"));
+#ifdef PGXC
+ printf(_(" -S NODE-TYPE can be \"coordinator\" or \"datanode\" (Postgres-XC)\n"));
+#endif
printf(_(" -w wait until operation completes\n"));
printf(_(" -W do not wait until operation completes\n"));
printf(_(" --help show this help, then exit\n"));
@@ -1715,7 +1739,11 @@ main(int argc, char **argv)
/* process command-line options */
while (optind < argc)
{
+#ifdef PGXC
+ while ((c = getopt_long(argc, argv, "cD:l:m:N:o:p:P:S:st:U:wW", long_options, &option_index)) != -1)
+#else
while ((c = getopt_long(argc, argv, "cD:l:m:N:o:p:P:st:U:wW", long_options, &option_index)) != -1)
+#endif
{
switch (c)
{
@@ -1759,6 +1787,19 @@ main(int argc, char **argv)
case 'P':
register_password = xstrdup(optarg);
break;
+#ifdef PGXC
+ case 'S':
+ /*
+ * Translate the node type into the matching postgres switch.
+ * An unrecognized value leaves pgxcCommand NULL, which the
+ * start/restart check after option parsing reports as an error.
+ */
+ if (strcmp(optarg, "coordinator") == 0)
+ pgxcCommand = xstrdup("-C");
+ else if (strcmp(optarg, "datanode") == 0)
+ pgxcCommand = xstrdup("-X");
+ break;
+#endif
case 's':
silent_mode = true;
break;
@@ -1808,13 +1843,12 @@ main(int argc, char **argv)
do_advice();
exit(1);
}
-
if (strcmp(argv[optind], "start") == 0)
ctl_command = START_COMMAND;
- else if (strcmp(argv[optind], "stop") == 0)
- ctl_command = STOP_COMMAND;
else if (strcmp(argv[optind], "restart") == 0)
ctl_command = RESTART_COMMAND;
+ else if (strcmp(argv[optind], "stop") == 0)
+ ctl_command = STOP_COMMAND;
else if (strcmp(argv[optind], "reload") == 0)
ctl_command = RELOAD_COMMAND;
else if (strcmp(argv[optind], "status") == 0)
@@ -1856,6 +1890,18 @@ main(int argc, char **argv)
exit(1);
}
+#ifdef PGXC
+ /* stop command does not need to have coordinator or datanode options */
+ if ((ctl_command == START_COMMAND || ctl_command == RESTART_COMMAND)
+ && !pgxcCommand)
+ {
+ write_stderr(_("%s: coordinator or datanode option not specified (-S)\n"),
+ progname);
+ do_advice();
+ exit(1);
+ }
+#endif
+
/* Note we put any -D switch into the env var above */
pg_data = getenv("PGDATA");
if (pg_data)
@@ -1912,12 +1958,12 @@ main(int argc, char **argv)
case START_COMMAND:
do_start();
break;
- case STOP_COMMAND:
- do_stop();
- break;
case RESTART_COMMAND:
do_restart();
break;
+ case STOP_COMMAND:
+ do_stop();
+ break;
case RELOAD_COMMAND:
do_reload();
break;
diff --git a/src/gtm/Makefile b/src/gtm/Makefile
new file mode 100644
index 0000000000..51c55e0dd5
--- /dev/null
+++ b/src/gtm/Makefile
@@ -0,0 +1,43 @@
+#-------------------------------------------------------------------------
+#
+# Makefile for src/gtm
+# GTM and GTM proxy
+#
+# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+#-------------------------------------------------------------------------
+
+PGFILEDESC = "gtm - Global Transaction Manager for Postgres-XC"
+subdir = src/gtm
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+
+WANTED_DIRS=common path libpq client main proxy gtm_ctl
+
+all:
+ @for dir in $(WANTED_DIRS); do \
+ $(MAKE) -C $$dir $@ || exit; \
+ done
+
+clobber:
+ @for dir in $(WANTED_DIRS); do \
+ $(MAKE) -C $$dir $@ || exit; \
+ done
+
+clean:
+ @for dir in $(WANTED_DIRS); do \
+ $(MAKE) -C $$dir $@ || exit; \
+ done
+
+distclean: clean
+
+maintainer-clean: distclean
+
+install: all
+ $(INSTALL_PROGRAM) ./main/gtm$(X) '$(DESTDIR)$(bindir)/gtm$(X)'
+ $(INSTALL_PROGRAM) ./gtm_ctl/gtm_ctl$(X) '$(DESTDIR)$(bindir)/gtm_ctl$(X)'
+ $(INSTALL_PROGRAM) ./proxy/gtm_proxy$(X) '$(DESTDIR)$(bindir)/gtm_proxy$(X)'
+
+uninstall:
+ rm -f $(DESTDIR)$(bindir)/gtm$(X)
+ rm -f $(DESTDIR)$(bindir)/gtm_ctl$(X)
+ rm -f $(DESTDIR)$(bindir)/gtm_proxy$(X)
diff --git a/src/gtm/Makefile.global b/src/gtm/Makefile.global
new file mode 100644
index 0000000000..f130bdbd7f
--- /dev/null
+++ b/src/gtm/Makefile.global
@@ -0,0 +1,116 @@
+
+##########################################################################
+#
+# Meta configuration
+# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+
+.PHONY: all install install-strip installdirs uninstall clean distclean check installcheck
+.SILENT: installdirs
+
+# make `all' the default target
+all:
+
+# Delete target files if the command fails after it has
+# started to update the file.
+.DELETE_ON_ERROR:
+
+# PostgreSQL version number
+VERSION = 1.0Beta
+MAJORVERSION = 1.0
+
+top_srcdir=$(top_build_dir)
+
+enable_shared = yes
+
+##########################################################################
+#
+# Programs and flags
+
+# Compilers
+
+CPP = gcc -E
+CPPFLAGS = -D_GNU_SOURCE
+
+override CPPFLAGS := -I$(top_srcdir)/include $(CPPFLAGS)
+
+CC = gcc
+GCC = yes
+SUN_STUDIO_CC = no
+CFLAGS = $(DEBUGFLAGS) -Wall -Wmissing-prototypes -Wpointer-arith -Wdeclaration-after-statement -Wendif-labels -fno-strict-aliasing -fwrapv
+
+# Kind-of compilers
+
+BISON = bison
+BISONFLAGS = $(YFLAGS)
+FLEX = /usr/bin/flex
+FLEXFLAGS = $(LFLAGS)
+DTRACE =
+DTRACEFLAGS =
+ZIC =
+
+# Linking
+
+AR = ar
+DLLTOOL =
+DLLWRAP =
+LIBS = -lz -lreadline -lcrypt -ldl -lm -lpthread
+LDAP_LIBS_FE =
+LDAP_LIBS_BE =
+OSSP_UUID_LIBS =
+LD = /usr/bin/ld
+with_gnu_ld = yes
+ld_R_works =
+LDFLAGS = -Wl,--as-needed
+LDFLAGS_SL =
+LDREL = -r
+LDOUT = -o
+RANLIB = ranlib
+WINDRES =
+X =
+
+# Perl
+
+# quoted for pathname with spaces
+PERL = "/usr/bin/perl"
+perl_archlibexp =
+perl_privlibexp =
+perl_useshrplib =
+perl_embed_ldflags =
+
+# Miscellaneous
+
+AWK = gawk
+LN_S = ln -s
+MSGFMT =
+MSGMERGE =
+PYTHON =
+TAR = /bin/tar
+XGETTEXT =
+
+GZIP = gzip
+BZIP2 = bzip2
+
+PL_TESTDB = pl_regression
+CONTRIB_TESTDB = contrib_regression
+
+
+
+##########################################################################
+#
+# Additional platform-specific settings
+#
+
+# Name of the "template"
+PORTNAME= linux
+
+
+# Set up rpath if enabled. By default it will point to our libdir,
+# but individual Makefiles can force other rpath paths if needed.
+rpathdir = $(libdir)
+
+ifeq ($(enable_rpath), yes)
+LDFLAGS += $(rpath)
+endif
+
+include $(top_build_dir)/gtm/Makefile.port
+
diff --git a/src/gtm/Makefile.port b/src/gtm/Makefile.port
new file mode 100644
index 0000000000..611c8b7766
--- /dev/null
+++ b/src/gtm/Makefile.port
@@ -0,0 +1,16 @@
+# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+
+AROPT = crs
+export_dynamic = -Wl,-E
+rpath = -Wl,-rpath,'$(rpathdir)'
+allow_nonpic_in_shlib = yes
+DLSUFFIX = .so
+
+ifeq "$(findstring sparc,$(host_cpu))" "sparc"
+CFLAGS_SL = -fPIC
+else
+CFLAGS_SL = -fpic
+endif
+
+%.so: %.o
+ $(CC) $(CFLAGS) -shared -o $@ $<
diff --git a/src/gtm/Makefile.shlib b/src/gtm/Makefile.shlib
new file mode 100644
index 0000000000..83aca3896b
--- /dev/null
+++ b/src/gtm/Makefile.shlib
@@ -0,0 +1,556 @@
+#-------------------------------------------------------------------------
+#
+# Makefile.shlib
+# Common rules for building shared libraries
+#
+# Copyright (c) 1998, Regents of the University of California
+# Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+#
+# IDENTIFICATION
+# $PostgreSQL: pgsql/src/Makefile.shlib,v 1.119 2008/12/11 07:34:07 petere Exp $
+#
+#-------------------------------------------------------------------------
+
+# This file should be included by any Postgres module Makefile that
+# wants to build a shared library (if possible for the current
+# platform). A static library is also built from the same object
+# files. Only one library can be built per makefile.
+#
+# Before including this file, the module Makefile must define these
+# variables:
+#
+# NAME Name of library to build (no suffix nor "lib" prefix)
+# OBJS List of object files to include in library
+# SHLIB_LINK If shared library relies on other libraries,
+# additional stuff to put in its link command
+# SHLIB_EXPORTS (optional) Name of file containing list of symbols to
+# export
+#
+# When building a shared library, the following version information
+# must also be set. It should be omitted when building a dynamically
+# loadable module.
+#
+# SO_MAJOR_VERSION Major version number to use for shared library
+# SO_MINOR_VERSION Minor version number to use for shared library
+# (If you want a patchlevel, include it in SO_MINOR_VERSION, e.g., "6.2".)
+#
+# Optional flags when building DLL's (only applicable to win32 and cygwin
+# platforms).
+# DLLTOOL_DEFFLAGS Additional flags when creating the dll .def file
+# DLLTOOL_LIBFLAGS Additional flags when creating the lib<module>.a file
+# DLLWRAP_FLAGS Additional flags to dllwrap
+#
+# The module Makefile must also include
+# $(top_builddir)/src/Makefile.global before including this file.
+# (Makefile.global sets PORTNAME and other needed symbols.)
+#
+# This makefile provides the following (phony) targets:
+#
+# all-lib build the static and shared (if applicable) libraries
+# install-lib install the libraries into $(libdir)
+# installdirs-lib create installation directory $(libdir)
+# uninstall-lib remove the libraries from $(libdir)
+# clean-lib delete the static and shared libraries from the build dir
+# maintainer-clean-lib delete .def files built for win32
+#
+# Since `all-lib' is the first rule in this file you probably want to
+# have the `all' target before including this file. In the most simple
+# case it would look like this:
+#
+# all: all-lib
+#
+# Similarly, the install rule might look like
+#
+# install: install-lib
+#
+# plus any additional things you want to install. Et cetera.
+#
+# Got that? Look at src/interfaces/libpq/Makefile for an example.
+#
+# While the linker allows creation of most shared libraries,
+# -Bsymbolic requires resolution of all symbols, making the
+# compiler a better choice for shared library creation on ELF platforms.
+# With the linker, -Bsymbolic requires the crt1.o startup object file.
+# bjm 2001-02-10
+
+
+COMPILER = $(CC) $(CFLAGS)
+LINK.static = $(AR) $(AROPT)
+
+
+
+# Insert -L from LDFLAGS after any -L already present in SHLIB_LINK
+SHLIB_LINK := $(filter -L%, $(SHLIB_LINK)) $(filter -L%, $(LDFLAGS)) $(filter-out -L%, $(SHLIB_LINK))
+
+# Need a -L-free version of LDFLAGS to use in combination with SHLIB_LINK
+LDFLAGS_NO_L = $(filter-out -L%, $(LDFLAGS))
+
+ifdef SO_MAJOR_VERSION
+# Default library naming convention used by the majority of platforms
+ifeq ($(enable_shared), yes)
+shlib = lib$(NAME)$(DLSUFFIX).$(SO_MAJOR_VERSION).$(SO_MINOR_VERSION)
+shlib_major = lib$(NAME)$(DLSUFFIX).$(SO_MAJOR_VERSION)
+shlib_bare = lib$(NAME)$(DLSUFFIX)
+endif
+# Testing the soname variable is a reliable way to determine whether a
+# linkable library is being built.
+soname = $(shlib_major)
+else
+# Naming convention for dynamically loadable modules
+ifeq ($(enable_shared), yes)
+shlib = $(NAME)$(DLSUFFIX)
+endif
+endif
+stlib = lib$(NAME).a
+
+ifndef soname
+# additional flags for backend modules
+SHLIB_LINK := $(BE_DLLLIBS) $(SHLIB_LINK)
+endif
+
+# For each platform we support shared libraries on, set shlib to the
+# name of the library (if default above is not right), set
+# LINK.shared to the command to link the library,
+# and adjust SHLIB_LINK if necessary.
+
+# Try to keep the sections in some kind of order, folks...
+
+override CFLAGS += $(CFLAGS_SL)
+ifdef SO_MAJOR_VERSION
+# libraries ought to use this to refer to versioned gettext domain names
+override CPPFLAGS += -DSO_MAJOR_VERSION=$(SO_MAJOR_VERSION)
+endif
+
+ifeq ($(PORTNAME), aix)
+ ifdef SO_MAJOR_VERSION
+ shlib = lib$(NAME)$(DLSUFFIX).$(SO_MAJOR_VERSION)
+ endif
+ haslibarule = yes
+ exports_file = lib$(NAME).exp
+endif
+
+ifeq ($(PORTNAME), darwin)
+ ifdef soname
+ # linkable library
+ DLSUFFIX = .dylib
+ ifneq ($(SO_MAJOR_VERSION), 0)
+ version_link = -compatibility_version $(SO_MAJOR_VERSION) -current_version $(SO_MAJOR_VERSION).$(SO_MINOR_VERSION)
+ endif
+ LINK.shared = $(COMPILER) -dynamiclib -install_name $(libdir)/lib$(NAME).$(SO_MAJOR_VERSION)$(DLSUFFIX) $(version_link) $(exported_symbols_list) -multiply_defined suppress
+ shlib = lib$(NAME).$(SO_MAJOR_VERSION).$(SO_MINOR_VERSION)$(DLSUFFIX)
+ shlib_major = lib$(NAME).$(SO_MAJOR_VERSION)$(DLSUFFIX)
+ else
+ # loadable module
+ DLSUFFIX = .so
+ LINK.shared = $(COMPILER) -bundle -multiply_defined suppress
+ endif
+ BUILD.exports = $(AWK) '/^[^\#]/ {printf "_%s\n",$$1}' $< >$@
+ exports_file = $(SHLIB_EXPORTS:%.txt=%.list)
+ ifneq (,$(exports_file))
+ exported_symbols_list = -exported_symbols_list $(exports_file)
+ endif
+endif
+
+ifeq ($(PORTNAME), openbsd)
+ ifdef ELF_SYSTEM
+ LINK.shared = $(COMPILER) -shared
+ ifdef soname
+ LINK.shared += -Wl,-x,-soname,$(soname)
+ endif
+ SHLIB_LINK += -lc
+ else
+ LINK.shared = $(LD) -x -Bshareable -Bforcearchive
+ endif
+endif
+
+ifeq ($(PORTNAME), bsdi)
+ ifeq ($(DLSUFFIX), .so)
+ LINK.shared = $(COMPILER) -shared
+ ifdef soname
+ LINK.shared += -Wl,-x,-soname,$(soname)
+ endif
+ SHLIB_LINK += -lc
+ endif
+ ifeq ($(DLSUFFIX), .o)
+ LINK.shared = shlicc -O $(LDREL)
+ endif
+endif
+
+ifeq ($(PORTNAME), freebsd)
+ ifdef ELF_SYSTEM
+ ifdef SO_MAJOR_VERSION
+ shlib = lib$(NAME)$(DLSUFFIX).$(SO_MAJOR_VERSION)
+ endif
+ LINK.shared = $(COMPILER) -shared
+ ifdef soname
+ LINK.shared += -Wl,-x,-soname,$(soname)
+ endif
+ else
+ ifdef SO_MAJOR_VERSION
+ shlib = lib$(NAME)$(DLSUFFIX).$(SO_MAJOR_VERSION).$(SO_MINOR_VERSION)
+ endif
+ LINK.shared = $(LD) -x -Bshareable -Bforcearchive
+ endif
+endif
+
+ifeq ($(PORTNAME), netbsd)
+ ifdef ELF_SYSTEM
+ LINK.shared = $(COMPILER) -shared
+ ifdef soname
+ LINK.shared += -Wl,-x,-soname,$(soname)
+ endif
+ else
+ LINK.shared = $(LD) -x -Bshareable -Bforcearchive
+ endif
+endif
+
+ifeq ($(PORTNAME), hpux)
+ ifdef SO_MAJOR_VERSION
+ shlib = lib$(NAME)$(DLSUFFIX).$(SO_MAJOR_VERSION)
+ endif
+ ifeq ($(with_gnu_ld), yes)
+ LINK.shared = $(CC) $(LDFLAGS_NO_L) -shared
+ ifdef soname
+ LINK.shared += -Wl,-h -Wl,$(soname)
+ endif
+ else
+ # can't use the CC-syntax rpath pattern here
+ rpath =
+ LINK.shared = $(LD) -b
+ ifdef soname
+ LINK.shared += +h $(soname)
+ endif
+ ifeq ($(enable_rpath), yes)
+ LINK.shared += +b '$(rpathdir)'
+ endif
+ # On HPUX platforms, gcc is usually configured to search for libraries
+ # in /usr/local/lib, but ld won't do so. Add an explicit -L switch so
+ # ld can find the same libraries gcc does. Make sure it goes after any
+ # -L switches provided explicitly.
+ ifeq ($(GCC), yes)
+ SHLIB_LINK := $(filter -L%, $(SHLIB_LINK)) -L/usr/local/lib $(filter-out -L%, $(SHLIB_LINK))
+ endif
+ endif
+ # do this last so above filtering doesn't pull out -L switches in LDFLAGS
+ ifeq ($(GCC), yes)
+ SHLIB_LINK += `$(CC) $(LDFLAGS) -print-libgcc-file-name`
+ endif
+endif
+
+ifeq ($(PORTNAME), irix)
+ ifdef SO_MAJOR_VERSION
+ shlib = lib$(NAME)$(DLSUFFIX).$(SO_MAJOR_VERSION)
+ endif
+ LINK.shared = $(COMPILER) -shared
+ ifdef soname
+ LINK.shared += -Wl,-set_version,sgi$(SO_MAJOR_VERSION).$(SO_MINOR_VERSION)
+ endif
+endif
+
+ifeq ($(PORTNAME), linux)
+ LINK.shared = $(COMPILER) -shared
+ ifdef soname
+ LINK.shared += -Wl,-soname,$(soname)
+ endif
+ BUILD.exports = ( echo '{ global:'; $(AWK) '/^[^\#]/ {printf "%s;\n",$$1}' $<; echo ' local: *; };' ) >$@
+ exports_file = $(SHLIB_EXPORTS:%.txt=%.list)
+ ifneq (,$(exports_file))
+ LINK.shared += -Wl,--version-script=$(exports_file)
+ endif
+endif
+
+ifeq ($(PORTNAME), solaris)
+ ifeq ($(GCC), yes)
+ LINK.shared = $(COMPILER) -shared
+ else
+ LINK.shared = $(COMPILER) -G
+ endif
+ ifdef soname
+ ifeq ($(with_gnu_ld), yes)
+ LINK.shared += -Wl,-soname,$(soname)
+ else
+ LINK.shared += -h $(soname)
+ endif
+ endif
+endif
+
+ifeq ($(PORTNAME), sunos4)
+ LINK.shared = $(LD) -assert pure-text -Bdynamic
+endif
+
+ifeq ($(PORTNAME), osf)
+ LINK.shared = $(LD) -shared -expect_unresolved '*'
+endif
+
+ifeq ($(PORTNAME), sco)
+ ifeq ($(GCC), yes)
+ LINK.shared = $(CC) -shared
+ else
+ LINK.shared = $(CC) -G
+ endif
+ LINK.shared += -Wl,-z,text
+ ifdef soname
+ LINK.shared += -Wl,-h,$(soname)
+ endif
+endif
+
+ifeq ($(PORTNAME), svr4)
+ LINK.shared = $(LD) -G
+endif
+
+ifeq ($(PORTNAME), univel)
+ LINK.shared = $(LD) -G -z text
+endif
+
+ifeq ($(PORTNAME), unixware)
+ ifeq ($(GCC), yes)
+ LINK.shared = $(CC) -shared
+ else
+ LINK.shared = $(CC) -G
+ endif
+ LINK.shared += -Wl,-z,text
+ ifdef soname
+ LINK.shared += -Wl,-h,$(soname)
+ endif
+endif
+
+ifeq ($(PORTNAME), cygwin)
+ ifdef SO_MAJOR_VERSION
+ shlib = cyg$(NAME)$(DLSUFFIX)
+ endif
+ haslibarule = yes
+endif
+
+ifeq ($(PORTNAME), win32)
+ ifdef SO_MAJOR_VERSION
+ shlib = lib$(NAME)$(DLSUFFIX)
+ endif
+ haslibarule = yes
+endif
+
+ifeq ($(enable_rpath), yes)
+SHLIB_LINK += $(rpath)
+endif
+
+
+
+##
+## BUILD
+##
+
+.PHONY: all-lib all-static-lib all-shared-lib
+
+all-lib: all-shared-lib
+ifdef soname
+# no static library when building a dynamically loadable module
+all-lib: all-static-lib
+endif
+
+all-static-lib: $(stlib)
+
+all-shared-lib: $(shlib)
+
+ifndef haslibarule
+$(stlib): $(OBJS)
+ $(LINK.static) $@ $^
+ $(RANLIB) $@
+endif #haslibarule
+
+ifeq ($(enable_shared), yes)
+
+ifeq (,$(filter cygwin win32,$(PORTNAME)))
+ifneq ($(PORTNAME), aix)
+
+# Normal case
+$(shlib): $(OBJS)
+ $(LINK.shared) $(LDFLAGS_SL) $(OBJS) $(SHLIB_LINK) -o $@
+ifdef shlib_major
+# If we're using major and minor versions, then make a symlink to major-version-only.
+ifneq ($(shlib), $(shlib_major))
+ rm -f $(shlib_major)
+ $(LN_S) $(shlib) $(shlib_major)
+endif
+# Make sure we have a link to a name without any version numbers
+ifneq ($(shlib), $(shlib_bare))
+ rm -f $(shlib_bare)
+ $(LN_S) $(shlib) $(shlib_bare)
+endif
+endif # shlib_major
+
+# Where possible, restrict the symbols exported by the library to just the
+# official list, so as to avoid unintentional ABI changes. On recent Darwin
+# this also quiets multiply-defined-symbol warnings in programs that use
+# libpgport along with libpq.
+ifneq (,$(SHLIB_EXPORTS))
+ifdef BUILD.exports
+$(shlib): $(SHLIB_EXPORTS:%.txt=%.list)
+
+$(SHLIB_EXPORTS:%.txt=%.list): %.list: %.txt
+ $(BUILD.exports)
+endif
+endif
+
+else # PORTNAME == aix
+
+# AIX case
+$(shlib) $(stlib): $(OBJS)
+ $(LINK.static) $(stlib) $^
+ $(RANLIB) $(stlib)
+ $(MKLDEXPORT) $(stlib) >$(exports_file)
+ $(COMPILER) $(LDFLAGS_NO_L) $(LDFLAGS_SL) -o $(shlib) $(stlib) -Wl,-bE:$(exports_file) $(SHLIB_LINK)
+ rm -f $(stlib)
+ $(AR) $(AROPT) $(stlib) $(shlib)
+
+endif # PORTNAME == aix
+
+else # PORTNAME == cygwin || PORTNAME == win32
+
+# Cygwin or Win32 case
+
+# If SHLIB_EXPORTS is set, the rules below will build a .def file from
+# that. Else we build a temporary one here.
+ifeq (,$(SHLIB_EXPORTS))
+DLL_DEFFILE = lib$(NAME)dll.def
+exports_file = $(DLL_DEFFILE)
+
+$(exports_file): $(OBJS)
+ $(DLLTOOL) --export-all $(DLLTOOL_DEFFLAGS) --output-def $@ $^
+else
+DLL_DEFFILE = $(srcdir)/lib$(NAME)dll.def
+endif
+
+$(shlib): $(OBJS) $(DLL_DEFFILE)
+ $(DLLWRAP) $(LDFLAGS_SL) -o $@ --dllname $(shlib) $(DLLWRAP_FLAGS) --def $(DLL_DEFFILE) $(OBJS) $(SHLIB_LINK)
+
+$(stlib): $(shlib) $(DLL_DEFFILE)
+ $(DLLTOOL) --dllname $(shlib) $(DLLTOOL_LIBFLAGS) --def $(DLL_DEFFILE) --output-lib $@
+
+endif # PORTNAME == cygwin || PORTNAME == win32
+
+endif # enable_shared
+
+
+# We need several not-quite-identical variants of .DEF files to build
+# DLLs for Windows. These are made from the single source file
+# exports.txt. Since we can't assume that Windows boxes will have
+# sed, the .DEF files are always built and included in distribution
+# tarballs.
+
+ifneq (,$(SHLIB_EXPORTS))
+distprep: $(srcdir)/lib$(NAME)dll.def $(srcdir)/lib$(NAME)ddll.def $(srcdir)/blib$(NAME)dll.def
+
+UC_NAME = $(shell echo $(NAME) | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
+
+$(srcdir)/lib$(NAME)dll.def: $(SHLIB_EXPORTS)
+ echo '; DEF file for MS VC++' >$@
+ echo 'LIBRARY LIB$(UC_NAME)' >>$@
+ echo 'EXPORTS' >>$@
+ sed -e '/^#/d' -e 's/^\(.* \)\([0-9][0-9]*\)/ \1@ \2/' $< >>$@
+
+$(srcdir)/lib$(NAME)ddll.def: $(SHLIB_EXPORTS)
+ echo '; DEF file for MS VC++' >$@
+ echo 'LIBRARY LIB$(UC_NAME)D' >>$@
+ echo 'EXPORTS' >>$@
+ sed -e '/^#/d' -e 's/^\(.* \)\([0-9][0-9]*\)/ \1@ \2/' $< >>$@
+
+$(srcdir)/blib$(NAME)dll.def: $(SHLIB_EXPORTS)
+ echo '; DEF file for Borland C++ Builder' >$@
+ echo 'LIBRARY BLIB$(UC_NAME)' >>$@
+ echo 'EXPORTS' >>$@
+ sed -e '/^#/d' -e 's/^\(.* \)\([0-9][0-9]*\)/ _\1@ \2/' $< >>$@
+ echo >>$@
+ echo '; Aliases for MS compatible names' >> $@
+ sed -e '/^#/d' -e 's/^\(.* \)\([0-9][0-9]*\)/ \1= _\1/' $< | sed 's/ *$$//' >>$@
+endif # SHLIB_EXPORTS
+
+
+##
+## INSTALL
+##
+
+.PHONY: install-lib install-lib-static install-lib-shared installdirs-lib
+install-lib: install-lib-shared
+ifdef soname
+install-lib: install-lib-static
+endif
+
+install-lib-static: $(stlib) installdirs-lib
+ $(INSTALL_STLIB) $< '$(DESTDIR)$(libdir)/$(stlib)'
+ifeq ($(PORTNAME), darwin)
+ cd '$(DESTDIR)$(libdir)' && \
+ ranlib $(stlib)
+endif
+
+ifeq ($(enable_shared), yes)
+install-lib-shared: $(shlib) installdirs-lib
+ifdef soname
+# we don't install $(shlib) on AIX
+# (see http://archives.postgresql.org/message-id/52EF20B2E3209443BC37736D00C3C1380A6E79FE@EXADV1.host.magwien.gv.at)
+ifneq ($(PORTNAME), aix)
+ $(INSTALL_SHLIB) $< '$(DESTDIR)$(libdir)/$(shlib)'
+ifneq ($(PORTNAME), cygwin)
+ifneq ($(PORTNAME), win32)
+ifneq ($(shlib), $(shlib_major))
+ cd '$(DESTDIR)$(libdir)' && \
+ rm -f $(shlib_major) && \
+ $(LN_S) $(shlib) $(shlib_major)
+endif
+ifneq ($(shlib), $(shlib_bare))
+ cd '$(DESTDIR)$(libdir)' && \
+ rm -f $(shlib_bare) && \
+ $(LN_S) $(shlib) $(shlib_bare)
+endif
+endif # not win32
+endif # not cygwin
+endif # not aix
+else # no soname
+ $(INSTALL_SHLIB) $< '$(DESTDIR)$(pkglibdir)/$(shlib)'
+endif
+else # not enable_shared
+ifndef soname
+install-lib-shared:
+ @echo "*****"; \
+ echo "* Module $(NAME) was not installed due to lack of shared library support."; \
+ echo "*****"
+endif
+endif # enable_shared
+
+
+installdirs-lib:
+ifdef soname
+ $(mkinstalldirs) '$(DESTDIR)$(libdir)'
+else
+ $(mkinstalldirs) '$(DESTDIR)$(pkglibdir)'
+endif
+
+
+##
+## UNINSTALL
+##
+
+.PHONY: uninstall-lib
+uninstall-lib:
+ifdef soname
+ rm -f '$(DESTDIR)$(libdir)/$(stlib)'
+ifeq ($(enable_shared), yes)
+ rm -f '$(DESTDIR)$(libdir)/$(shlib_bare)' \
+ '$(DESTDIR)$(libdir)/$(shlib_major)' \
+ '$(DESTDIR)$(libdir)/$(shlib)'
+endif # enable_shared
+else # no soname
+ rm -f '$(DESTDIR)$(pkglibdir)/$(shlib)'
+endif # no soname
+
+
+##
+## CLEAN
+##
+
+.PHONY: clean-lib
+clean-lib:
+ rm -f $(shlib) $(shlib_bare) $(shlib_major) $(stlib) $(exports_file)
+
+ifneq (,$(SHLIB_EXPORTS))
+maintainer-clean-lib:
+ rm -f $(srcdir)/lib$(NAME)dll.def $(srcdir)/lib$(NAME)ddll.def $(srcdir)/blib$(NAME)dll.def
+endif
diff --git a/src/gtm/README b/src/gtm/README
new file mode 100644
index 0000000000..77cff3695b
--- /dev/null
+++ b/src/gtm/README
@@ -0,0 +1,61 @@
+
+Global Transaction Manager (GTM)
+--------------------------------
+
+1. Source code layout:
+----------------------
+
+The server side code is located in the "include", "common" and
+"main" directories. The "include" directory hosts all the header
+files, some of which are also shared by the client.
+
+The "common" directory contains the infrastructure pieces for the
+server such as error reporting, memory management, locking etc.
+Most of the server side logic including message processing,
+transaction management, thread and connection management is hosted
+in the "main" directory.
+
+The client side code is put in the "client" directory including all
+client side infrastructure and test programs.
+
+
+2. Building GTM Server and Clients:
+-----------------------------------
+
+Go to the top level directory (where this README is located) and run
+the make command to build the sources.
+
+$ make
+
+This builds the GTM server in the "main" directory and the client
+libraries in the "client" directory.
+
+Before building, you may want to change the following two defines in main/main.c:
+
+#define GTM_DEFAULT_HOSTNAME "localhost"
+#define GTM_DEFAULT_PORT 6666
+
+
+3. Running the GTM Server:
+---------------------------
+
+You can run the GTM server by running the following command from the
+top level directory.
+
+$ ./main/gtm
+
+The server will start listening on port 6666 for incoming connections.
+
+
+4. Building test clients:
+-------------------------
+
+Go to the "client/test" directory and run make to build the test clients.
+
+$ cd client/test
+$ make
+
+This builds various test clients, statically linked against the libgtmclient.a
+library in the client directory. You may need to adjust the connection string
+so that the clients can connect to the GTM server.
+
diff --git a/src/gtm/client/Makefile b/src/gtm/client/Makefile
new file mode 100644
index 0000000000..216adf2207
--- /dev/null
+++ b/src/gtm/client/Makefile
@@ -0,0 +1,26 @@
+# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+
+# Makefile for the GTM client library (libgtmclient).
+
+top_build_dir=../..
+include $(top_build_dir)/gtm/Makefile.global
+
+
+# Library base name and version numbers.  NOTE(review): presumably consumed by
+# the Makefile.shlib included below to derive the library file names.
+NAME=gtmclient
+SO_MAJOR_VERSION= 1
+SO_MINOR_VERSION= 0
+
+# Object files that make up the library.
+OBJS=fe-misc.o fe-connect.o pqexpbuffer.o ip.o strlcpy.o gtm_client.o fe-protocol.o
+LDFLAGS=-L$(top_build_dir)/common -L$(top_build_dir)/libpq
+
+LIBS=-lpthread
+
+# Default target: all-lib is not defined in this file; it is expected to come
+# from the Makefile.shlib included below.
+all:all-lib
+
+include $(top_build_dir)/Makefile.shlib
+
+# Removes the objects and the library files by hard-coded name; the names
+# below correspond to NAME=gtmclient with version 1.0 and must be kept in
+# sync with the variables above.
+clean:
+ rm -f $(OBJS)
+ rm -f libgtmclient.a libgtmclient.so libgtmclient.so.1 libgtmclient.so.1.0
+
+# No additional files to remove beyond what clean handles.
+distclean: clean
+
+maintainer-clean: distclean
diff --git a/src/gtm/client/fe-connect.c b/src/gtm/client/fe-connect.c
new file mode 100644
index 0000000000..29d8fe4cc5
--- /dev/null
+++ b/src/gtm/client/fe-connect.c
@@ -0,0 +1,1287 @@
+/*-------------------------------------------------------------------------
+ *
+ * fe-connect.c
+ * functions related to setting up a connection to the backend
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL: pgsql/src/interfaces/libpq/fe-connect.c,v 1.371 2008/12/15 10:28:21 mha Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "gtm/gtm_c.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <ctype.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "gtm/libpq-fe.h"
+#include "gtm/libpq-int.h"
+
+#include <sys/socket.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+
+#include "gtm/gtm_ip.h"
+#include "gtm/gtm_msg.h"
+
+/* fall back options if they are not specified by arguments or defined
+ by environment variables */
+#define DefaultHost "localhost"
+
+/* ----------
+ * Definition of the conninfo parameters and their fallback resources.
+ *
+ * GTMPQconninfoOptions[] is a constant static array that we use to initialize
+ * a dynamically allocated working copy. All the "val" fields in
+ * GTMPQconninfoOptions[] *must* be NULL. In a working copy, non-null "val"
+ * fields point to malloc'd strings that should be freed when the working
+ * array is freed (see GTMPQconninfoFree).
+ * ----------
+ */
+static const GTMPQconninfoOption GTMPQconninfoOptions[] = {
+ {"connect_timeout", NULL},	/* copied to conn->connect_timeout by connectOptions1 */
+ {"host", NULL},	/* copied to conn->pghost by connectOptions1 */
+ {"hostaddr", NULL},	/* copied to conn->pghostaddr by connectOptions1 */
+ {"port", NULL},	/* copied to conn->pgport by connectOptions1 */
+ {"coordinator_id", NULL},	/* copied to conn->coordinator_id by connectOptions1 */
+ {"proxy", NULL},	/* converted with atoi() into conn->is_proxy by connectOptions1 */
+ /* Terminating entry --- MUST BE LAST */
+ {NULL, NULL}
+};
+
+static bool connectOptions1(GTM_Conn *conn, const char *conninfo);
+static int connectGTMStart(GTM_Conn *conn);
+static int connectGTMComplete(GTM_Conn *conn);
+static GTM_Conn *makeEmptyGTM_Conn(void);
+static void freeGTM_Conn(GTM_Conn *conn);
+static void closeGTM_Conn(GTM_Conn *conn);
+static GTMPQconninfoOption *conninfo_parse(const char *conninfo,
+ PQExpBuffer errorMessage, bool use_defaults);
+static char *conninfo_getval(GTMPQconninfoOption *connOptions,
+ const char *keyword);
+
+static int pqPacketSend(GTM_Conn *conn, char packet_type,
+ const void *buf, size_t buf_len);
+
+/*
+ * PQconnectGTM
+ *
+ * Blocking connect: start a connection described by the conninfo string and,
+ * unless the start already failed, drive it to completion.
+ *
+ * Returns whatever PQconnectGTMStart returned (possibly NULL).  The result of
+ * connectGTMComplete is discarded, so the caller has to inspect conn->status
+ * to find out whether the connection is usable.
+ */
+GTM_Conn *
+PQconnectGTM(const char *conninfo)
+{
+	GTM_Conn   *gtm_conn;
+
+	gtm_conn = PQconnectGTMStart(conninfo);
+
+	/* Nothing to complete if allocation or the start phase failed */
+	if (gtm_conn == NULL || gtm_conn->status == CONNECTION_BAD)
+		return gtm_conn;
+
+	(void) connectGTMComplete(gtm_conn);
+	return gtm_conn;
+}
+
+/*
+ *		PQconnectGTMStart
+ *
+ * Allocate a GTM_Conn, load the conninfo options into it and begin the
+ * connection attempt.
+ *
+ * Returns NULL only when the GTM_Conn itself could not be allocated.
+ * Otherwise the connection object is returned even on failure; a status of
+ * CONNECTION_BAD means an error occurred (conn->errorMessage may describe it)
+ * and the caller should release the object with GTMPQfinish.  Any other
+ * status means this stage succeeded and the caller should continue with
+ * GTMPQconnectPoll.
+ */
+GTM_Conn *
+PQconnectGTMStart(const char *conninfo)
+{
+	GTM_Conn   *new_conn = makeEmptyGTM_Conn();
+
+	if (new_conn == NULL)
+		return NULL;
+
+	/*
+	 * Parse the conninfo string first; only if that worked do we try to
+	 * start connecting.  connectOptions1 sets the status itself on failure,
+	 * whereas for connectGTMStart we force CONNECTION_BAD just in case it
+	 * did not.
+	 */
+	if (connectOptions1(new_conn, conninfo) && !connectGTMStart(new_conn))
+		new_conn->status = CONNECTION_BAD;
+
+	return new_conn;
+}
+
+/*
+ *		connectOptions1
+ *
+ * Internal subroutine to set up connection parameters given an already-
+ * created GTM_Conn and a conninfo string.
+ *
+ * The string-valued options (hostaddr, host, port, connect_timeout,
+ * coordinator_id) are duplicated into the conn structure; "proxy" is
+ * converted to an integer with atoi().  Options that are absent are stored
+ * as NULL (or 0 for "proxy").
+ *
+ * Returns true if OK, false if trouble (parse failure or out of memory), in
+ * which case errorMessage is set and so is conn->status.
+ */
+static bool
+connectOptions1(GTM_Conn *conn, const char *conninfo)
+{
+	GTMPQconninfoOption *connOptions;
+	char	   *tmp;
+
+	/*
+	 * Parse the conninfo string
+	 */
+	connOptions = conninfo_parse(conninfo, &conn->errorMessage, true);
+	if (connOptions == NULL)
+	{
+		conn->status = CONNECTION_BAD;
+		/* errorMessage is already set */
+		return false;
+	}
+
+	/*
+	 * Move option values into conn structure.  A failed strdup() must not be
+	 * mistaken for "option not given", so every copy is checked.
+	 */
+	tmp = conninfo_getval(connOptions, "hostaddr");
+	conn->pghostaddr = tmp ? strdup(tmp) : NULL;
+	if (tmp != NULL && conn->pghostaddr == NULL)
+		goto oom_error;
+
+	tmp = conninfo_getval(connOptions, "host");
+	conn->pghost = tmp ? strdup(tmp) : NULL;
+	if (tmp != NULL && conn->pghost == NULL)
+		goto oom_error;
+
+	tmp = conninfo_getval(connOptions, "port");
+	conn->pgport = tmp ? strdup(tmp) : NULL;
+	if (tmp != NULL && conn->pgport == NULL)
+		goto oom_error;
+
+	tmp = conninfo_getval(connOptions, "connect_timeout");
+	conn->connect_timeout = tmp ? strdup(tmp) : NULL;
+	if (tmp != NULL && conn->connect_timeout == NULL)
+		goto oom_error;
+
+	tmp = conninfo_getval(connOptions, "coordinator_id");
+	conn->coordinator_id = tmp ? strdup(tmp) : NULL;
+	if (tmp != NULL && conn->coordinator_id == NULL)
+		goto oom_error;
+
+	tmp = conninfo_getval(connOptions, "proxy");
+	conn->is_proxy = tmp ? atoi(tmp) : 0;
+
+	/*
+	 * Free the option info - all is in conn now
+	 */
+	GTMPQconninfoFree(connOptions);
+
+	return true;
+
+oom_error:
+	/* Fields copied so far stay in conn and are released with it */
+	appendGTMPQExpBuffer(&conn->errorMessage, "out of memory\n");
+	conn->status = CONNECTION_BAD;
+	GTMPQconninfoFree(connOptions);
+	return false;
+}
+
+
+/* ----------
+ * connectNoDelay -
+ * Turn on TCP_NODELAY for conn->sock.  When the platform does not define
+ * TCP_NODELAY this is a no-op that reports success.
+ *
+ * Returns 1 if successful, 0 if not (an error message is appended to
+ * conn->errorMessage in that case).
+ * ----------
+ */
+static int
+connectNoDelay(GTM_Conn *conn)
+{
+#ifdef TCP_NODELAY
+	int			enable = 1;
+	int			rc;
+
+	rc = setsockopt(conn->sock, IPPROTO_TCP, TCP_NODELAY,
+					(char *) &enable, sizeof(enable));
+	if (rc < 0)
+	{
+		appendGTMPQExpBuffer(&conn->errorMessage,
+							 "could not set socket to TCP no delay mode: \n");
+		return 0;
+	}
+#endif
+
+	return 1;
+}
+
+
+/* ----------
+ * connectFailureMessage -
+ * Append a friendly error message to conn->errorMessage after a failed
+ * connection attempt.
+ *
+ * The host shown is pghostaddr if set, else pghost, else "???".  The port is
+ * shown as "???" when none was given, because passing a NULL pointer to %s
+ * is undefined behaviour.
+ *
+ * errorno is the socket error code of the failed attempt; it is currently
+ * not included in the message.
+ * ----------
+ */
+static void
+connectFailureMessage(GTM_Conn *conn, int errorno)
+{
+	const char *host;
+	const char *port;
+
+	(void) errorno;				/* not reported, see header comment */
+
+	if (conn->pghostaddr)
+		host = conn->pghostaddr;
+	else if (conn->pghost)
+		host = conn->pghost;
+	else
+		host = "???";
+
+	port = conn->pgport ? conn->pgport : "???";
+
+	appendGTMPQExpBuffer(&conn->errorMessage,
+						 "could not connect to server: \n"
+						 "\tIs the server running on host \"%s\" and accepting\n"
+						 "\tTCP/IP connections on port %s?\n",
+						 host,
+						 port);
+}
+
+
+/* ----------
+ * connectGTMStart -
+ * Begin the process of making a connection to the GTM server.
+ *
+ * Resolves the target address (pghostaddr if set, else pghost, else
+ * "localhost") together with the configured port, stores the resulting
+ * address list in conn and runs the first step of GTMPQconnectPoll.
+ *
+ * A port is mandatory: there is no default to fall back on in this file, so
+ * a missing or empty port is reported as an error instead of using an
+ * uninitialized port number.
+ *
+ * Returns 1 if successful, 0 if not (conn->status is then CONNECTION_BAD).
+ * ----------
+ */
+static int
+connectGTMStart(GTM_Conn *conn)
+{
+	int			portnum;
+	char		portstr[128];
+	struct addrinfo *addrs = NULL;
+	struct addrinfo hint;
+	const char *node;
+	int			ret;
+
+	if (!conn)
+		return 0;
+
+	/* Ensure our buffers are empty */
+	conn->inStart = conn->inCursor = conn->inEnd = 0;
+	conn->outCount = 0;
+
+	/*
+	 * Determine the parameters to pass to gtm_getaddrinfo_all.
+	 */
+
+	/* Initialize hint structure */
+	MemSet(&hint, 0, sizeof(hint));
+	hint.ai_socktype = SOCK_STREAM;
+	hint.ai_family = AF_UNSPEC;
+
+	/* Set up port number as a string */
+	if (conn->pgport == NULL || conn->pgport[0] == '\0')
+	{
+		/*
+		 * Nothing has been opened by this function yet, so there is no
+		 * socket to clean up here; just report the failure.
+		 */
+		appendGTMPQExpBuffer(&conn->errorMessage,
+							 "no GTM port specified\n");
+		conn->status = CONNECTION_BAD;
+		return 0;
+	}
+	portnum = atoi(conn->pgport);
+	snprintf(portstr, sizeof(portstr), "%d", portnum);
+
+	if (conn->pghostaddr != NULL && conn->pghostaddr[0] != '\0')
+	{
+		/* Using pghostaddr avoids a hostname lookup */
+		node = conn->pghostaddr;
+		hint.ai_flags = AI_NUMERICHOST;
+	}
+	else if (conn->pghost != NULL && conn->pghost[0] != '\0')
+	{
+		/* Using pghost, so we have to look-up the hostname */
+		node = conn->pghost;
+	}
+	else
+	{
+		/* Without Unix sockets, default to localhost instead */
+		node = "localhost";
+	}
+
+	/* Use gtm_getaddrinfo_all() to resolve the address */
+	ret = gtm_getaddrinfo_all(node, portstr, &hint, &addrs);
+	if (ret || !addrs)
+	{
+		appendGTMPQExpBuffer(&conn->errorMessage,
+							 "could not translate host name \"%s\" to address: %s\n",
+							 node, gai_strerror(ret));
+		if (addrs)
+			gtm_freeaddrinfo_all(hint.ai_family, addrs);
+		goto connect_errReturn;
+	}
+
+	/*
+	 * Set up to try to connect, starting with the first resolved address.
+	 */
+	conn->addrlist = addrs;
+	conn->addr_cur = addrs;
+	conn->addrlist_family = hint.ai_family;
+	conn->status = CONNECTION_NEEDED;
+
+	/*
+	 * The code for processing CONNECTION_NEEDED state is in
+	 * GTMPQconnectPoll(), so that it can easily be re-executed if needed
+	 * again during the asynchronous startup process.  However, we must run
+	 * it once here, because callers expect a success return from this
+	 * routine to mean that we are in PGRES_POLLING_WRITING connection state.
+	 */
+	if (GTMPQconnectPoll(conn) == PGRES_POLLING_WRITING)
+		return 1;
+
+connect_errReturn:
+	if (conn->sock >= 0)
+	{
+		close(conn->sock);
+		conn->sock = -1;
+	}
+	conn->status = CONNECTION_BAD;
+	return 0;
+}
+
+
+/*
+ *		connectGTMComplete
+ *
+ * Block until the connection attempt started by connectGTMStart has either
+ * succeeded or failed, honouring conn->connect_timeout when it is set to a
+ * positive number of seconds.
+ *
+ * Returns 1 on success, 0 on failure (conn->status is CONNECTION_BAD then).
+ */
+static int
+connectGTMComplete(GTM_Conn *conn)
+{
+	GTMClientPollingStatusType poll_state = PGRES_POLLING_WRITING;
+	time_t		deadline = ((time_t) -1);
+
+	if (conn == NULL || conn->status == CONNECTION_BAD)
+		return 0;
+
+	/*
+	 * Work out the deadline, if a non-zero timeout was requested.  Rounding
+	 * could cause the connection to fail, so allow at least 2 seconds.
+	 */
+	if (conn->connect_timeout != NULL)
+	{
+		int			secs = atoi(conn->connect_timeout);
+
+		if (secs > 0)
+			deadline = time(NULL) + (secs < 2 ? 2 : secs);
+	}
+
+	/*
+	 * The state right after PQconnectGTMStart is "wait until the socket is
+	 * writable"; after every wait we let GTMPQconnectPoll advance the state
+	 * machine and tell us what to wait for next.
+	 */
+	for (;;)
+	{
+		if (poll_state == PGRES_POLLING_OK)
+		{
+			/* Working connection: discard any stored error messages */
+			resetGTMPQExpBuffer(&conn->errorMessage);
+			return 1;
+		}
+
+		if (poll_state == PGRES_POLLING_READING ||
+			poll_state == PGRES_POLLING_WRITING)
+		{
+			int			for_read = (poll_state == PGRES_POLLING_READING);
+
+			if (gtmpqWaitTimed(for_read, !for_read, conn, deadline))
+			{
+				conn->status = CONNECTION_BAD;
+				return 0;
+			}
+		}
+		else
+		{
+			/* Just in case we failed to set it in GTMPQconnectPoll */
+			conn->status = CONNECTION_BAD;
+			return 0;
+		}
+
+		poll_state = GTMPQconnectPoll(conn);
+	}
+}
+
+/* ----------------
+ *		GTMPQconnectPoll
+ *
+ * Poll an asynchronous connection and advance its state machine as far as
+ * possible.
+ *
+ * Returns a GTMClientPollingStatusType:
+ *	PGRES_POLLING_WRITING / PGRES_POLLING_READING - call again once the
+ *		socket is writable / readable (use select(2) or similar);
+ *	PGRES_POLLING_OK - the connection is established;
+ *	PGRES_POLLING_FAILED - the attempt failed, conn->status is
+ *		CONNECTION_BAD and conn->errorMessage may describe the problem.
+ *
+ * The socket is deliberately left open on failure; you must call
+ * GTMPQfinish whether or not this fails.
+ * ----------------
+ */
+GTMClientPollingStatusType
+GTMPQconnectPoll(GTM_Conn *conn)
+{
+	if (conn == NULL)
+		return PGRES_POLLING_FAILED;
+
+	/* Get the new data */
+	switch (conn->status)
+	{
+			/*
+			 * We really shouldn't have been polled in these two cases, but
+			 * we can handle it.
+			 */
+		case CONNECTION_BAD:
+			return PGRES_POLLING_FAILED;
+		case CONNECTION_OK:
+			return PGRES_POLLING_OK;
+
+			/* These are reading states */
+		case CONNECTION_AWAITING_RESPONSE:
+		case CONNECTION_AUTH_OK:
+			{
+				/* Load waiting data */
+				int			n = gtmpqReadData(conn);
+
+				if (n < 0)
+					goto error_return;
+				if (n == 0)
+					return PGRES_POLLING_READING;
+
+				break;
+			}
+
+			/* These are writing states, so we just proceed. */
+		case CONNECTION_STARTED:
+		case CONNECTION_MADE:
+			break;
+
+		case CONNECTION_NEEDED:
+			break;
+
+		default:
+			appendGTMPQExpBuffer(&conn->errorMessage,
+								 "invalid connection state, "
+								 "probably indicative of memory corruption\n"
+				);
+			goto error_return;
+	}
+
+
+keep_going:						/* We will come back to here until there is
+								 * nothing left to do. */
+	switch (conn->status)
+	{
+		case CONNECTION_NEEDED:
+			{
+				/*
+				 * Try to initiate a connection to one of the addresses
+				 * returned by gtm_getaddrinfo_all().  conn->addr_cur is the
+				 * next one to try.  We fail when we run out of addresses
+				 * (reporting the error returned for the *last* alternative,
+				 * which may not be what users expect :-().
+				 */
+				while (conn->addr_cur != NULL)
+				{
+					struct addrinfo *addr_cur = conn->addr_cur;
+
+					/* Remember current address for possible error msg */
+					memcpy(&conn->raddr.addr, addr_cur->ai_addr,
+						   addr_cur->ai_addrlen);
+					conn->raddr.salen = addr_cur->ai_addrlen;
+
+					/* Open a socket */
+					conn->sock = socket(addr_cur->ai_family, SOCK_STREAM, 0);
+					if (conn->sock < 0)
+					{
+						/*
+						 * ignore socket() failure if we have more addresses
+						 * to try
+						 */
+						if (addr_cur->ai_next != NULL)
+						{
+							conn->addr_cur = addr_cur->ai_next;
+							continue;
+						}
+						appendGTMPQExpBuffer(&conn->errorMessage,
+											 "could not create socket: \n");
+						break;
+					}
+
+					/*
+					 * Select socket options: no delay of outgoing data for
+					 * TCP sockets.  On failure, drop this socket and move on
+					 * to the next address.
+					 *
+					 * NOTE(review): nothing in this function switches the
+					 * socket to non-blocking mode, although the comments
+					 * below assume it -- confirm whether that is intended.
+					 */
+					if (!IS_AF_UNIX(addr_cur->ai_family))
+					{
+						if (!connectNoDelay(conn))
+						{
+							close(conn->sock);
+							conn->sock = -1;
+							conn->addr_cur = addr_cur->ai_next;
+							continue;
+						}
+					}
+
+					/*
+					 * Start/make connection.  This should not block, since
+					 * we are in nonblock mode.  If it does, well, too bad.
+					 */
+					if (connect(conn->sock, addr_cur->ai_addr,
+								addr_cur->ai_addrlen) < 0)
+					{
+						if (SOCK_ERRNO == EINPROGRESS ||
+							SOCK_ERRNO == EWOULDBLOCK ||
+							SOCK_ERRNO == EINTR ||
+							SOCK_ERRNO == 0)
+						{
+							/*
+							 * This is fine - we're in non-blocking mode, and
+							 * the connection is in progress.  Tell caller to
+							 * wait for write-ready on socket.
+							 */
+							conn->status = CONNECTION_STARTED;
+							return PGRES_POLLING_WRITING;
+						}
+						/* otherwise, trouble */
+					}
+					else
+					{
+						/*
+						 * Hm, we're connected already --- seems the
+						 * "nonblock connection" wasn't.  Advance the state
+						 * machine and go do the next stuff.
+						 */
+						conn->status = CONNECTION_STARTED;
+						goto keep_going;
+					}
+
+					/*
+					 * This connection failed --- set up error report, then
+					 * close socket (do it this way in case close() affects
+					 * the value of errno...).  We will ignore the connect()
+					 * failure and keep going if there are more addresses.
+					 */
+					connectFailureMessage(conn, SOCK_ERRNO);
+					if (conn->sock >= 0)
+					{
+						close(conn->sock);
+						conn->sock = -1;
+					}
+
+					/*
+					 * Try the next address, if any.
+					 */
+					conn->addr_cur = addr_cur->ai_next;
+				}				/* loop over addresses */
+
+				/*
+				 * Ooops, no more addresses.  An appropriate error message is
+				 * already set up, so just set the right status.
+				 */
+				goto error_return;
+			}
+
+		case CONNECTION_STARTED:
+			{
+				int			optval;
+
+				/*
+				 * getsockopt() takes a socklen_t *, which need not have the
+				 * same size as size_t.
+				 */
+				socklen_t	optlen = sizeof(optval);
+
+				/*
+				 * Write ready, since we've made it here, so the connection
+				 * has been made ... or has failed.
+				 */
+
+				/*
+				 * Now check (using getsockopt) that there is not an error
+				 * state waiting for us on the socket.
+				 */
+
+				if (getsockopt(conn->sock, SOL_SOCKET, SO_ERROR,
+							   (char *) &optval, &optlen) == -1)
+				{
+					appendGTMPQExpBuffer(&conn->errorMessage,
+										 libpq_gettext("could not get socket error status: \n"));
+					goto error_return;
+				}
+				else if (optval != 0)
+				{
+					/*
+					 * When using a nonblocking connect, we will typically
+					 * see connect failures at this point, so provide a
+					 * friendly error message.
+					 */
+					connectFailureMessage(conn, optval);
+
+					/*
+					 * If more addresses remain, keep trying, just as in the
+					 * case where connect() returned failure immediately.
+					 */
+					if (conn->addr_cur->ai_next != NULL)
+					{
+						if (conn->sock >= 0)
+						{
+							close(conn->sock);
+							conn->sock = -1;
+						}
+						conn->addr_cur = conn->addr_cur->ai_next;
+						conn->status = CONNECTION_NEEDED;
+						goto keep_going;
+					}
+					goto error_return;
+				}
+
+				/* Fill in the client address */
+				conn->laddr.salen = sizeof(conn->laddr.addr);
+				if (getsockname(conn->sock,
+								(struct sockaddr *) & conn->laddr.addr,
+								&conn->laddr.salen) < 0)
+				{
+					appendGTMPQExpBuffer(&conn->errorMessage,
+										 "could not get client address from socket:\n");
+					goto error_return;
+				}
+
+				/*
+				 * Make sure we can write before advancing to next step.
+				 */
+				conn->status = CONNECTION_MADE;
+				return PGRES_POLLING_WRITING;
+			}
+
+		case CONNECTION_MADE:
+			{
+				GTM_StartupPacket sp;
+
+				/*
+				 * The coordinator ID is optional in the conninfo string, so
+				 * it may be NULL here; atoi(NULL) would crash.
+				 */
+				if (conn->coordinator_id == NULL)
+				{
+					appendGTMPQExpBuffer(&conn->errorMessage,
+										 "coordinator_id is not specified\n");
+					goto error_return;
+				}
+
+				/*
+				 * Build a startup packet.  We tell the GTM server/proxy our
+				 * coordinator ID and whether we are a proxy or not.
+				 *
+				 * When the connection is made from the proxy, we let the GTM
+				 * server know about it so that some special headers are
+				 * handled correctly by the server.
+				 *
+				 * The whole struct goes over the wire, so zero it first to
+				 * avoid sending uninitialized padding bytes.
+				 */
+				MemSet(&sp, 0, sizeof(sp));
+				sp.sp_cid = atoi(conn->coordinator_id);
+				sp.sp_isproxy = conn->is_proxy;
+
+				/*
+				 * Send the startup packet.
+				 *
+				 * Theoretically, this could block, but it really shouldn't
+				 * since we only got here if the socket is write-ready.
+				 */
+				if (pqPacketSend(conn, 'A', &sp,
+								 sizeof (GTM_StartupPacket)) != STATUS_OK)
+				{
+					appendGTMPQExpBuffer(&conn->errorMessage,
+										 "could not send startup packet: \n");
+					goto error_return;
+				}
+
+				conn->status = CONNECTION_AWAITING_RESPONSE;
+				return PGRES_POLLING_READING;
+			}
+
+			/*
+			 * Handle authentication exchange: wait for server messages and
+			 * respond as necessary.
+			 */
+		case CONNECTION_AWAITING_RESPONSE:
+			{
+				char		beresp;
+
+				/*
+				 * Scan the message from current point (note that if we find
+				 * the message is incomplete, we will return without
+				 * advancing inStart, and resume here next time).
+				 */
+				conn->inCursor = conn->inStart;
+
+				/* Read type byte */
+				if (gtmpqGetc(&beresp, conn))
+				{
+					/* We'll come back when there is more data */
+					return PGRES_POLLING_READING;
+				}
+
+				/*
+				 * Validate message type: we expect only an authentication
+				 * request or an error here.  Anything else probably means
+				 * it's not GTM on the other end at all.
+				 */
+				if (!(beresp == 'R' || beresp == 'E'))
+				{
+					appendGTMPQExpBuffer(&conn->errorMessage,
+										 "expected authentication request from "
+										 "server, but received %c\n",
+										 beresp);
+					goto error_return;
+				}
+
+
+				/* Handle errors. */
+				if (beresp == 'E')
+				{
+					if (gtmpqGets_append(&conn->errorMessage, conn))
+					{
+						/* We'll come back when there is more data */
+						return PGRES_POLLING_READING;
+					}
+					/* OK, we read the message; mark data consumed */
+					conn->inStart = conn->inCursor;
+					goto error_return;
+				}
+
+				{
+					/*
+					 * Server sends a dummy message body of size 4 bytes.
+					 *
+					 * NOTE(review): this assumes gtmpqGetInt, like gtmpqGetc
+					 * above, returns non-zero when the data has not fully
+					 * arrived.  In that case we must not mark the message
+					 * consumed, or the late bytes would be misread as the
+					 * start of the next message.
+					 */
+					int			tmp_int;
+
+					if (gtmpqGetInt(&tmp_int, 4, conn))
+					{
+						/* We'll come back when there is more data */
+						return PGRES_POLLING_READING;
+					}
+				}
+
+				/*
+				 * OK, we successfully read the message; mark data consumed
+				 */
+				conn->inStart = conn->inCursor;
+
+				/* We are done with authentication exchange */
+				conn->status = CONNECTION_AUTH_OK;
+
+				/* Look to see if we have more data yet. */
+				goto keep_going;
+			}
+
+		case CONNECTION_AUTH_OK:
+			{
+				/* We can release the address list now. */
+				gtm_freeaddrinfo_all(conn->addrlist_family, conn->addrlist);
+				conn->addrlist = NULL;
+				conn->addr_cur = NULL;
+
+				/* Otherwise, we are open for business! */
+				conn->status = CONNECTION_OK;
+				return PGRES_POLLING_OK;
+			}
+
+
+		default:
+			/* status is an enum, so report it as a number, not a character */
+			appendGTMPQExpBuffer(&conn->errorMessage,
+								 "invalid connection state %d, "
+								 "probably indicative of memory corruption\n"
+								 ,
+								 (int) conn->status);
+			goto error_return;
+	}
+
+	/* Unreachable */
+
+error_return:
+
+	/*
+	 * We used to close the socket at this point, but that makes it awkward
+	 * for those above us if they wish to remove this socket from their own
+	 * records (an fd_set for example).  We'll just have this socket closed
+	 * when GTMPQfinish is called (which is compulsory even after an error,
+	 * since the connection structure must be freed).
+	 */
+	conn->status = CONNECTION_BAD;
+	return PGRES_POLLING_FAILED;
+}
+
+
+/*
+ * makeEmptyGTM_Conn
+ *	 - create a GTM_Conn data structure with (as yet) no interesting data
+ *
+ * The structure is zero-filled, its status is CONNECTION_BAD, its socket is
+ * marked as "not open" (-1) and its I/O and message buffers are allocated.
+ *
+ * Returns the new structure, or NULL if any allocation failed.
+ */
+static GTM_Conn *
+makeEmptyGTM_Conn(void)
+{
+	GTM_Conn   *conn;
+
+	conn = (GTM_Conn *) malloc(sizeof(GTM_Conn));
+	if (conn == NULL)
+		return conn;
+
+	/* Zero all pointers and booleans */
+	MemSet(conn, 0, sizeof(GTM_Conn));
+
+	conn->status = CONNECTION_BAD;
+
+	/*
+	 * Zero is a valid file descriptor.  Mark the socket as not open, so that
+	 * the "if (conn->sock >= 0) close(conn->sock)" cleanup paths do not
+	 * close descriptor 0 for a connection that never opened a socket.
+	 */
+	conn->sock = -1;
+
+	/*
+	 * We try to send at least 8K at a time, which is the usual size of pipe
+	 * buffers on Unix systems.  That way, when we are sending a large amount
+	 * of data, we avoid incurring extra kernel context swaps for partial
+	 * bufferloads.  The output buffer is initially made 16K in size, and we
+	 * try to dump it after accumulating 8K.
+	 *
+	 * With the same goal of minimizing context swaps, the input buffer will
+	 * be enlarged anytime it has less than 8K free, so we initially allocate
+	 * twice that.
+	 */
+	conn->inBufSize = 16 * 1024;
+	conn->inBuffer = (char *) malloc(conn->inBufSize);
+	conn->outBufSize = 16 * 1024;
+	conn->outBuffer = (char *) malloc(conn->outBufSize);
+	initGTMPQExpBuffer(&conn->errorMessage);
+	initGTMPQExpBuffer(&conn->workBuffer);
+
+	if (conn->inBuffer == NULL ||
+		conn->outBuffer == NULL ||
+		PQExpBufferBroken(&conn->errorMessage) ||
+		PQExpBufferBroken(&conn->workBuffer))
+	{
+		/* out of memory already :-( */
+		freeGTM_Conn(conn);
+		conn = NULL;
+	}
+
+	return conn;
+}
+
+/*
+ * freeGTM_Conn
+ *	 - free an idle (closed) GTM_Conn data structure
+ *
+ * Releases the connection parameters copied in by connectOptions1
+ * (including coordinator_id), the I/O buffers, the message buffers and
+ * finally the structure itself.
+ *
+ * NOTE: this should not overlap any functionality with closeGTM_Conn().
+ * Clearing/resetting of transient state belongs there; what we do here is
+ * release data that is to be held for the life of the GTM_Conn structure.
+ */
+static void
+freeGTM_Conn(GTM_Conn *conn)
+{
+	/* free(NULL) is a no-op, so unset fields need no special casing */
+	free(conn->pghost);
+	free(conn->pghostaddr);
+	free(conn->pgport);
+	free(conn->connect_timeout);
+	free(conn->coordinator_id);
+	free(conn->inBuffer);
+	free(conn->outBuffer);
+	termGTMPQExpBuffer(&conn->errorMessage);
+	termGTMPQExpBuffer(&conn->workBuffer);
+
+	free(conn);
+}
+
+/*
+ * closeGTM_Conn
+ *	 - properly close a connection to the GTM server
+ *
+ * Resets or releases all transient state (socket, address list, buffer
+ * positions) but NOT the connection parameters, so that the GTM_Conn could
+ * be used to start a fresh connection with the same parameters.
+ */
+static void
+closeGTM_Conn(GTM_Conn *conn)
+{
+	/*
+	 * A fully established connection gets a "close connection" ('X')
+	 * message, sent with a length word; errors while sending are ignored.
+	 * The protocol doesn't allow Terminate messages during the startup
+	 * phase, hence the status check.
+	 */
+	if (conn->status == CONNECTION_OK && conn->sock >= 0)
+	{
+		gtmpqPutMsgStart('X', true, conn);
+		gtmpqPutMsgEnd(conn);
+		gtmpqFlush(conn);
+	}
+
+	/* Drop the socket, if there is one */
+	if (conn->sock >= 0)
+		close(conn->sock);
+	conn->sock = -1;
+
+	/* Well, not really _bad_ - just absent */
+	conn->status = CONNECTION_BAD;
+
+	/* Forget the resolved addresses */
+	gtm_freeaddrinfo_all(conn->addrlist_family, conn->addrlist);
+	conn->addrlist = NULL;
+	conn->addr_cur = NULL;
+
+	/* Discard anything still sitting in the I/O buffers */
+	conn->inEnd = 0;
+	conn->inCursor = 0;
+	conn->inStart = 0;
+	conn->outCount = 0;
+}
+
+/*
+ * GTMPQfinish: properly close a connection to the backend. Also frees
+ * the GTM_Conn data structure so it shouldn't be re-used after this.
+ */
+void
+GTMPQfinish(GTM_Conn *conn)
+{
+	/* A NULL connection is silently accepted. */
+	if (conn == NULL)
+		return;
+
+	/* Reset transient state first, then release the structure itself. */
+	closeGTM_Conn(conn);
+	freeGTM_Conn(conn);
+}
+
+/*
+ * pqPacketSend() -- convenience routine to send a message to server.
+ *
+ * pack_type: the single-byte message type code. (Pass zero for startup
+ * packets, which have no message type code.)
+ *
+ * buf, buf_len: contents of message. The given length includes only what
+ * is in buf; the message type and message length fields are added here.
+ *
+ * RETURNS: STATUS_ERROR if the write fails, STATUS_OK otherwise.
+ * SIDE_EFFECTS: may block.
+ *
+ * Note: all messages sent with this routine have a length word, whether
+ * it's protocol 2.0 or 3.0.
+ */
+static int
+pqPacketSend(GTM_Conn *conn, char pack_type,
+			 const void *buf, size_t buf_len)
+{
+	/*
+	 * Build and push out the message in four steps: header, body, trailer
+	 * (length fix-up), flush.  Each step returns non-zero on trouble, and
+	 * the short-circuit evaluation stops at the first step that fails.
+	 */
+	if (gtmpqPutMsgStart(pack_type, true, conn) ||
+		gtmpqPutnchar(buf, buf_len, conn) ||
+		gtmpqPutMsgEnd(conn) ||
+		gtmpqFlush(conn))
+		return STATUS_ERROR;
+
+	return STATUS_OK;
+}
+
+
+/*
+ * GTMPQconninfoParse
+ *
+ * Parse a string like PQconnectGTM() would do and return the
+ * resulting connection options array. NULL is returned on failure.
+ * The result contains only options specified directly in the string,
+ * not any possible default values.
+ *
+ * If errmsg isn't NULL, *errmsg is set to NULL on success, or a malloc'd
+ * string on failure (use PQfreemem to free it). In out-of-memory conditions
+ * both *errmsg and the result could be NULL.
+ *
+ * NOTE: the returned array is dynamically allocated and should
+ * be freed when no longer needed via GTMPQconninfoFree().
+ */
+GTMPQconninfoOption *
+GTMPQconninfoParse(const char *conninfo, char **errmsg)
+{
+	PQExpBufferData parseErrors;
+	GTMPQconninfoOption *parsed;
+
+	/* Until proven otherwise there is no error text to hand back. */
+	if (errmsg != NULL)
+		*errmsg = NULL;
+
+	initGTMPQExpBuffer(&parseErrors);
+	if (PQExpBufferBroken(&parseErrors))
+		return NULL;			/* could not even allocate the error buffer */
+
+	parsed = conninfo_parse(conninfo, &parseErrors, false);
+
+	/*
+	 * The buffer's storage is handed to the caller only when parsing failed
+	 * and the caller asked for the message; in every other case it is
+	 * released here.
+	 */
+	if (parsed != NULL || errmsg == NULL)
+		termGTMPQExpBuffer(&parseErrors);
+	else
+		*errmsg = parseErrors.data;
+
+	return parsed;
+}
+
+/*
+ * Conninfo parser routine
+ *
+ * If successful, a malloc'd GTMPQconninfoOption array is returned.
+ * If not successful, NULL is returned and an error message is
+ * left in errorMessage.
+ * Defaults are supplied (from a service file, environment variables, etc)
+ * for unspecified options, but only if use_defaults is TRUE.
+ */
+static GTMPQconninfoOption *
+conninfo_parse(const char *conninfo, PQExpBuffer errorMessage,
+ bool use_defaults)
+{
+ /*
+ * NOTE(review): use_defaults is not referenced anywhere in this body, so
+ * no default values are filled in here regardless of its value.
+ */
+ char *pname;
+ char *pval;
+ char *buf;
+ char *cp;
+ char *cp2;
+ GTMPQconninfoOption *options;
+ GTMPQconninfoOption *option;
+
+ /* Make a working copy of GTMPQconninfoOptions */
+ options = malloc(sizeof(GTMPQconninfoOptions));
+ if (options == NULL)
+ {
+ printfGTMPQExpBuffer(errorMessage,
+ libpq_gettext("out of memory\n"));
+ return NULL;
+ }
+ memcpy(options, GTMPQconninfoOptions, sizeof(GTMPQconninfoOptions));
+
+ /* Need a modifiable copy of the input string */
+ if ((buf = strdup(conninfo)) == NULL)
+ {
+ printfGTMPQExpBuffer(errorMessage,
+ libpq_gettext("out of memory\n"));
+ GTMPQconninfoFree(options);
+ return NULL;
+ }
+ cp = buf;
+
+ /* Each iteration consumes one "name = value" pair from the copy. */
+ while (*cp)
+ {
+ /* Skip blanks before the parameter name */
+ if (isspace((unsigned char) *cp))
+ {
+ cp++;
+ continue;
+ }
+
+ /* Get the parameter name */
+ pname = cp;
+ while (*cp)
+ {
+ if (*cp == '=')
+ break;
+ if (isspace((unsigned char) *cp))
+ {
+ /* terminate the name in place, then skip blanks up to the '=' */
+ *cp++ = '\0';
+ while (*cp)
+ {
+ if (!isspace((unsigned char) *cp))
+ break;
+ cp++;
+ }
+ break;
+ }
+ cp++;
+ }
+
+ /* Check that there is a following '=' */
+ if (*cp != '=')
+ {
+ printfGTMPQExpBuffer(errorMessage,
+ libpq_gettext("missing \"=\" after \"%s\" in connection info string\n"),
+ pname);
+ GTMPQconninfoFree(options);
+ free(buf);
+ return NULL;
+ }
+ /* overwrite the '=' so that pname is a terminated string */
+ *cp++ = '\0';
+
+ /* Skip blanks after the '=' */
+ while (*cp)
+ {
+ if (!isspace((unsigned char) *cp))
+ break;
+ cp++;
+ }
+
+ /* Get the parameter value */
+ pval = cp;
+
+ /*
+ * The value is rewritten in place: cp reads, cp2 writes, and cp2 never
+ * gets ahead of cp because removing backslashes only shortens the text.
+ */
+ if (*cp != '\'')
+ {
+ /* Unquoted value: ends at whitespace or end of string */
+ cp2 = pval;
+ while (*cp)
+ {
+ if (isspace((unsigned char) *cp))
+ {
+ *cp++ = '\0';
+ break;
+ }
+ if (*cp == '\\')
+ {
+ /* backslash: copy the next character literally, if any */
+ cp++;
+ if (*cp != '\0')
+ *cp2++ = *cp++;
+ }
+ else
+ *cp2++ = *cp++;
+ }
+ *cp2 = '\0';
+ }
+ else
+ {
+ /* Quoted value: ends at the first unescaped single quote */
+ cp2 = pval;
+ cp++;
+ for (;;)
+ {
+ if (*cp == '\0')
+ {
+ printfGTMPQExpBuffer(errorMessage,
+ libpq_gettext("unterminated quoted string in connection info string\n"));
+ GTMPQconninfoFree(options);
+ free(buf);
+ return NULL;
+ }
+ if (*cp == '\\')
+ {
+ /* backslash: copy the next character literally, if any */
+ cp++;
+ if (*cp != '\0')
+ *cp2++ = *cp++;
+ continue;
+ }
+ if (*cp == '\'')
+ {
+ *cp2 = '\0';
+ cp++;
+ break;
+ }
+ *cp2++ = *cp++;
+ }
+ }
+
+ /*
+ * Now we have the name and the value. Search for the param record.
+ */
+ for (option = options; option->keyword != NULL; option++)
+ {
+ if (strcmp(option->keyword, pname) == 0)
+ break;
+ }
+ /* reaching the NULL-keyword terminator means the name is unknown */
+ if (option->keyword == NULL)
+ {
+ printfGTMPQExpBuffer(errorMessage,
+ libpq_gettext("invalid connection option \"%s\"\n"),
+ pname);
+ GTMPQconninfoFree(options);
+ free(buf);
+ return NULL;
+ }
+
+ /*
+ * Store the value
+ */
+ /* a keyword given more than once keeps only its last value */
+ if (option->val)
+ free(option->val);
+ option->val = strdup(pval);
+ if (!option->val)
+ {
+ printfGTMPQExpBuffer(errorMessage,
+ libpq_gettext("out of memory\n"));
+ GTMPQconninfoFree(options);
+ free(buf);
+ return NULL;
+ }
+ }
+
+ /* Done with the modifiable input string */
+ free(buf);
+
+ return options;
+}
+
+
+/*
+ * conninfo_getval
+ *		Look up "keyword" in an option array terminated by an entry whose
+ *		keyword is NULL.  Returns that entry's value pointer (which may
+ *		itself be NULL), or NULL when no entry matches.  No copy is made.
+ */
+static char *
+conninfo_getval(GTMPQconninfoOption *connOptions,
+				const char *keyword)
+{
+	GTMPQconninfoOption *entry = connOptions;
+
+	while (entry->keyword != NULL)
+	{
+		if (strcmp(entry->keyword, keyword) == 0)
+			return entry->val;
+		entry++;
+	}
+
+	return NULL;
+}
+
+
+/*
+ * GTMPQconninfoFree
+ *		Release an option array together with every value string stored in
+ *		it.  The array must end with an entry whose keyword is NULL.  A NULL
+ *		argument is accepted and ignored.
+ */
+void
+GTMPQconninfoFree(GTMPQconninfoOption *connOptions)
+{
+	GTMPQconninfoOption *entry;
+
+	if (!connOptions)
+		return;
+
+	/* free(NULL) is harmless, so unset values need no special casing */
+	for (entry = connOptions; entry->keyword; entry++)
+		free(entry->val);
+
+	free(connOptions);
+}
+
+/* Return the connection's host string, or NULL for a NULL connection. */
+char *
+GTMPQhost(const GTM_Conn *conn)
+{
+	return (conn != NULL) ? conn->pghost : NULL;
+}
+
+/* Return the connection's port string, or NULL for a NULL connection. */
+char *
+GTMPQport(const GTM_Conn *conn)
+{
+	return (conn != NULL) ? conn->pgport : NULL;
+}
+
+/* Return the connection status; a NULL connection counts as CONNECTION_BAD. */
+ConnStatusType
+GTMPQstatus(const GTM_Conn *conn)
+{
+	if (conn != NULL)
+		return conn->status;
+
+	return CONNECTION_BAD;
+}
+
+/*
+ * Return the text currently held in the connection's error buffer.  For a
+ * NULL connection a fixed message is returned instead.
+ */
+char *
+GTMPQerrorMessage(const GTM_Conn *conn)
+{
+	if (conn != NULL)
+		return conn->errorMessage.data;
+
+	return libpq_gettext("connection pointer is NULL\n");
+}
+
+/* Return the connection's socket descriptor, or -1 for a NULL connection. */
+int
+GTMPQsocket(const GTM_Conn *conn)
+{
+	return (conn != NULL) ? conn->sock : -1;
+}
+
+/*
+ * Start tracing protocol traffic to debug_port.  Any trace stream already
+ * attached is flushed and detached first.  Does nothing for a NULL conn.
+ */
+void
+GTMPQtrace(GTM_Conn *conn, FILE *debug_port)
+{
+	if (!conn)
+		return;
+
+	GTMPQuntrace(conn);
+	conn->Pfdebug = debug_port;
+}
+
+/*
+ * Stop tracing: flush the trace stream and detach it from the connection.
+ * The stream is not closed here.  Does nothing for a NULL conn or when no
+ * stream is attached.
+ */
+void
+GTMPQuntrace(GTM_Conn *conn)
+{
+	if (!conn || !conn->Pfdebug)
+		return;
+
+	fflush(conn->Pfdebug);
+	conn->Pfdebug = NULL;
+}
diff --git a/src/gtm/client/fe-misc.c b/src/gtm/client/fe-misc.c
new file mode 100644
index 0000000000..66172400a5
--- /dev/null
+++ b/src/gtm/client/fe-misc.c
@@ -0,0 +1,1035 @@
+/*-------------------------------------------------------------------------
+ *
+ * FILE
+ * fe-misc.c
+ *
+ * DESCRIPTION
+ * miscellaneous useful functions
+ *
+ * The communication routines here are analogous to the ones in
+ * backend/libpq/pqcomm.c and backend/libpq/pqcomprim.c, but operate
+ * in the considerably different environment of the frontend libpq.
+ * In particular, we work with a bare nonblock-mode socket, rather than
+ * a stdio stream, so that we can avoid unwanted blocking of the application.
+ *
+ * XXX: MOVE DEBUG PRINTOUT TO HIGHER LEVEL. As is, block and restart
+ * will cause repeat printouts.
+ *
+ * We must speak the same transmitted data representations as the backend
+ * routines.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * IDENTIFICATION
+ * $PostgreSQL: pgsql/src/interfaces/libpq/fe-misc.c,v 1.137 2008/12/11 07:34:09 petere Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "gtm/gtm_c.h"
+
+#include <signal.h>
+#include <time.h>
+
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#include <unistd.h>
+#include <sys/time.h>
+
+#include <poll.h>
+#include <sys/poll.h>
+#include <sys/select.h>
+
+#include "gtm/libpq-fe.h"
+#include "gtm/libpq-int.h"
+
+
+/* Append len raw bytes to the message under construction in outBuffer. */
+static int gtmpqPutMsgBytes(const void *buf, size_t len, GTM_Conn *conn);
+/* Try to send the first len bytes of outBuffer over the socket. */
+static int gtmpqSendSome(GTM_Conn *conn, int len);
+/* Validate the connection's socket, then poll it, retrying on EINTR. */
+static int gtmpqSocketCheck(GTM_Conn *conn, int forRead, int forWrite,
+ time_t end_time);
+/* Low-level readiness check on a descriptor via poll() or select(). */
+static int gtmpqSocketPoll(int sock, int forRead, int forWrite, time_t end_time);
+
+
+/*
+ * gtmpqGetc: get 1 character from the connection
+ *
+ * All these routines return 0 on success, EOF on error.
+ * Note that for the Get routines, EOF only means there is not enough
+ * data in the buffer, not that there is necessarily a hard error.
+ */
+int
+gtmpqGetc(char *result, GTM_Conn *conn)
+{
+	char		ch;
+
+	/* Nothing buffered past the cursor: report "not enough data". */
+	if (conn->inEnd <= conn->inCursor)
+		return EOF;
+
+	/* Consume one byte and hand it to the caller. */
+	ch = conn->inBuffer[conn->inCursor];
+	conn->inCursor++;
+	*result = ch;
+
+	if (conn->Pfdebug)
+		fprintf(conn->Pfdebug, "From backend> %c\n", ch);
+
+	return 0;
+}
+
+
+/*
+ * gtmpqPutc: write 1 char to the current message
+ */
+int
+gtmpqPutc(char c, GTM_Conn *conn)
+{
+	/* Append the single byte to the message being built. */
+	if (gtmpqPutMsgBytes(&c, 1, conn) != 0)
+		return EOF;
+
+	if (conn->Pfdebug)
+		fprintf(conn->Pfdebug, "To backend> %c\n", c);
+
+	return 0;
+}
+
+
+/*
+ * gtmpqGets[_append]:
+ * get a null-terminated string from the connection,
+ * and store it in an expansible PQExpBuffer.
+ * If we run out of memory, all of the string is still read,
+ * but the excess characters are silently discarded.
+ */
+static int
+gtmpqGets_internal(PQExpBuffer buf, GTM_Conn *conn, bool resetbuffer)
+{
+	int			scan;
+	int			slen;
+
+	/* Look for the terminating NUL among the bytes buffered so far. */
+	for (scan = conn->inCursor; scan < conn->inEnd; scan++)
+	{
+		if (conn->inBuffer[scan] == '\0')
+			break;
+	}
+
+	/* No terminator yet: the string has not fully arrived. */
+	if (scan >= conn->inEnd)
+		return EOF;
+
+	slen = scan - conn->inCursor;
+
+	if (resetbuffer)
+		resetGTMPQExpBuffer(buf);
+
+	appendBinaryGTMPQExpBuffer(buf, conn->inBuffer + conn->inCursor, slen);
+
+	/* Step the cursor past the string and its terminator. */
+	conn->inCursor = scan + 1;
+
+	if (conn->Pfdebug)
+		fprintf(conn->Pfdebug, "From backend> \"%s\"\n",
+				buf->data);
+
+	return 0;
+}
+
+/* Read a NUL-terminated string into buf, replacing buf's previous contents. */
+int
+gtmpqGets(PQExpBuffer buf, GTM_Conn *conn)
+{
+	int			rc = gtmpqGets_internal(buf, conn, true);
+
+	return rc;
+}
+
+/* Read a NUL-terminated string and append it to buf's current contents. */
+int
+gtmpqGets_append(PQExpBuffer buf, GTM_Conn *conn)
+{
+	int			rc = gtmpqGets_internal(buf, conn, false);
+
+	return rc;
+}
+
+
+/*
+ * gtmpqPuts: write a null-terminated string to the current message
+ */
+int
+gtmpqPuts(const char *s, GTM_Conn *conn)
+{
+	/* The terminating NUL is part of what goes on the wire. */
+	size_t		nbytes = strlen(s) + 1;
+
+	if (gtmpqPutMsgBytes(s, nbytes, conn) != 0)
+		return EOF;
+
+	if (conn->Pfdebug)
+		fprintf(conn->Pfdebug, "To backend> \"%s\"\n", s);
+
+	return 0;
+}
+
+/*
+ * gtmpqGetnchar:
+ *	get a string of exactly len bytes in buffer s, no null termination
+ *
+ * Returns 0 after copying len bytes and advancing the input cursor.
+ * Returns EOF, leaving the cursor untouched, when fewer than len bytes are
+ * buffered past the cursor.
+ */
+int
+gtmpqGetnchar(char *s, size_t len, GTM_Conn *conn)
+{
+	size_t		avail = (size_t) (conn->inEnd - conn->inCursor);
+
+	/*
+	 * len is unsigned, so a "len < 0" test can never be true; only the
+	 * upper bound needs checking.
+	 */
+	if (len > avail)
+		return EOF;
+
+	memcpy(s, conn->inBuffer + conn->inCursor, len);
+	/* no terminating null */
+
+	conn->inCursor += len;
+
+	/* the trace output bounds the print with len because s is unterminated */
+	if (conn->Pfdebug)
+		fprintf(conn->Pfdebug, "From backend (%lu)> %.*s\n",
+				(unsigned long) len, (int) len, s);
+
+	return 0;
+}
+
+/*
+ * gtmpqPutnchar:
+ * write exactly len bytes to the current message
+ */
+int
+gtmpqPutnchar(const char *s, size_t len, GTM_Conn *conn)
+{
+	/* Append the bytes as-is; no terminator is added. */
+	if (gtmpqPutMsgBytes(s, len, conn) != 0)
+		return EOF;
+
+	if (conn->Pfdebug)
+		fprintf(conn->Pfdebug, "To backend> %.*s\n", (int) len, s);
+
+	return 0;
+}
+
+/*
+ * gtmpqGetInt
+ *	read a 2 or 4 byte integer and convert from network byte order
+ *	to local byte order
+ *
+ * Returns 0 and stores the value in *result on success.  Returns EOF when
+ * fewer than "bytes" bytes are buffered past the cursor (the cursor is then
+ * left untouched), or when "bytes" is neither 2 nor 4.
+ */
+int
+gtmpqGetInt(int *result, size_t bytes, GTM_Conn *conn)
+{
+	uint16		tmp2;
+	uint32		tmp4;
+
+	switch (bytes)
+	{
+		case 2:
+			if (conn->inCursor + 2 > conn->inEnd)
+				return EOF;
+			/* memcpy avoids an unaligned read from the byte buffer */
+			memcpy(&tmp2, conn->inBuffer + conn->inCursor, 2);
+			conn->inCursor += 2;
+			*result = (int) ntohs(tmp2);
+			break;
+		case 4:
+			if (conn->inCursor + 4 > conn->inEnd)
+				return EOF;
+			memcpy(&tmp4, conn->inBuffer + conn->inCursor, 4);
+			conn->inCursor += 4;
+			*result = (int) ntohl(tmp4);
+			break;
+		default:
+
+			/*
+			 * Unsupported width.  The trace stream is optional, so it must
+			 * be checked before use; size_t is cast explicitly so that the
+			 * argument matches the conversion specifier.
+			 */
+			if (conn->Pfdebug)
+				fprintf(conn->Pfdebug,
+						"Integer size of (%lu) bytes not supported",
+						(unsigned long) bytes);
+			return EOF;
+	}
+
+	if (conn->Pfdebug)
+		fprintf(conn->Pfdebug, "From backend (#%lu)> %d\n", (unsigned long) bytes, *result);
+
+	return 0;
+}
+
+/*
+ * gtmpqPutInt
+ *	write an integer of 2 or 4 bytes, converting from host byte order
+ *	to network byte order.
+ *
+ * Returns 0 on success.  Returns EOF when the bytes cannot be appended to
+ * the current message, or when "bytes" is neither 2 nor 4.
+ */
+int
+gtmpqPutInt(int value, size_t bytes, GTM_Conn *conn)
+{
+	uint16		tmp2;
+	uint32		tmp4;
+
+	switch (bytes)
+	{
+		case 2:
+			tmp2 = htons((uint16) value);
+			if (gtmpqPutMsgBytes((const char *) &tmp2, 2, conn))
+				return EOF;
+			break;
+		case 4:
+			tmp4 = htonl((uint32) value);
+			if (gtmpqPutMsgBytes((const char *) &tmp4, 4, conn))
+				return EOF;
+			break;
+		default:
+
+			/*
+			 * Unsupported width.  The trace stream is optional, so it must
+			 * be checked before use; size_t is cast explicitly so that the
+			 * argument matches the conversion specifier.
+			 */
+			if (conn->Pfdebug)
+				fprintf(conn->Pfdebug,
+						"Integer size of (%lu) bytes not supported",
+						(unsigned long) bytes);
+			return EOF;
+	}
+
+	if (conn->Pfdebug)
+		fprintf(conn->Pfdebug, "To backend (%lu#)> %d\n", (unsigned long) bytes, value);
+
+	return 0;
+}
+
+/*
+ * Make sure conn's output buffer can hold bytes_needed bytes (caller must
+ * include already-stored data into the value!)
+ *
+ * Returns 0 on success, EOF if failed to enlarge buffer
+ */
+int
+gtmpqCheckOutBufferSpace(size_t bytes_needed, GTM_Conn *conn)
+{
+	int			candidate = conn->outBufSize;
+	char	   *grown;
+
+	/* Fast path: the buffer is already big enough. */
+	if ((size_t) candidate >= bytes_needed)
+		return 0;
+
+	/*
+	 * First strategy: keep doubling the size until it suffices.  A
+	 * non-positive candidate is taken to mean that the int arithmetic
+	 * overflowed, which ends the attempt.
+	 */
+	for (;;)
+	{
+		candidate *= 2;
+		if (candidate <= 0 || (size_t) candidate >= bytes_needed)
+			break;
+	}
+
+	if (candidate > 0 && (size_t) candidate >= bytes_needed)
+	{
+		grown = realloc(conn->outBuffer, candidate);
+		if (grown != NULL)
+		{
+			conn->outBuffer = grown;
+			conn->outBufSize = candidate;
+			return 0;
+		}
+	}
+
+	/*
+	 * Second strategy, used when doubling overflowed or realloc refused:
+	 * start over from the current size and grow in 8K steps.
+	 */
+	candidate = conn->outBufSize;
+	for (;;)
+	{
+		candidate += 8192;
+		if (candidate <= 0 || (size_t) candidate >= bytes_needed)
+			break;
+	}
+
+	if (candidate > 0 && (size_t) candidate >= bytes_needed)
+	{
+		grown = realloc(conn->outBuffer, candidate);
+		if (grown != NULL)
+		{
+			conn->outBuffer = grown;
+			conn->outBufSize = candidate;
+			return 0;
+		}
+	}
+
+	/* Neither strategy produced a buffer; the old one is still intact. */
+	printfGTMPQExpBuffer(&conn->errorMessage,
+						 "cannot allocate memory for output buffer\n");
+	return EOF;
+}
+
+/*
+ * Make sure conn's input buffer can hold bytes_needed bytes (caller must
+ * include already-stored data into the value!)
+ *
+ * Returns 0 on success, EOF if failed to enlarge buffer
+ */
+int
+gtmpqCheckInBufferSpace(size_t bytes_needed, GTM_Conn *conn)
+{
+	int			candidate = conn->inBufSize;
+	char	   *grown;
+
+	/* Fast path: the buffer is already big enough. */
+	if ((size_t) candidate >= bytes_needed)
+		return 0;
+
+	/*
+	 * First strategy: keep doubling the size until it suffices.  A
+	 * non-positive candidate is taken to mean that the int arithmetic
+	 * overflowed, which ends the attempt.
+	 */
+	for (;;)
+	{
+		candidate *= 2;
+		if (candidate <= 0 || (size_t) candidate >= bytes_needed)
+			break;
+	}
+
+	if (candidate > 0 && (size_t) candidate >= bytes_needed)
+	{
+		grown = realloc(conn->inBuffer, candidate);
+		if (grown != NULL)
+		{
+			conn->inBuffer = grown;
+			conn->inBufSize = candidate;
+			return 0;
+		}
+	}
+
+	/*
+	 * Second strategy, used when doubling overflowed or realloc refused:
+	 * start over from the current size and grow in 8K steps.
+	 */
+	candidate = conn->inBufSize;
+	for (;;)
+	{
+		candidate += 8192;
+		if (candidate <= 0 || (size_t) candidate >= bytes_needed)
+			break;
+	}
+
+	if (candidate > 0 && (size_t) candidate >= bytes_needed)
+	{
+		grown = realloc(conn->inBuffer, candidate);
+		if (grown != NULL)
+		{
+			conn->inBuffer = grown;
+			conn->inBufSize = candidate;
+			return 0;
+		}
+	}
+
+	/* Neither strategy produced a buffer; the old one is still intact. */
+	printfGTMPQExpBuffer(&conn->errorMessage,
+						 "cannot allocate memory for input buffer\n");
+	return EOF;
+}
+
+/*
+ * gtmpqPutMsgStart: begin construction of a message to the server
+ *
+ * msg_type is the message type byte, or 0 for a message without type byte
+ * (only startup messages have no type byte)
+ *
+ * force_len requests a 4-byte length word in the message header; when it is
+ * false no length word is reserved and outMsgStart is set to -1.
+ *
+ * Returns 0 on success, EOF on error
+ *
+ * The idea here is that we construct the message in conn->outBuffer,
+ * beginning just past any data already in outBuffer (ie, at
+ * outBuffer+outCount). We enlarge the buffer as needed to hold the message.
+ * When the message is complete, we fill in the length word (if needed) and
+ * then advance outCount past the message, making it eligible to send.
+ *
+ * The state variable conn->outMsgStart points to the incomplete message's
+ * length word: it is either outCount or outCount+1 depending on whether
+ * there is a type byte. If we are sending a message without length word
+ * (pre protocol 3.0 only), then outMsgStart is -1. The state variable
+ * conn->outMsgEnd is the end of the data collected so far.
+ */
+int
+gtmpqPutMsgStart(char msg_type, bool force_len, GTM_Conn *conn)
+{
+	int			hdrEnd = conn->outCount;	/* offset just past the header */
+	int			lenAt = -1;		/* offset of the length word, -1 if none */
+
+	/* One byte for the type code, when there is one. */
+	if (msg_type)
+		hdrEnd++;
+
+	/* Four bytes for the length word, when one was requested. */
+	if (force_len)
+	{
+		lenAt = hdrEnd;
+		hdrEnd += 4;
+	}
+
+	/* The buffer must be able to hold at least the header. */
+	if (gtmpqCheckOutBufferSpace(hdrEnd, conn))
+		return EOF;
+
+	if (msg_type)
+		conn->outBuffer[conn->outCount] = msg_type;
+
+	/* Record the layout; gtmpqPutMsgEnd fills in the length word later. */
+	conn->outMsgStart = lenAt;
+	conn->outMsgEnd = hdrEnd;
+
+	if (conn->Pfdebug)
+		fprintf(conn->Pfdebug, "To backend> Msg %c\n",
+				msg_type ? msg_type : ' ');
+
+	return 0;
+}
+
+/*
+ * gtmpqPutMsgBytes: add bytes to a partially-constructed message
+ *
+ * Returns 0 on success, EOF on error
+ */
+static int
+gtmpqPutMsgBytes(const void *buf, size_t len, GTM_Conn *conn)
+{
+	/* Grow the output buffer if the payload would not fit. */
+	if (gtmpqCheckOutBufferSpace(conn->outMsgEnd + len, conn) != 0)
+		return EOF;
+
+	/* Append the payload and move the end-of-message marker past it. */
+	memcpy(conn->outBuffer + conn->outMsgEnd, buf, len);
+	conn->outMsgEnd += len;
+
+	/* Tracing is left to the caller. */
+	return 0;
+}
+
+/*
+ * gtmpqPutMsgEnd: finish constructing a message and possibly send it
+ *
+ * Returns 0 on success, EOF on error
+ *
+ * We don't actually send anything here unless we've accumulated at least
+ * 8K worth of data (the typical size of a pipe buffer on Unix systems).
+ * This avoids sending small partial packets. The caller must use gtmpqFlush
+ * when it's important to flush all the data out to the server.
+ */
+int
+gtmpqPutMsgEnd(GTM_Conn *conn)
+{
+	if (conn->Pfdebug)
+		fprintf(conn->Pfdebug, "To backend> Msg complete, length %u\n",
+				conn->outMsgEnd - conn->outCount);
+
+	/* Patch in the length word, in network byte order, if one was reserved. */
+	if (conn->outMsgStart >= 0)
+	{
+		uint32		netLen = htonl((uint32) (conn->outMsgEnd - conn->outMsgStart));
+
+		memcpy(conn->outBuffer + conn->outMsgStart, &netLen, 4);
+	}
+
+	/* The finished message now counts as pending output. */
+	conn->outCount = conn->outMsgEnd;
+
+	/*
+	 * Once 8K or more has piled up, push out as many whole 8K blocks as
+	 * there are.  A partial send is not an error here; only a hard failure
+	 * is reported.
+	 */
+	if (conn->outCount >= 8192)
+	{
+		int			wholeBlocks = (conn->outCount / 8192) * 8192;
+
+		if (gtmpqSendSome(conn, wholeBlocks) < 0)
+			return EOF;
+	}
+
+	return 0;
+}
+
+/* ----------
+ * gtmpqReadData: read more data, if any is available
+ * Possible return values:
+ * 1: successfully loaded at least one more byte
+ * 0: no data is presently available, but no error detected
+ * -1: error detected (including EOF = connection closure);
+ * conn->errorMessage set
+ * NOTE: callers must not assume that pointers or indexes into conn->inBuffer
+ * remain valid across this call!
+ * ----------
+ */
+int
+gtmpqReadData(GTM_Conn *conn)
+{
+ /* someread is set to 1 once the first recv() loop has stored any bytes */
+ int someread = 0;
+ int nread;
+
+ if (conn->sock < 0)
+ {
+ printfGTMPQExpBuffer(&conn->errorMessage,
+ "connection not open\n");
+ return -1;
+ }
+
+ /* Left-justify any data in the buffer to make room */
+ if (conn->inStart < conn->inEnd)
+ {
+ if (conn->inStart > 0)
+ {
+ /* source and destination may overlap, hence memmove */
+ memmove(conn->inBuffer, conn->inBuffer + conn->inStart,
+ conn->inEnd - conn->inStart);
+ conn->inEnd -= conn->inStart;
+ conn->inCursor -= conn->inStart;
+ conn->inStart = 0;
+ }
+ }
+ else
+ {
+ /* buffer is logically empty, reset it */
+ conn->inStart = conn->inCursor = conn->inEnd = 0;
+ }
+
+ /*
+ * If the buffer is fairly full, enlarge it. We need to be able to enlarge
+ * the buffer in case a single message exceeds the initial buffer size. We
+ * enlarge before filling the buffer entirely so as to avoid asking the
+ * kernel for a partial packet. The magic constant here should be large
+ * enough for a TCP packet or Unix pipe bufferload. 8K is the usual pipe
+ * buffer size, so...
+ */
+ if (conn->inBufSize - conn->inEnd < 8192)
+ {
+ if (gtmpqCheckInBufferSpace(conn->inEnd + (size_t) 8192, conn))
+ {
+ /*
+ * We don't insist that the enlarge worked, but we need some room
+ */
+ if (conn->inBufSize - conn->inEnd < 100)
+ return -1; /* errorMessage already set */
+ }
+ }
+
+ /* OK, try to read some data */
+retry3:
+ nread = recv(conn->sock, conn->inBuffer + conn->inEnd,
+ conn->inBufSize - conn->inEnd, 0);
+ if (nread < 0)
+ {
+ /* interrupted by a signal: simply try again */
+ if (SOCK_ERRNO == EINTR)
+ goto retry3;
+ /* Some systems return EAGAIN/EWOULDBLOCK for no data */
+#ifdef EAGAIN
+ if (SOCK_ERRNO == EAGAIN)
+ return someread;
+#endif
+#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN))
+ if (SOCK_ERRNO == EWOULDBLOCK)
+ return someread;
+#endif
+ /* We might get ECONNRESET here if using TCP and backend died */
+#ifdef ECONNRESET
+ if (SOCK_ERRNO == ECONNRESET)
+ goto definitelyFailed;
+#endif
+ /* any other errno: report failure but leave the socket open */
+ printfGTMPQExpBuffer(&conn->errorMessage,
+ "could not receive data from server:\n");
+ return -1;
+ }
+ if (nread > 0)
+ {
+ conn->inEnd += nread;
+
+ /*
+ * Hack to deal with the fact that some kernels will only give us back
+ * 1 packet per recv() call, even if we asked for more and there is
+ * more available. If it looks like we are reading a long message,
+ * loop back to recv() again immediately, until we run out of data or
+ * buffer space. Without this, the block-and-restart behavior of
+ * libpq's higher levels leads to O(N^2) performance on long messages.
+ *
+ * Since we left-justified the data above, conn->inEnd gives the
+ * amount of data already read in the current message. We consider
+ * the message "long" once we have acquired 32k ...
+ */
+ if (conn->inEnd > 32768 &&
+ (conn->inBufSize - conn->inEnd) >= 8192)
+ {
+ someread = 1;
+ goto retry3;
+ }
+ return 1;
+ }
+
+ /* nread == 0 from here on */
+ if (someread)
+ return 1; /* got a zero read after successful tries */
+
+ /*
+ * A return value of 0 could mean just that no data is now available, or
+ * it could mean EOF --- that is, the server has closed the connection.
+ * Since we have the socket in nonblock mode, the only way to tell the
+ * difference is to see if select() is saying that the file is ready.
+ * Grumble. Fortunately, we don't expect this path to be taken much,
+ * since in normal practice we should not be trying to read data unless
+ * the file selected for reading already.
+ *
+ * In SSL mode it's even worse: SSL_read() could say WANT_READ and then
+ * data could arrive before we make the gtmpqReadReady() test. So we must
+ * play dumb and assume there is more data, relying on the SSL layer to
+ * detect true EOF.
+ */
+
+#ifdef USE_SSL
+ if (conn->ssl)
+ return 0;
+#endif
+
+ switch (gtmpqReadReady(conn))
+ {
+ case 0:
+ /* definitely no data available */
+ return 0;
+ case 1:
+ /* ready for read */
+ break;
+ default:
+ /* the readiness check itself failed */
+ goto definitelyFailed;
+ }
+
+ /*
+ * Still not sure that it's EOF, because some data could have just
+ * arrived.
+ */
+retry4:
+ nread = recv(conn->sock, conn->inBuffer + conn->inEnd,
+ conn->inBufSize - conn->inEnd, 0);
+ if (nread < 0)
+ {
+ /* interrupted by a signal: simply try again */
+ if (SOCK_ERRNO == EINTR)
+ goto retry4;
+ /* Some systems return EAGAIN/EWOULDBLOCK for no data */
+#ifdef EAGAIN
+ if (SOCK_ERRNO == EAGAIN)
+ return 0;
+#endif
+#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN))
+ if (SOCK_ERRNO == EWOULDBLOCK)
+ return 0;
+#endif
+ /* We might get ECONNRESET here if using TCP and backend died */
+#ifdef ECONNRESET
+ if (SOCK_ERRNO == ECONNRESET)
+ goto definitelyFailed;
+#endif
+ /* any other errno: report failure but leave the socket open */
+ printfGTMPQExpBuffer(&conn->errorMessage,
+ "could not receive data from server: \n");
+ return -1;
+ }
+ if (nread > 0)
+ {
+ conn->inEnd += nread;
+ return 1;
+ }
+
+ /*
+ * OK, we are getting a zero read even though select() says ready. This
+ * means the connection has been closed. Cope.
+ */
+definitelyFailed:
+ printfGTMPQExpBuffer(&conn->errorMessage,
+ "server closed the connection unexpectedly\n"
+ "\tThis probably means the server terminated abnormally\n"
+ "\tbefore or while processing the request.\n");
+ conn->status = CONNECTION_BAD; /* No more connection to backend */
+ /* this is the only path in this function that closes the socket */
+ close(conn->sock);
+ conn->sock = -1;
+
+ return -1;
+}
+
+/*
+ * gtmpqSendSome: send data waiting in the output buffer.
+ *
+ * len is how much to try to send (typically equal to outCount, but may
+ * be less).
+ *
+ * Return 0 on success, -1 on failure and 1 when not all data could be sent
+ * because the socket would block and the connection is non-blocking.
+ */
+static int
+gtmpqSendSome(GTM_Conn *conn, int len)
+{
+ /* ptr walks through outBuffer; remaining counts bytes not yet sent */
+ char *ptr = conn->outBuffer;
+ int remaining = conn->outCount;
+ int result = 0;
+
+ if (conn->sock < 0)
+ {
+ printfGTMPQExpBuffer(&conn->errorMessage,
+ "connection not open\n");
+ return -1;
+ }
+
+ /* while there's still data to send */
+ while (len > 0)
+ {
+ int sent;
+
+ sent = send(conn->sock, ptr, len, 0);
+
+ if (sent < 0)
+ {
+ /*
+ * Anything except EAGAIN/EWOULDBLOCK/EINTR is trouble. If it's
+ * EPIPE or ECONNRESET, assume we've lost the backend connection
+ * permanently.
+ */
+ switch (SOCK_ERRNO)
+ {
+ /*
+ * The "break"s below leave only the switch; the "len > 0" test
+ * after it is what then ends the loop with result = 1.
+ */
+#ifdef EAGAIN
+ case EAGAIN:
+ break;
+#endif
+#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN))
+ case EWOULDBLOCK:
+ break;
+#endif
+ case EINTR:
+ /* "continue" restarts the while loop and retries the send */
+ continue;
+
+ case EPIPE:
+#ifdef ECONNRESET
+ case ECONNRESET:
+#endif
+ printfGTMPQExpBuffer(&conn->errorMessage,
+ "server closed the connection unexpectedly\n"
+ "\tThis probably means the server terminated abnormally\n"
+ "\tbefore or while processing the request.\n");
+
+ /*
+ * We used to close the socket here, but that's a bad idea
+ * since there might be unread data waiting (typically, a
+ * NOTICE message from the backend telling us it's
+ * committing hara-kiri...). Leave the socket open until
+ * gtmpqReadData finds no more data can be read. But abandon
+ * attempt to send data.
+ */
+ conn->outCount = 0;
+ return -1;
+
+ default:
+ printfGTMPQExpBuffer(&conn->errorMessage,
+ "could not send data to server: \n");
+ /* We don't assume it's a fatal error... */
+ /* ...but the pending output is discarded all the same */
+ conn->outCount = 0;
+ return -1;
+ }
+ }
+ else
+ {
+ /* some or all of the request went out: advance past it */
+ ptr += sent;
+ len -= sent;
+ remaining -= sent;
+ }
+
+ if (len > 0)
+ {
+ /*
+ * We didn't send it all, wait till we can send more.
+ *
+ * If the connection is in non-blocking mode we don't wait, but
+ * return 1 to indicate that data is still pending.
+ */
+ /* NOTE(review): no wait happens here; the loop always exits */
+ result = 1;
+ break;
+ }
+ }
+
+ /* shift the remaining contents of the buffer */
+ if (remaining > 0)
+ memmove(conn->outBuffer, ptr, remaining);
+ conn->outCount = remaining;
+
+ return result;
+}
+
+
+/*
+ * gtmpqFlush: send any data waiting in the output buffer
+ *
+ * Return 0 on success, -1 on failure and 1 when not all data could be sent
+ * because the socket would block and the connection is non-blocking.
+ */
+int
+gtmpqFlush(GTM_Conn *conn)
+{
+	/* Keep the trace stream in step with what is about to go out. */
+	if (conn->Pfdebug)
+		fflush(conn->Pfdebug);
+
+	/* Nothing pending means nothing to do. */
+	if (conn->outCount <= 0)
+		return 0;
+
+	return gtmpqSendSome(conn, conn->outCount);
+}
+
+
+/*
+ * gtmpqWait: wait until we can read or write the connection socket
+ *
+ * JAB: If SSL enabled and used and forRead, buffered bytes short-circuit the
+ * call to select().
+ *
+ * We also stop waiting and return if the kernel flags an exception condition
+ * on the socket. The actual error condition will be detected and reported
+ * when the caller tries to read or write the socket.
+ */
+int
+gtmpqWait(int forRead, int forWrite, GTM_Conn *conn)
+{
+	/* (time_t) -1 is the "no deadline" value understood by gtmpqWaitTimed */
+	time_t		no_deadline = (time_t) -1;
+
+	return gtmpqWaitTimed(forRead, forWrite, conn, no_deadline);
+}
+
+/*
+ * gtmpqWaitTimed: wait, but not past finish_time.
+ *
+ * If finish_time is exceeded then we return failure (EOF). This is like
+ * the response for a kernel exception because we don't want the caller
+ * to try to read/write in that case.
+ *
+ * finish_time = ((time_t) -1) disables the wait limit.
+ */
+int
+gtmpqWaitTimed(int forRead, int forWrite, GTM_Conn *conn, time_t finish_time)
+{
+	int			ready = gtmpqSocketCheck(conn, forRead, forWrite, finish_time);
+
+	/* A positive result means the socket is usable. */
+	if (ready > 0)
+		return 0;
+
+	/*
+	 * Zero means the deadline passed, which is reported here.  A negative
+	 * result is a failure for which gtmpqSocketCheck has already set
+	 * errorMessage.
+	 */
+	if (ready == 0)
+		printfGTMPQExpBuffer(&conn->errorMessage,
+							 "timeout expired\n");
+
+	return EOF;
+}
+
+/*
+ * gtmpqReadReady: is select() saying the file is ready to read?
+ * Returns -1 on failure, 0 if not ready, 1 if ready.
+ */
+int
+gtmpqReadReady(GTM_Conn *conn)
+{
+	/* An end time of zero makes the check return without blocking. */
+	time_t		no_wait = (time_t) 0;
+
+	return gtmpqSocketCheck(conn, 1, 0, no_wait);
+}
+
+/*
+ * gtmpqWriteReady: is select() saying the file is ready to write?
+ * Returns -1 on failure, 0 if not ready, 1 if ready.
+ */
+int
+gtmpqWriteReady(GTM_Conn *conn)
+{
+	/* An end time of zero makes the check return without blocking. */
+	time_t		no_wait = (time_t) 0;
+
+	return gtmpqSocketCheck(conn, 0, 1, no_wait);
+}
+
+/*
+ * Checks a socket, using poll or select, for data to be read, written,
+ * or both. Returns >0 if one or more conditions are met, 0 if it timed
+ * out, -1 if an error occurred.
+ *
+ * If SSL is in use, the SSL buffer is checked prior to checking the socket
+ * for read data directly.
+ */
+static int
+gtmpqSocketCheck(GTM_Conn *conn, int forRead, int forWrite, time_t end_time)
+{
+	int			rc;
+
+	if (conn == NULL)
+		return -1;
+
+	if (conn->sock < 0)
+	{
+		printfGTMPQExpBuffer(&conn->errorMessage,
+							 "socket not open\n");
+		return -1;
+	}
+
+#ifdef USE_SSL
+	/* Bytes already buffered by the SSL library count as readable. */
+	if (forRead && conn->ssl && SSL_pending(conn->ssl) > 0)
+		return 1;
+#endif
+
+	/* Poll the socket, starting over whenever a signal interrupts the call. */
+	for (;;)
+	{
+		rc = gtmpqSocketPoll(conn->sock, forRead, forWrite, end_time);
+		if (rc >= 0 || SOCK_ERRNO != EINTR)
+			break;
+	}
+
+	if (rc < 0)
+		printfGTMPQExpBuffer(&conn->errorMessage,
+							 "select() failed: \n");
+
+	return rc;
+}
+
+
+/*
+ * Check a file descriptor for read and/or write data, possibly waiting.
+ * If neither forRead nor forWrite are set, immediately return a timeout
+ * condition (without waiting). Return >0 if condition is met, 0
+ * if a timeout occurred, -1 if an error or interrupt occurred.
+ *
+ * Timeout is infinite if end_time is -1. Timeout is immediate (no blocking)
+ * if end_time is 0 (or indeed, any time before now).
+ */
+static int
+gtmpqSocketPoll(int sock, int forRead, int forWrite, time_t end_time)
+{
+	/* poll(2) is used when available, select(2) otherwise */
+#ifdef HAVE_POLL
+	struct pollfd pfd;
+	int			wait_ms;
+
+	/* Nothing to wait for: report an immediate timeout. */
+	if (!(forRead || forWrite))
+		return 0;
+
+	pfd.fd = sock;
+	pfd.revents = 0;
+	pfd.events = POLLERR | (forRead ? POLLIN : 0) | (forWrite ? POLLOUT : 0);
+
+	/* Turn the absolute end time into a relative timeout in milliseconds. */
+	if (end_time == ((time_t) -1))
+		wait_ms = -1;			/* wait indefinitely */
+	else
+	{
+		time_t		now = time(NULL);
+
+		wait_ms = (end_time > now) ? (end_time - now) * 1000 : 0;
+	}
+
+	return poll(&pfd, 1, wait_ms);
+#else							/* !HAVE_POLL */
+
+	fd_set		readable;
+	fd_set		writable;
+	fd_set		exceptional;
+	struct timeval wait_tv;
+	struct timeval *wait_ptr = NULL;	/* NULL means wait indefinitely */
+
+	/* Nothing to wait for: report an immediate timeout. */
+	if (!(forRead || forWrite))
+		return 0;
+
+	FD_ZERO(&readable);
+	FD_ZERO(&writable);
+	FD_ZERO(&exceptional);
+
+	if (forRead)
+		FD_SET(sock, &readable);
+	if (forWrite)
+		FD_SET(sock, &writable);
+	/* exceptional conditions are always of interest */
+	FD_SET(sock, &exceptional);
+
+	/* Turn the absolute end time into a relative timeout in whole seconds. */
+	if (end_time != ((time_t) -1))
+	{
+		time_t		now = time(NULL);
+
+		if (end_time > now)
+			wait_tv.tv_sec = end_time - now;
+		else
+			wait_tv.tv_sec = 0;
+		wait_tv.tv_usec = 0;
+		wait_ptr = &wait_tv;
+	}
+
+	return select(sock + 1, &readable, &writable,
+				  &exceptional, wait_ptr);
+#endif   /* HAVE_POLL */
+}
diff --git a/src/gtm/client/fe-protocol.c b/src/gtm/client/fe-protocol.c
new file mode 100644
index 0000000000..f3960daeaa
--- /dev/null
+++ b/src/gtm/client/fe-protocol.c
@@ -0,0 +1,598 @@
+/*-------------------------------------------------------------------------
+ *
+ * fe-protocol.c
+ * functions that are specific to frontend/backend protocol version 3
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "gtm/gtm_c.h"
+
+#include <ctype.h>
+#include <fcntl.h>
+
+#include "gtm/libpq-fe.h"
+#include "gtm/libpq-int.h"
+#include "gtm/gtm_client.h"
+
+#include <unistd.h>
+#include <netinet/in.h>
+
+
+/*
+ * This macro lists the backend message types that could be "long" (more
+ * than a couple of kilobytes).
+ */
+#define VALID_LONG_MESSAGE_TYPE(id) \
+ ((id) == 'S' || (id) == 'E')
+
+static void handleSyncLoss(GTM_Conn *conn, char id, int msgLength);
+static GTM_Result *pqParseInput(GTM_Conn *conn);
+static int gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result);
+static int gtmpqReadSeqKey(GTM_SequenceKey seqkey, GTM_Conn *conn);
+
+/*
+ * pqParseInput: parse one complete message from the connection's input
+ * buffer, if one is available.
+ *
+ * The message is decoded into conn->result, which is allocated on first use
+ * and reused on later calls (after gtmpqFreeResultData has released the data
+ * of the previous result).  This function never reads from the socket; it
+ * only looks at the bytes buffered between conn->inStart and conn->inEnd.
+ *
+ * Returns conn->result once a whole message has been consumed.  Returns NULL
+ * when the message is not completely buffered yet, when the type-specific
+ * parser reports a failure, or when the connection had to be abandoned
+ * (loss of message sync, or no memory for the result structure).
+ */
+static GTM_Result *
+pqParseInput(GTM_Conn *conn)
+{
+	char		id;
+	int			msgLength;
+	int			avail;
+	GTM_Result *result = NULL;
+
+	if (conn->result == NULL)
+	{
+		conn->result = (GTM_Result *) malloc(sizeof (GTM_Result));
+		if (conn->result == NULL)
+		{
+			/*
+			 * Nothing can be parsed without a result structure.  Abandon the
+			 * connection exactly as handleSyncLoss does rather than passing
+			 * a NULL pointer to memset below.
+			 */
+			printfGTMPQExpBuffer(&conn->errorMessage, "out of memory\n");
+			close(conn->sock);
+			conn->sock = -1;
+			conn->status = CONNECTION_BAD;
+			return NULL;
+		}
+		memset(conn->result, 0, sizeof (GTM_Result));
+	}
+	else
+		gtmpqFreeResultData(conn->result, conn->is_proxy);
+
+	result = conn->result;
+
+	/*
+	 * Try to read a message.  First get the type code and length.  Return
+	 * if not enough data.
+	 */
+	conn->inCursor = conn->inStart;
+	if (gtmpqGetc(&id, conn))
+		return NULL;
+	if (gtmpqGetInt(&msgLength, 4, conn))
+		return NULL;
+
+	/*
+	 * Try to validate message type/length here.  A length less than 4 is
+	 * definitely broken.  Large lengths should only be believed for a few
+	 * message types.
+	 */
+	if (msgLength < 4)
+	{
+		handleSyncLoss(conn, id, msgLength);
+		return NULL;
+	}
+	if (msgLength > 30000 && !VALID_LONG_MESSAGE_TYPE(id))
+	{
+		handleSyncLoss(conn, id, msgLength);
+		return NULL;
+	}
+
+	/*
+	 * The length word counts itself; from here on msgLength is the size of
+	 * the body only.  Can't process if the body isn't all here yet.
+	 */
+	conn->result->gr_msglen = msgLength -= 4;
+	avail = conn->inEnd - conn->inCursor;
+	if (avail < msgLength)
+	{
+		/*
+		 * Before returning, enlarge the input buffer if needed to hold
+		 * the whole message.  This is better than leaving it to
+		 * gtmpqReadData because we can avoid multiple cycles of realloc()
+		 * when the message is large; also, we can implement a reasonable
+		 * recovery strategy if we are unable to make the buffer big
+		 * enough.
+		 */
+		if (gtmpqCheckInBufferSpace(conn->inCursor + (size_t) msgLength,
+									conn))
+		{
+			/*
+			 * XXX add some better recovery code... plan is to skip over
+			 * the message using its length, then report an error.  For the
+			 * moment, just treat this like loss of sync (which indeed it
+			 * might be!)
+			 */
+			handleSyncLoss(conn, id, msgLength);
+		}
+		return NULL;
+	}
+
+	switch (id)
+	{
+		case 'S':				/* command complete */
+			if (gtmpqParseSuccess(conn, result))
+				return NULL;
+			break;
+
+		case 'E':				/* error return */
+			if (gtmpqGetError(conn, result))
+				return NULL;
+			result->gr_status = -1;
+			break;
+
+		default:
+			printfGTMPQExpBuffer(&conn->errorMessage,
+								 "unexpected response from server; first received character was \"%c\"\n",
+								 id);
+			/* skip the body of the unknown message */
+			conn->inCursor += msgLength;
+			break;
+	}							/* switch on protocol character */
+
+	/* 5 = one type byte plus the four-byte length word */
+	if (conn->inCursor == conn->inStart + 5 + msgLength)
+	{
+		/* Normal case: parsing agrees with specified length */
+		conn->inStart = conn->inCursor;
+	}
+	else
+	{
+		/* Trouble --- report it */
+		printfGTMPQExpBuffer(&conn->errorMessage,
+							 "message contents do not agree with length in message type \"%c\"\n",
+							 id);
+		/* trust the specified message length as what to skip */
+		conn->inStart += 5 + msgLength;
+	}
+
+	return result;
+}
+
+/*
+ * handleSyncLoss: give up on a connection whose message framing can no
+ * longer be trusted.
+ *
+ * Records the offending message type and length in conn->errorMessage,
+ * closes the socket and marks the connection as bad.
+ */
+static void
+handleSyncLoss(GTM_Conn *conn, char id, int msgLength)
+{
+	printfGTMPQExpBuffer(&conn->errorMessage,
+						 "lost synchronization with server: got message type \"%c\", length %d\n",
+						 id, msgLength);
+
+	/* Drop the socket; the connection is unusable from here on */
+	close(conn->sock);
+	conn->sock = -1;
+	conn->status = CONNECTION_BAD;
+}
+
+/*
+ * gtmpqGetError: read the body of an 'E' (error) response.
+ *
+ * Entry: the 'E' message type and length have already been consumed and
+ * result->gr_msglen holds the number of body bytes.
+ *
+ * On a GTM proxy connection the body is a GTM_ProxyMsgHeader followed by
+ * opaque data, which is stored in result->gr_proxy_data (grown on demand).
+ * Otherwise the body is a list of fields, each a one-byte code followed by
+ * a string, ended by a zero byte; the strings are read into
+ * conn->errorMessage.
+ *
+ * Returns 0 if the message was consumed.  Returns non-zero on failure: 1 on
+ * the proxy path (short read or out of memory), EOF on the field path.
+ */
+int
+gtmpqGetError(GTM_Conn *conn, GTM_Result *result)
+{
+	char		id;
+
+	/*
+	 * If we are a GTM proxy, expect an additional proxy header in the
+	 * incoming message.
+	 */
+	if (conn->is_proxy)
+	{
+		if (gtmpqGetnchar((char *) &result->gr_proxyhdr,
+						  sizeof (GTM_ProxyMsgHeader), conn))
+			return 1;
+		result->gr_msglen -= sizeof (GTM_ProxyMsgHeader);
+
+		/*
+		 * Grow the proxy buffer if it cannot hold the remaining data.
+		 *
+		 * Since the client side code is shared between the proxy and the
+		 * backend, we don't want any memory context management here, so
+		 * plain realloc is used.  The buffer is kept for reuse by later
+		 * messages.
+		 */
+		if (result->gr_proxy_datalen < result->gr_msglen)
+		{
+			char	   *newbuf;
+
+			/* keep the old pointer until we know realloc succeeded */
+			newbuf = (char *) realloc(result->gr_proxy_data,
+									  result->gr_msglen);
+			if (newbuf == NULL)
+			{
+				printfGTMPQExpBuffer(&conn->errorMessage, "out of memory\n");
+				result->gr_status = 1;
+				return 1;
+			}
+			result->gr_proxy_data = newbuf;
+			result->gr_proxy_datalen = result->gr_msglen;
+		}
+
+		if (gtmpqGetnchar((char *) result->gr_proxy_data,
+						  result->gr_msglen, conn))
+		{
+			result->gr_status = 1;
+			return 1;
+		}
+
+		return 0;
+	}
+
+	result->gr_proxyhdr.ph_conid = InvalidGTMProxyConnID;
+
+	/*
+	 * Read the fields: a code byte, then a string, until the zero
+	 * terminator is seen.
+	 */
+	for (;;)
+	{
+		if (gtmpqGetc(&id, conn))
+			return EOF;
+		if (id == '\0')
+			break;
+		if (gtmpqGets(&conn->errorMessage, conn))
+			return EOF;
+	}
+
+	return 0;
+}
+
+/*
+ * GTMPQgetResult
+ *	  Return the next GTM_Result parsed from the connection.
+ *
+ * Keeps flushing pending output and reading more input until pqParseInput
+ * yields a result.  Returns NULL if conn is NULL, or if flushing, waiting
+ * or reading fails.
+ */
+
+GTM_Result *
+GTMPQgetResult(GTM_Conn *conn)
+{
+	GTM_Result *res;
+
+	if (conn == NULL)
+		return NULL;
+
+	for (;;)
+	{
+		int			pending;
+
+		/* Parse whatever is already buffered */
+		res = pqParseInput(conn);
+		if (res != NULL)
+			break;
+
+		/*
+		 * Push out any unsent data first; otherwise we might wait for the
+		 * answer to a command the server has not even received.
+		 */
+		for (;;)
+		{
+			pending = gtmpqFlush(conn);
+			if (pending <= 0)
+				break;
+			if (gtmpqWait(false, true, conn))
+			{
+				pending = -1;
+				break;
+			}
+		}
+
+		/*
+		 * Then wait for more input and load it.  On failure
+		 * conn->errorMessage has been set by gtmpqWait or gtmpqReadData.
+		 */
+		if (pending != 0)
+			return NULL;
+		if (gtmpqWait(true, false, conn))
+			return NULL;
+		if (gtmpqReadData(conn) < 0)
+			return NULL;
+	}
+
+	return res;
+}
+
+/*
+ * gtmpqParseSuccess: decode the body of an 'S' (success) message.
+ *
+ * Entry: the message type byte and length word have been consumed and
+ * result->gr_msglen holds the number of body bytes.
+ *
+ * The body starts with a 4-byte result type.  On a GTM proxy connection a
+ * GTM_ProxyMsgHeader follows; if it names a valid proxy connection the rest
+ * of the body is stored verbatim in result->gr_proxy_data so it can be
+ * forwarded.  Otherwise the body is decoded according to the result type.
+ *
+ * Returns 0 on success and non-zero on failure.  result->gr_status carries
+ * the same value, except that a failure to read the result type or the
+ * proxy header returns 1 with gr_status still 0.
+ */
+static int
+gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result)
+{
+	int			xcnt,
+				xsize;
+	GlobalTransactionId *xip = NULL;
+
+	result->gr_status = 0;
+
+	if (gtmpqGetInt((int *) &result->gr_type, 4, conn))
+		return 1;
+	result->gr_msglen -= 4;
+
+	if (conn->is_proxy)
+	{
+		if (gtmpqGetnchar((char *) &result->gr_proxyhdr,
+						  sizeof (GTM_ProxyMsgHeader), conn))
+			return 1;
+		result->gr_msglen -= sizeof (GTM_ProxyMsgHeader);
+	}
+	else
+		result->gr_proxyhdr.ph_conid = InvalidGTMProxyConnID;
+
+	/*
+	 * If we are dealing with a proxied message, just read the remaining
+	 * binary data which can then be forwarded to the right backend.
+	 */
+	if (result->gr_proxyhdr.ph_conid != InvalidGTMProxyConnID)
+	{
+		/*
+		 * Grow the proxy buffer if it cannot hold the proxied data.
+		 *
+		 * Since the client side code is shared between the proxy and the
+		 * backend, we don't want any memory context management here, so
+		 * plain realloc is used.  The buffer is kept for reuse by later
+		 * messages.
+		 */
+		if (result->gr_proxy_datalen < result->gr_msglen)
+		{
+			char	   *newbuf;
+
+			/* keep the old pointer until we know realloc succeeded */
+			newbuf = (char *) realloc(result->gr_proxy_data,
+									  result->gr_msglen);
+			if (newbuf == NULL)
+			{
+				printfGTMPQExpBuffer(&conn->errorMessage, "out of memory\n");
+				result->gr_status = 1;
+				return 1;
+			}
+			result->gr_proxy_data = newbuf;
+			result->gr_proxy_datalen = result->gr_msglen;
+		}
+
+		if (gtmpqGetnchar((char *) result->gr_proxy_data,
+						  result->gr_msglen, conn))
+		{
+			result->gr_status = 1;
+			return 1;
+		}
+
+		return result->gr_status;
+	}
+
+	result->gr_status = 0;
+
+	switch (result->gr_type)
+	{
+		case TXN_BEGIN_RESULT:
+			if (gtmpqGetnchar((char *) &result->gr_resdata.grd_txnhandle,
+							  sizeof (GTM_TransactionHandle), conn))
+				result->gr_status = -1;
+			break;
+
+		case TXN_BEGIN_GETGXID_RESULT:
+		case TXN_BEGIN_GETGXID_AUTOVACUUM_RESULT:
+		case TXN_PREPARE_RESULT:
+			if (gtmpqGetnchar((char *) &result->gr_resdata.grd_gxid,
+							  sizeof (GlobalTransactionId), conn))
+				result->gr_status = -1;
+			break;
+
+		case TXN_COMMIT_RESULT:
+		case TXN_ROLLBACK_RESULT:
+			if (gtmpqGetnchar((char *) &result->gr_resdata.grd_gxid,
+							  sizeof (GlobalTransactionId), conn))
+				result->gr_status = -1;
+			break;
+
+		case TXN_GET_GXID_RESULT:
+			if (gtmpqGetnchar((char *) &result->gr_resdata.grd_txn.txnhandle,
+							  sizeof (GTM_TransactionHandle), conn))
+			{
+				result->gr_status = -1;
+				break;
+			}
+
+			if (gtmpqGetnchar((char *) &result->gr_resdata.grd_txn.gxid,
+							  sizeof (GlobalTransactionId), conn))
+				result->gr_status = -1;
+			break;
+
+		case TXN_BEGIN_GETGXID_MULTI_RESULT:
+			if (gtmpqGetnchar((char *) &result->gr_resdata.grd_txn_get_multi.txn_count,
+							  sizeof (int), conn))
+			{
+				result->gr_status = -1;
+				break;
+			}
+			if (gtmpqGetnchar((char *) &result->gr_resdata.grd_txn_get_multi.start_gxid,
+							  sizeof (GlobalTransactionId), conn))
+				result->gr_status = -1;
+			break;
+
+		case TXN_COMMIT_MULTI_RESULT:
+		case TXN_ROLLBACK_MULTI_RESULT:
+			if (gtmpqGetnchar((char *) &result->gr_resdata.grd_txn_rc_multi.txn_count,
+							  sizeof (int), conn))
+			{
+				result->gr_status = -1;
+				break;
+			}
+
+			/*
+			 * NOTE(review): txn_count comes from the wire and is not checked
+			 * against the capacity of status[] here -- confirm that the
+			 * sender bounds it.
+			 */
+			if (gtmpqGetnchar((char *) result->gr_resdata.grd_txn_rc_multi.status,
+							  sizeof (int) * result->gr_resdata.grd_txn_rc_multi.txn_count, conn))
+				result->gr_status = -1;
+			break;
+
+		case SNAPSHOT_GXID_GET_RESULT:
+			if (gtmpqGetnchar((char *) &result->gr_resdata.grd_txn.txnhandle,
+							  sizeof (GTM_TransactionHandle), conn))
+			{
+				result->gr_status = -1;
+				break;
+			}
+			/* Fall through */
+		case SNAPSHOT_GET_RESULT:
+			if (gtmpqGetnchar((char *) &result->gr_resdata.grd_txn.gxid,
+							  sizeof (GlobalTransactionId), conn))
+			{
+				result->gr_status = -1;
+				break;
+			}
+			/* Fall through */
+		case SNAPSHOT_GET_MULTI_RESULT:
+			if (gtmpqGetnchar((char *) &result->gr_resdata.grd_txn_snap_multi.txn_count,
+							  sizeof (int), conn))
+			{
+				result->gr_status = -1;
+				break;
+			}
+
+			/*
+			 * NOTE(review): as above, txn_count is not bounded against the
+			 * capacity of status[] -- confirm.
+			 */
+			if (gtmpqGetnchar((char *) result->gr_resdata.grd_txn_snap_multi.status,
+							  sizeof (int) * result->gr_resdata.grd_txn_snap_multi.txn_count, conn))
+			{
+				result->gr_status = -1;
+				break;
+			}
+
+			if (gtmpqGetnchar((char *) &result->gr_snapshot.sn_xmin,
+							  sizeof (GlobalTransactionId), conn))
+			{
+				result->gr_status = -1;
+				break;
+			}
+
+			if (gtmpqGetnchar((char *) &result->gr_snapshot.sn_xmax,
+							  sizeof (GlobalTransactionId), conn))
+			{
+				result->gr_status = -1;
+				break;
+			}
+
+			if (gtmpqGetnchar((char *) &result->gr_snapshot.sn_recent_global_xmin,
+							  sizeof (GlobalTransactionId), conn))
+			{
+				result->gr_status = -1;
+				break;
+			}
+
+			/*
+			 * NOTE(review): the width passed here is
+			 * sizeof (GlobalTransactionId) although sn_xcnt is a count --
+			 * confirm it matches what the server sends.
+			 */
+			if (gtmpqGetInt(&result->gr_snapshot.sn_xcnt,
+							sizeof (GlobalTransactionId), conn))
+			{
+				result->gr_status = -1;
+				break;
+			}
+
+			xsize = result->gr_xip_size;
+			xcnt = result->gr_snapshot.sn_xcnt;
+			xip = result->gr_snapshot.sn_xip;
+
+			/* a negative count would turn into a huge allocation size */
+			if (xcnt < 0)
+			{
+				result->gr_snapshot.sn_xcnt = 0;
+				result->gr_status = -1;
+				break;
+			}
+
+			/* The xip array is kept across results and only ever grown */
+			if ((xip == NULL) || (xcnt > xsize))
+			{
+				GlobalTransactionId *new_xip;
+
+				new_xip = (GlobalTransactionId *)
+					realloc(xip, sizeof (GlobalTransactionId) * xcnt);
+
+				/*
+				 * A NULL return for a zero-sized request is not an error.
+				 * On a real failure the old array is still valid and stays
+				 * in place.
+				 */
+				if (new_xip == NULL && xcnt > 0)
+				{
+					printfGTMPQExpBuffer(&conn->errorMessage, "out of memory\n");
+					result->gr_snapshot.sn_xcnt = 0;
+					result->gr_status = -1;
+					break;
+				}
+
+				xip = new_xip;
+				result->gr_snapshot.sn_xip = xip;
+				result->gr_xip_size = xcnt;
+			}
+
+			if (gtmpqGetnchar((char *) xip, sizeof (GlobalTransactionId) * xcnt, conn))
+				result->gr_status = -1;
+			break;
+
+		case SEQUENCE_INIT_RESULT:
+		case SEQUENCE_RESET_RESULT:
+		case SEQUENCE_CLOSE_RESULT:
+			if (gtmpqReadSeqKey(&result->gr_resdata.grd_seqkey, conn))
+				result->gr_status = -1;
+			break;
+
+		case SEQUENCE_GET_CURRENT_RESULT:
+		case SEQUENCE_GET_NEXT_RESULT:
+			if (gtmpqReadSeqKey(&result->gr_resdata.grd_seq.seqkey, conn))
+			{
+				result->gr_status = -1;
+				break;
+			}
+			if (gtmpqGetnchar((char *) &result->gr_resdata.grd_seq.seqval,
+							  sizeof (GTM_Sequence), conn))
+				result->gr_status = -1;
+			break;
+
+		case TXN_GET_STATUS_RESULT:
+			break;
+
+		case TXN_GET_ALL_PREPARED_RESULT:
+			break;
+
+		default:
+			printfGTMPQExpBuffer(&conn->errorMessage,
+								 "unexpected result type from server; result typr was \"%d\"\n",
+								 result->gr_type);
+			result->gr_status = -1;
+			break;
+	}
+
+	return (result->gr_status);
+}
+
+/*
+ * gtmpqReadSeqKey: read a length-prefixed sequence key from the input
+ * buffer into *seqkey.
+ *
+ * The key bytes are stored in a freshly malloc'ed buffer owned by seqkey;
+ * gtmpqFreeResultData releases it.  On every failure path gsk_key is left
+ * NULL, so that later cleanup never frees a stale or garbage pointer.
+ *
+ * Returns 0 on success, EINVAL if the data is short or the key length is
+ * out of range, ENOMEM if the buffer cannot be allocated.
+ */
+static int
+gtmpqReadSeqKey(GTM_SequenceKey seqkey, GTM_Conn *conn)
+{
+	/* Start from a known state; an early return must not leave old bytes */
+	seqkey->gsk_key = NULL;
+
+	/*
+	 * Read keylength
+	 */
+	if (gtmpqGetInt(&seqkey->gsk_keylen, 4, conn))
+		return EINVAL;
+
+	/*
+	 * Do some sanity checks on the keylength
+	 */
+	if (seqkey->gsk_keylen <= 0 || seqkey->gsk_keylen > GTM_MAX_SEQKEY_LENGTH)
+		return EINVAL;
+
+	if ((seqkey->gsk_key = (char *) malloc(seqkey->gsk_keylen)) == NULL)
+		return ENOMEM;
+
+	if (gtmpqGetnchar(seqkey->gsk_key, seqkey->gsk_keylen, conn))
+	{
+		/* don't hand back a buffer with undefined contents */
+		free(seqkey->gsk_key);
+		seqkey->gsk_key = NULL;
+		return EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * gtmpqFreeResultData: release the per-result allocations held by *result,
+ * chosen by result->gr_type, so that the structure can be reused.
+ *
+ * Only sequence results own memory (the sequence key).  The snapshot xip
+ * array is deliberately kept for reuse.  Nothing is done when is_proxy is
+ * true.
+ */
+void
+gtmpqFreeResultData(GTM_Result *result, bool is_proxy)
+{
+	/*
+	 * If we are running as a GTM proxy, we don't have anything to do. This may
+	 * change though as we add more message types below and some of them may
+	 * need cleanup even at the proxy level
+	 */
+	if (is_proxy)
+		return;
+
+	switch (result->gr_type)
+	{
+		case SEQUENCE_INIT_RESULT:
+		case SEQUENCE_RESET_RESULT:
+		case SEQUENCE_CLOSE_RESULT:
+			/* free(NULL) is a no-op, so no guard is needed */
+			free(result->gr_resdata.grd_seqkey.gsk_key);
+			result->gr_resdata.grd_seqkey.gsk_key = NULL;
+			break;
+
+		case SEQUENCE_GET_CURRENT_RESULT:
+		case SEQUENCE_GET_NEXT_RESULT:
+			/* reset the very pointer that was freed */
+			free(result->gr_resdata.grd_seq.seqkey.gsk_key);
+			result->gr_resdata.grd_seq.seqkey.gsk_key = NULL;
+			break;
+
+		case TXN_GET_STATUS_RESULT:
+			break;
+
+		case TXN_GET_ALL_PREPARED_RESULT:
+			break;
+
+		case SNAPSHOT_GET_RESULT:
+		case SNAPSHOT_GXID_GET_RESULT:
+			/*
+			 * Lets not free the xip array in the snapshot since we may need it
+			 * again shortly
+			 */
+			break;
+
+		default:
+			break;
+	}
+}
diff --git a/src/gtm/client/gtm_client.c b/src/gtm/client/gtm_client.c
new file mode 100644
index 0000000000..6b22a81c53
--- /dev/null
+++ b/src/gtm/client/gtm_client.c
@@ -0,0 +1,515 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_client.c
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+/* Time in seconds to wait for a response from GTM */
+/* We should consider making this a GUC */
+#define CLIENT_GTM_TIMEOUT 20
+
+#include <time.h>
+
+#include "gtm/gtm_c.h"
+
+#include "gtm/libpq-fe.h"
+#include "gtm/libpq-int.h"
+
+#include "gtm/gtm_client.h"
+#include "gtm/gtm_msg.h"
+#include "gtm/assert.h"
+
+void GTM_FreeResult(GTM_Result *result, bool is_proxy);
+
+/*
+ * Connection Management API
+ */
+/*
+ * connect_gtm: open a GTM connection described by connect_string.
+ *
+ * Returns whatever PQconnectGTM returns for that string.
+ */
+GTM_Conn *
+connect_gtm(const char *connect_string)
+{
+	GTM_Conn   *conn = PQconnectGTM(connect_string);
+
+	return conn;
+}
+
+/*
+ * disconnect_gtm: close a GTM connection by handing it to GTMPQfinish.
+ */
+void
+disconnect_gtm(GTM_Conn *conn)
+{
+ GTMPQfinish(conn);
+}
+
+/*
+ * Transaction Management API
+ */
+/*
+ * begin_transaction: send MSG_TXN_BEGIN_GETGXID with the given isolation
+ * level and wait for the reply.
+ *
+ * Returns the gxid carried by the reply, or InvalidGlobalTransactionId if
+ * the request cannot be sent, no reply arrives within CLIENT_GTM_TIMEOUT
+ * seconds, or the reply reports a non-zero status.
+ */
+GlobalTransactionId
+begin_transaction(GTM_Conn *conn, GTM_IsolationLevel isolevel)
+{
+	bool		read_only = false;
+	GTM_Result *reply;
+	time_t		deadline;
+
+	/* Build the request: message type, isolation level, read-only flag */
+	if (gtmpqPutMsgStart('C', true, conn))
+		return InvalidGlobalTransactionId;
+	if (gtmpqPutInt(MSG_TXN_BEGIN_GETGXID, sizeof (GTM_MessageType), conn))
+		return InvalidGlobalTransactionId;
+	if (gtmpqPutInt(isolevel, sizeof (GTM_IsolationLevel), conn))
+		return InvalidGlobalTransactionId;
+	if (gtmpqPutc(read_only, conn))
+		return InvalidGlobalTransactionId;
+
+	/* Finish the message and flush it so the server gets it */
+	if (gtmpqPutMsgEnd(conn))
+		return InvalidGlobalTransactionId;
+	if (gtmpqFlush(conn))
+		return InvalidGlobalTransactionId;
+
+	/* Wait for input until the deadline, then load it */
+	deadline = time(NULL) + CLIENT_GTM_TIMEOUT;
+	if (gtmpqWaitTimed(true, false, conn, deadline))
+		return InvalidGlobalTransactionId;
+	if (gtmpqReadData(conn) < 0)
+		return InvalidGlobalTransactionId;
+
+	reply = GTMPQgetResult(conn);
+	if (reply == NULL)
+		return InvalidGlobalTransactionId;
+
+	if (reply->gr_status != 0)
+		return InvalidGlobalTransactionId;
+
+	return reply->gr_resdata.grd_gxid;
+}
+
+/*
+ * Transaction Management API
+ *
+ * begin_transaction_autovacuum: same exchange as begin_transaction, but the
+ * request is sent as MSG_TXN_BEGIN_GETGXID_AUTOVACUUM (used for an
+ * autovacuum worker process).
+ *
+ * Returns the gxid carried by the reply, or InvalidGlobalTransactionId if
+ * the request cannot be sent, no reply arrives within CLIENT_GTM_TIMEOUT
+ * seconds, or the reply reports a non-zero status.
+ */
+GlobalTransactionId
+begin_transaction_autovacuum(GTM_Conn *conn, GTM_IsolationLevel isolevel)
+{
+	bool		read_only = false;
+	GTM_Result *reply;
+	time_t		deadline;
+
+	/* Build the request: message type, isolation level, read-only flag */
+	if (gtmpqPutMsgStart('C', true, conn))
+		return InvalidGlobalTransactionId;
+	if (gtmpqPutInt(MSG_TXN_BEGIN_GETGXID_AUTOVACUUM, sizeof (GTM_MessageType), conn))
+		return InvalidGlobalTransactionId;
+	if (gtmpqPutInt(isolevel, sizeof (GTM_IsolationLevel), conn))
+		return InvalidGlobalTransactionId;
+	if (gtmpqPutc(read_only, conn))
+		return InvalidGlobalTransactionId;
+
+	/* Finish the message and flush it so the server gets it */
+	if (gtmpqPutMsgEnd(conn))
+		return InvalidGlobalTransactionId;
+	if (gtmpqFlush(conn))
+		return InvalidGlobalTransactionId;
+
+	/* Wait for input until the deadline, then load it */
+	deadline = time(NULL) + CLIENT_GTM_TIMEOUT;
+	if (gtmpqWaitTimed(true, false, conn, deadline))
+		return InvalidGlobalTransactionId;
+	if (gtmpqReadData(conn) < 0)
+		return InvalidGlobalTransactionId;
+
+	reply = GTMPQgetResult(conn);
+	if (reply == NULL)
+		return InvalidGlobalTransactionId;
+
+	if (reply->gr_status != 0)
+		return InvalidGlobalTransactionId;
+
+	return reply->gr_resdata.grd_gxid;
+}
+/*
+ * commit_transaction: send MSG_TXN_COMMIT for gxid and wait for the reply.
+ *
+ * Returns the reply's gr_status (0 on success), or -1 if the request cannot
+ * be sent or no reply is obtained within CLIENT_GTM_TIMEOUT seconds.
+ */
+int
+commit_transaction(GTM_Conn *conn, GlobalTransactionId gxid)
+{
+	GTM_Result *reply;
+	time_t		deadline;
+
+	/* Build the request: message type, a flag byte (always true), gxid */
+	if (gtmpqPutMsgStart('C', true, conn))
+		return -1;
+	if (gtmpqPutInt(MSG_TXN_COMMIT, sizeof (GTM_MessageType), conn))
+		return -1;
+	if (gtmpqPutc(true, conn))
+		return -1;
+	if (gtmpqPutnchar((char *) &gxid, sizeof (GlobalTransactionId), conn))
+		return -1;
+
+	/* Finish the message and flush it so the server gets it */
+	if (gtmpqPutMsgEnd(conn))
+		return -1;
+	if (gtmpqFlush(conn))
+		return -1;
+
+	/* Wait for input until the deadline, then load it */
+	deadline = time(NULL) + CLIENT_GTM_TIMEOUT;
+	if (gtmpqWaitTimed(true, false, conn, deadline))
+		return -1;
+	if (gtmpqReadData(conn) < 0)
+		return -1;
+
+	reply = GTMPQgetResult(conn);
+	if (reply == NULL)
+		return -1;
+
+	/* A successful reply must be for this request */
+	if (reply->gr_status == 0)
+	{
+		Assert(reply->gr_type == TXN_COMMIT_RESULT);
+		Assert(reply->gr_resdata.grd_gxid == gxid);
+	}
+
+	return reply->gr_status;
+}
+
+/*
+ * abort_transaction: send MSG_TXN_ROLLBACK for gxid and wait for the reply.
+ *
+ * Returns the reply's gr_status (0 on success), or -1 if the request cannot
+ * be sent or no reply is obtained within CLIENT_GTM_TIMEOUT seconds.
+ */
+int
+abort_transaction(GTM_Conn *conn, GlobalTransactionId gxid)
+{
+	GTM_Result *reply;
+	time_t		deadline;
+
+	/* Build the request: message type, a flag byte (always true), gxid */
+	if (gtmpqPutMsgStart('C', true, conn))
+		return -1;
+	if (gtmpqPutInt(MSG_TXN_ROLLBACK, sizeof (GTM_MessageType), conn))
+		return -1;
+	if (gtmpqPutc(true, conn))
+		return -1;
+	if (gtmpqPutnchar((char *) &gxid, sizeof (GlobalTransactionId), conn))
+		return -1;
+
+	/* Finish the message and flush it so the server gets it */
+	if (gtmpqPutMsgEnd(conn))
+		return -1;
+	if (gtmpqFlush(conn))
+		return -1;
+
+	/* Wait for input until the deadline, then load it */
+	deadline = time(NULL) + CLIENT_GTM_TIMEOUT;
+	if (gtmpqWaitTimed(true, false, conn, deadline))
+		return -1;
+	if (gtmpqReadData(conn) < 0)
+		return -1;
+
+	reply = GTMPQgetResult(conn);
+	if (reply == NULL)
+		return -1;
+
+	/* A successful reply must be for this request */
+	if (reply->gr_status == 0)
+	{
+		Assert(reply->gr_type == TXN_ROLLBACK_RESULT);
+		Assert(reply->gr_resdata.grd_gxid == gxid);
+	}
+
+	return reply->gr_status;
+}
+
+/*
+ * prepare_transaction: send MSG_TXN_PREPARE for gxid together with the
+ * nodecnt entries of nodes[], and wait for the reply.
+ *
+ * Returns the reply's gr_status (0 on success), or -1 if the request cannot
+ * be sent or no reply is obtained within CLIENT_GTM_TIMEOUT seconds.
+ */
+int
+prepare_transaction(GTM_Conn *conn, GlobalTransactionId gxid,
+					int nodecnt, PGXC_NodeId nodes[])
+{
+	GTM_Result *reply;
+	time_t		deadline;
+
+	/*
+	 * Build the request: message type, a flag byte (always true), gxid,
+	 * node count, node array.
+	 */
+	if (gtmpqPutMsgStart('C', true, conn))
+		return -1;
+	if (gtmpqPutInt(MSG_TXN_PREPARE, sizeof (GTM_MessageType), conn))
+		return -1;
+	if (gtmpqPutc(true, conn))
+		return -1;
+	if (gtmpqPutnchar((char *) &gxid, sizeof (GlobalTransactionId), conn))
+		return -1;
+	if (gtmpqPutInt(nodecnt, sizeof (int), conn))
+		return -1;
+	if (gtmpqPutnchar((char *) nodes, sizeof (PGXC_NodeId) * nodecnt, conn))
+		return -1;
+
+	/* Finish the message and flush it so the server gets it */
+	if (gtmpqPutMsgEnd(conn))
+		return -1;
+	if (gtmpqFlush(conn))
+		return -1;
+
+	/* Wait for input until the deadline, then load it */
+	deadline = time(NULL) + CLIENT_GTM_TIMEOUT;
+	if (gtmpqWaitTimed(true, false, conn, deadline))
+		return -1;
+	if (gtmpqReadData(conn) < 0)
+		return -1;
+
+	reply = GTMPQgetResult(conn);
+	if (reply == NULL)
+		return -1;
+
+	/* A successful reply must be for this request */
+	if (reply->gr_status == 0)
+	{
+		Assert(reply->gr_type == TXN_PREPARE_RESULT);
+		Assert(reply->gr_resdata.grd_gxid == gxid);
+	}
+
+	return reply->gr_status;
+}
+
+/*
+ * Snapshot Management API
+ */
+/*
+ * get_snapshot: send MSG_SNAPSHOT_GET for gxid and wait for the reply.
+ *
+ * Returns a pointer to the snapshot stored inside the connection's result
+ * structure, or NULL if the request cannot be sent, no reply is obtained
+ * within CLIENT_GTM_TIMEOUT seconds, or the reply reports a non-zero
+ * status.
+ */
+GTM_SnapshotData *
+get_snapshot(GTM_Conn *conn, GlobalTransactionId gxid, bool canbe_grouped)
+{
+	GTM_Result *reply;
+	time_t		deadline;
+
+	/*
+	 * Build the request: message type, the canbe_grouped flag, a flag byte
+	 * (always true), gxid.
+	 */
+	if (gtmpqPutMsgStart('C', true, conn))
+		return NULL;
+	if (gtmpqPutInt(MSG_SNAPSHOT_GET, sizeof (GTM_MessageType), conn))
+		return NULL;
+	if (gtmpqPutc(canbe_grouped, conn))
+		return NULL;
+	if (gtmpqPutc(true, conn))
+		return NULL;
+	if (gtmpqPutnchar((char *) &gxid, sizeof (GlobalTransactionId), conn))
+		return NULL;
+
+	/* Finish the message and flush it so the server gets it */
+	if (gtmpqPutMsgEnd(conn))
+		return NULL;
+	if (gtmpqFlush(conn))
+		return NULL;
+
+	/* Wait for input until the deadline, then load it */
+	deadline = time(NULL) + CLIENT_GTM_TIMEOUT;
+	if (gtmpqWaitTimed(true, false, conn, deadline))
+		return NULL;
+	if (gtmpqReadData(conn) < 0)
+		return NULL;
+
+	reply = GTMPQgetResult(conn);
+	if (reply == NULL)
+		return NULL;
+
+	if (reply->gr_status != 0)
+		return NULL;
+
+	/* A successful reply must be for this request */
+	Assert(reply->gr_type == SNAPSHOT_GET_RESULT);
+	Assert(reply->gr_resdata.grd_txn.gxid == gxid);
+	return &(reply->gr_snapshot);
+}
+
+/*
+ * Sequence Management API
+ */
+/*
+ * open_sequence: send MSG_SEQUENCE_INIT for the given key and sequence
+ * parameters, and wait for the reply.
+ *
+ * Returns the reply's gr_status, or -1 if the request cannot be sent or no
+ * reply is obtained within CLIENT_GTM_TIMEOUT seconds.
+ */
+int
+open_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence increment,
+			  GTM_Sequence minval, GTM_Sequence maxval,
+			  GTM_Sequence startval, bool cycle)
+{
+	GTM_Result *reply;
+	time_t		deadline;
+
+	/* Build the request: message type, then the length-prefixed key ... */
+	if (gtmpqPutMsgStart('C', true, conn))
+		return -1;
+	if (gtmpqPutInt(MSG_SEQUENCE_INIT, sizeof (GTM_MessageType), conn))
+		return -1;
+	if (gtmpqPutInt(key->gsk_keylen, 4, conn))
+		return -1;
+	if (gtmpqPutnchar(key->gsk_key, key->gsk_keylen, conn))
+		return -1;
+
+	/* ... followed by increment, min, max, start and the cycle flag */
+	if (gtmpqPutnchar((char *) &increment, sizeof (GTM_Sequence), conn))
+		return -1;
+	if (gtmpqPutnchar((char *) &minval, sizeof (GTM_Sequence), conn))
+		return -1;
+	if (gtmpqPutnchar((char *) &maxval, sizeof (GTM_Sequence), conn))
+		return -1;
+	if (gtmpqPutnchar((char *) &startval, sizeof (GTM_Sequence), conn))
+		return -1;
+	if (gtmpqPutc(cycle, conn))
+		return -1;
+
+	/* Finish the message and flush it so the server gets it */
+	if (gtmpqPutMsgEnd(conn))
+		return -1;
+	if (gtmpqFlush(conn))
+		return -1;
+
+	/* Wait for input until the deadline, then load it */
+	deadline = time(NULL) + CLIENT_GTM_TIMEOUT;
+	if (gtmpqWaitTimed(true, false, conn, deadline))
+		return -1;
+	if (gtmpqReadData(conn) < 0)
+		return -1;
+
+	reply = GTMPQgetResult(conn);
+	if (reply == NULL)
+		return -1;
+
+	return reply->gr_status;
+}
+
+/*
+ * close_sequence: send MSG_SEQUENCE_CLOSE for the given key and wait for
+ * the reply.
+ *
+ * Returns the reply's gr_status, or -1 if the request cannot be sent or no
+ * reply is obtained within CLIENT_GTM_TIMEOUT seconds.
+ */
+int
+close_sequence(GTM_Conn *conn, GTM_SequenceKey key)
+{
+	GTM_Result *reply;
+	time_t		deadline;
+
+	/* Build the request: message type and the length-prefixed key */
+	if (gtmpqPutMsgStart('C', true, conn))
+		return -1;
+	if (gtmpqPutInt(MSG_SEQUENCE_CLOSE, sizeof (GTM_MessageType), conn))
+		return -1;
+	if (gtmpqPutInt(key->gsk_keylen, 4, conn))
+		return -1;
+	if (gtmpqPutnchar(key->gsk_key, key->gsk_keylen, conn))
+		return -1;
+
+	/* Finish the message and flush it so the server gets it */
+	if (gtmpqPutMsgEnd(conn))
+		return -1;
+	if (gtmpqFlush(conn))
+		return -1;
+
+	/* Wait for input until the deadline, then load it */
+	deadline = time(NULL) + CLIENT_GTM_TIMEOUT;
+	if (gtmpqWaitTimed(true, false, conn, deadline))
+		return -1;
+	if (gtmpqReadData(conn) < 0)
+		return -1;
+
+	reply = GTMPQgetResult(conn);
+	if (reply == NULL)
+		return -1;
+
+	return reply->gr_status;
+}
+
+/*
+ * get_current: send MSG_SEQUENCE_GET_CURRENT for the given key and wait for
+ * the reply.
+ *
+ * Returns the sequence value carried by the reply.  Returns
+ * InvalidSequenceValue if the reply reports a non-zero status, and -1 if
+ * the request cannot be sent or no reply is obtained within
+ * CLIENT_GTM_TIMEOUT seconds.
+ */
+GTM_Sequence
+get_current(GTM_Conn *conn, GTM_SequenceKey key)
+{
+	GTM_Result *reply;
+	time_t		deadline;
+
+	/* Build the request: message type and the length-prefixed key */
+	if (gtmpqPutMsgStart('C', true, conn))
+		return -1;
+	if (gtmpqPutInt(MSG_SEQUENCE_GET_CURRENT, sizeof (GTM_MessageType), conn))
+		return -1;
+	if (gtmpqPutInt(key->gsk_keylen, 4, conn))
+		return -1;
+	if (gtmpqPutnchar(key->gsk_key, key->gsk_keylen, conn))
+		return -1;
+
+	/* Finish the message and flush it so the server gets it */
+	if (gtmpqPutMsgEnd(conn))
+		return -1;
+	if (gtmpqFlush(conn))
+		return -1;
+
+	/* Wait for input until the deadline, then load it */
+	deadline = time(NULL) + CLIENT_GTM_TIMEOUT;
+	if (gtmpqWaitTimed(true, false, conn, deadline))
+		return -1;
+	if (gtmpqReadData(conn) < 0)
+		return -1;
+
+	reply = GTMPQgetResult(conn);
+	if (reply == NULL)
+		return -1;
+
+	if (reply->gr_status != 0)
+		return InvalidSequenceValue;
+
+	return reply->gr_resdata.grd_seq.seqval;
+}
+
+/*
+ * get_next: send MSG_SEQUENCE_GET_NEXT for the given key and wait for the
+ * reply.
+ *
+ * Returns the sequence value carried by the reply.  Returns
+ * InvalidSequenceValue if the reply reports a non-zero status, and -1 if
+ * the request cannot be sent or no reply is obtained within
+ * CLIENT_GTM_TIMEOUT seconds.
+ */
+GTM_Sequence
+get_next(GTM_Conn *conn, GTM_SequenceKey key)
+{
+	GTM_Result *reply;
+	time_t		deadline;
+
+	/* Build the request: message type and the length-prefixed key */
+	if (gtmpqPutMsgStart('C', true, conn))
+		return -1;
+	if (gtmpqPutInt(MSG_SEQUENCE_GET_NEXT, sizeof (GTM_MessageType), conn))
+		return -1;
+	if (gtmpqPutInt(key->gsk_keylen, 4, conn))
+		return -1;
+	if (gtmpqPutnchar(key->gsk_key, key->gsk_keylen, conn))
+		return -1;
+
+	/* Finish the message and flush it so the server gets it */
+	if (gtmpqPutMsgEnd(conn))
+		return -1;
+	if (gtmpqFlush(conn))
+		return -1;
+
+	/* Wait for input until the deadline, then load it */
+	deadline = time(NULL) + CLIENT_GTM_TIMEOUT;
+	if (gtmpqWaitTimed(true, false, conn, deadline))
+		return -1;
+	if (gtmpqReadData(conn) < 0)
+		return -1;
+
+	reply = GTMPQgetResult(conn);
+	if (reply == NULL)
+		return -1;
+
+	if (reply->gr_status != 0)
+		return InvalidSequenceValue;
+
+	return reply->gr_resdata.grd_seq.seqval;
+}
+
+/*
+ * reset_sequence: send MSG_SEQUENCE_RESET for the given key and wait for
+ * the reply.
+ *
+ * Returns the reply's gr_status, or -1 if the request cannot be sent or no
+ * reply is obtained within CLIENT_GTM_TIMEOUT seconds.
+ */
+int
+reset_sequence(GTM_Conn *conn, GTM_SequenceKey key)
+{
+	GTM_Result *reply;
+	time_t		deadline;
+
+	/* Build the request: message type and the length-prefixed key */
+	if (gtmpqPutMsgStart('C', true, conn))
+		return -1;
+	if (gtmpqPutInt(MSG_SEQUENCE_RESET, sizeof (GTM_MessageType), conn))
+		return -1;
+	if (gtmpqPutInt(key->gsk_keylen, 4, conn))
+		return -1;
+	if (gtmpqPutnchar(key->gsk_key, key->gsk_keylen, conn))
+		return -1;
+
+	/* Finish the message and flush it so the server gets it */
+	if (gtmpqPutMsgEnd(conn))
+		return -1;
+	if (gtmpqFlush(conn))
+		return -1;
+
+	/* Wait for input until the deadline, then load it */
+	deadline = time(NULL) + CLIENT_GTM_TIMEOUT;
+	if (gtmpqWaitTimed(true, false, conn, deadline))
+		return -1;
+	if (gtmpqReadData(conn) < 0)
+		return -1;
+
+	reply = GTMPQgetResult(conn);
+	if (reply == NULL)
+		return -1;
+
+	return reply->gr_status;
+}
+
+/*
+ * GTM_FreeResult: release a result structure together with the per-result
+ * data that gtmpqFreeResultData knows how to free.  A NULL result is
+ * ignored.
+ */
+void
+GTM_FreeResult(GTM_Result *result, bool is_proxy)
+{
+	if (result != NULL)
+	{
+		gtmpqFreeResultData(result, is_proxy);
+		free(result);
+	}
+}
diff --git a/src/gtm/client/ip.c b/src/gtm/client/ip.c
new file mode 100644
index 0000000000..b210e201c5
--- /dev/null
+++ b/src/gtm/client/ip.c
@@ -0,0 +1,324 @@
+/*-------------------------------------------------------------------------
+ *
+ * ip.c
+ * IPv6-aware network access.
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL: pgsql/src/backend/libpq/ip.c,v 1.43 2009/01/01 17:23:42 momjian Exp $
+ *
+ * This file and the IPV6 implementation were initially provided by
+ * Nigel Kukard <[email protected]>, Linux Based Systems Design
+ * http://www.lbsd.net.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/* This is intended to be used in both frontend and backend, so use c.h */
+#include "gtm/gtm_c.h"
+
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#ifdef HAVE_NETINET_TCP_H
+#include <netinet/tcp.h>
+#endif
+#include <arpa/inet.h>
+#include <sys/file.h>
+
+#include "gtm/gtm_ip.h"
+
+
+static int range_sockaddr_AF_INET(const struct sockaddr_in * addr,
+ const struct sockaddr_in * netaddr,
+ const struct sockaddr_in * netmask);
+
+#ifdef HAVE_IPV6
+static int range_sockaddr_AF_INET6(const struct sockaddr_in6 * addr,
+ const struct sockaddr_in6 * netaddr,
+ const struct sockaddr_in6 * netmask);
+#endif
+
+
+/*
+ * gtm_getaddrinfo_all - thin wrapper around getaddrinfo()
+ *
+ * *result is cleared before the call, and a NULL or empty hostname is
+ * passed to getaddrinfo() as NULL.  Returns getaddrinfo()'s return code.
+ */
+int
+gtm_getaddrinfo_all(const char *hostname, const char *servname,
+					const struct addrinfo * hintp, struct addrinfo ** result)
+{
+	const char *node = hostname;
+
+	/* not all versions of getaddrinfo() zero *result on failure */
+	*result = NULL;
+
+	/* NULL has special meaning to getaddrinfo(); map "" onto it */
+	if (node != NULL && node[0] == '\0')
+		node = NULL;
+
+	return getaddrinfo(node, servname, hintp, result);
+}
+
+
+/*
+ * gtm_freeaddrinfo_all - release an addrinfo list with freeaddrinfo()
+ *
+ * A NULL list is ignored.  The hint_ai_family argument is accepted but not
+ * consulted by this implementation.
+ */
+void
+gtm_freeaddrinfo_all(int hint_ai_family, struct addrinfo * ai)
+{
+	if (ai == NULL)
+		return;
+
+	/* struct was built by getaddrinfo() */
+	freeaddrinfo(ai);
+}
+
+
+/*
+ * gtm_getnameinfo_all - wrapper around getnameinfo()
+ *
+ * Differs from plain getnameinfo() in two ways: addr is declared as a
+ * sockaddr_storage, and when the lookup fails any non-NULL node and service
+ * buffers are filled with "???" so they always hold something printable.
+ * Returns getnameinfo()'s return code.
+ */
+int
+gtm_getnameinfo_all(const struct sockaddr_storage * addr, int salen,
+					char *node, int nodelen,
+					char *service, int servicelen,
+					int flags)
+{
+	int			rc = getnameinfo((const struct sockaddr *) addr, salen,
+								 node, nodelen,
+								 service, servicelen,
+								 flags);
+
+	if (rc == 0)
+		return rc;
+
+	/* Lookup failed: hand back placeholders */
+	if (node)
+		strlcpy(node, "???", nodelen);
+	if (service)
+		strlcpy(service, "???", servicelen);
+
+	return rc;
+}
+
+/*
+ * gtm_range_sockaddr - does addr lie in the subnet netaddr/netmask?
+ *
+ * Dispatches on addr's address family; returns 1 for a match and 0
+ * otherwise, including for any family other than AF_INET (and AF_INET6
+ * when HAVE_IPV6 is defined).
+ *
+ * Note: caller must already have verified that all three addresses are
+ * in the same address family; and AF_UNIX addresses are not supported.
+ */
+int
+gtm_range_sockaddr(const struct sockaddr_storage * addr,
+				   const struct sockaddr_storage * netaddr,
+				   const struct sockaddr_storage * netmask)
+{
+	switch (addr->ss_family)
+	{
+		case AF_INET:
+			return range_sockaddr_AF_INET((struct sockaddr_in *) addr,
+										  (struct sockaddr_in *) netaddr,
+										  (struct sockaddr_in *) netmask);
+#ifdef HAVE_IPV6
+		case AF_INET6:
+			return range_sockaddr_AF_INET6((struct sockaddr_in6 *) addr,
+										   (struct sockaddr_in6 *) netaddr,
+										   (struct sockaddr_in6 *) netmask);
+#endif
+		default:
+			return 0;
+	}
+}
+
+/*
+ * IPv4 subnet test: returns 1 if addr and netaddr agree on every bit that
+ * is set in netmask, 0 otherwise.
+ */
+static int
+range_sockaddr_AF_INET(const struct sockaddr_in * addr,
+					   const struct sockaddr_in * netaddr,
+					   const struct sockaddr_in * netmask)
+{
+	/* bits in which the two addresses differ */
+	in_addr_t	differing = addr->sin_addr.s_addr ^ netaddr->sin_addr.s_addr;
+
+	return ((differing & netmask->sin_addr.s_addr) == 0) ? 1 : 0;
+}
+
+
+#ifdef HAVE_IPV6
+
+/*
+ * IPv6 subnet test: returns 1 if addr and netaddr agree on every bit that
+ * is set in netmask, checked over the 16 address bytes; 0 otherwise.
+ */
+static int
+range_sockaddr_AF_INET6(const struct sockaddr_in6 * addr,
+						const struct sockaddr_in6 * netaddr,
+						const struct sockaddr_in6 * netmask)
+{
+	int			byte_no = 0;
+
+	while (byte_no < 16)
+	{
+		int			differing = addr->sin6_addr.s6_addr[byte_no] ^
+			netaddr->sin6_addr.s6_addr[byte_no];
+
+		/* first masked mismatch settles it */
+		if ((differing & netmask->sin6_addr.s6_addr[byte_no]) != 0)
+			return 0;
+		byte_no++;
+	}
+
+	return 1;
+}
+#endif /* HAVE_IPV6 */
+
+/*
+ * gtm_sockaddr_cidr_mask - make a network mask of the appropriate family
+ *	  and required number of significant bits
+ *
+ * numbits is the decimal prefix length as text; it must be a complete
+ * number in the range 0..32 for AF_INET or 0..128 for AF_INET6 (the latter
+ * only when HAVE_IPV6 is defined).
+ *
+ * The resulting mask is placed in *mask, which had better be big enough.
+ * All fields other than the address and the family are zeroed.
+ *
+ * Return value is 0 if okay, -1 if not.
+ */
+int
+gtm_sockaddr_cidr_mask(struct sockaddr_storage * mask, char *numbits, int family)
+{
+	long		bits;
+	char	   *endptr;
+
+	bits = strtol(numbits, &endptr, 10);
+
+	/* reject an empty string and trailing junk */
+	if (*numbits == '\0' || *endptr != '\0')
+		return -1;
+
+	switch (family)
+	{
+		case AF_INET:
+			{
+				struct sockaddr_in mask4;
+				long		maskl;
+
+				if (bits < 0 || bits > 32)
+					return -1;
+
+				/* don't copy indeterminate bytes into the caller's mask */
+				memset(&mask4, 0, sizeof(mask4));
+
+				/* avoid "x << 32", which is not portable */
+				if (bits > 0)
+					maskl = (0xffffffffUL << (32 - (int) bits))
+						& 0xffffffffUL;
+				else
+					maskl = 0;
+				mask4.sin_addr.s_addr = htonl(maskl);
+				memcpy(mask, &mask4, sizeof(mask4));
+				break;
+			}
+
+#ifdef HAVE_IPV6
+		case AF_INET6:
+			{
+				struct sockaddr_in6 mask6;
+				int			i;
+
+				if (bits < 0 || bits > 128)
+					return -1;
+
+				/* don't copy indeterminate bytes into the caller's mask */
+				memset(&mask6, 0, sizeof(mask6));
+
+				/* fill the 16 address bytes, consuming 8 prefix bits each */
+				for (i = 0; i < 16; i++)
+				{
+					if (bits <= 0)
+						mask6.sin6_addr.s6_addr[i] = 0;
+					else if (bits >= 8)
+						mask6.sin6_addr.s6_addr[i] = 0xff;
+					else
+					{
+						mask6.sin6_addr.s6_addr[i] =
+							(0xff << (8 - (int) bits)) & 0xff;
+					}
+					bits -= 8;
+				}
+				memcpy(mask, &mask6, sizeof(mask6));
+				break;
+			}
+#endif
+		default:
+			return -1;
+	}
+
+	mask->ss_family = family;
+	return 0;
+}
+
+
+#ifdef HAVE_IPV6
+
+/*
+ * gtm_promote_v4_to_v6_addr --- rewrite an AF_INET address in place as the
+ *		corresponding IPv4-mapped AF_INET6 address
+ *
+ * The result has ten zero bytes, two 0xff bytes and then the four bytes of
+ * the IPv4 address.  Only the family and the address are filled in; every
+ * other field of the result is zero.
+ *
+ * The passed addr is modified in place; be sure it is large enough to
+ * hold the result!
+ */
+void
+gtm_promote_v4_to_v6_addr(struct sockaddr_storage * addr)
+{
+	struct sockaddr_in v4;
+	struct sockaddr_in6 v6;
+	uint32		hostaddr;
+	int			i;
+
+	/* pull out the IPv4 address in host byte order */
+	memcpy(&v4, addr, sizeof(v4));
+	hostaddr = ntohl(v4.sin_addr.s_addr);
+
+	memset(&v6, 0, sizeof(v6));
+	v6.sin6_family = AF_INET6;
+
+	/* the 0xffff marker in front of the embedded IPv4 address */
+	v6.sin6_addr.s6_addr[10] = 0xff;
+	v6.sin6_addr.s6_addr[11] = 0xff;
+
+	/* bytes 12..15 carry the IPv4 address, most significant byte first */
+	for (i = 0; i < 4; i++)
+		v6.sin6_addr.s6_addr[12 + i] = (hostaddr >> (24 - 8 * i)) & 0xFF;
+
+	memcpy(addr, &v6, sizeof(v6));
+}
+
+/*
+ * gtm_promote_v4_to_v6_mask --- rewrite an AF_INET netmask in place as the
+ *		netmask for the corresponding IPv4-mapped AF_INET6 addresses
+ *
+ * Unlike gtm_promote_v4_to_v6_addr, the twelve leading bytes are all set to
+ * 0xff, so that the whole IPv6 prefix takes part in the comparison; the
+ * last four bytes are the IPv4 mask.  Only the family and the address are
+ * filled in; every other field of the result is zero.
+ *
+ * The passed addr is modified in place; be sure it is large enough to
+ * hold the result!
+ */
+void
+gtm_promote_v4_to_v6_mask(struct sockaddr_storage * addr)
+{
+	struct sockaddr_in v4;
+	struct sockaddr_in6 v6;
+	uint32		hostmask;
+	int			i;
+
+	/* pull out the IPv4 mask in host byte order */
+	memcpy(&v4, addr, sizeof(v4));
+	hostmask = ntohl(v4.sin_addr.s_addr);
+
+	memset(&v6, 0, sizeof(v6));
+	v6.sin6_family = AF_INET6;
+
+	/* all prefix bits are significant */
+	memset(v6.sin6_addr.s6_addr, 0xff, 12);
+
+	/* bytes 12..15 carry the IPv4 mask, most significant byte first */
+	for (i = 0; i < 4; i++)
+		v6.sin6_addr.s6_addr[12 + i] = (hostmask >> (24 - 8 * i)) & 0xFF;
+
+	memcpy(addr, &v6, sizeof(v6));
+}
+
+#endif /* HAVE_IPV6 */
diff --git a/src/gtm/client/pqexpbuffer.c b/src/gtm/client/pqexpbuffer.c
new file mode 100644
index 0000000000..95c6ee09ee
--- /dev/null
+++ b/src/gtm/client/pqexpbuffer.c
@@ -0,0 +1,373 @@
+/*-------------------------------------------------------------------------
+ *
+ * pqexpbuffer.c
+ *
+ * PQExpBuffer provides an indefinitely-extensible string data type.
+ * It can be used to buffer either ordinary C strings (null-terminated text)
+ * or arbitrary binary data. All storage is allocated with malloc().
+ *
+ * This module is essentially the same as the backend's StringInfo data type,
+ * but it is intended for use in frontend libpq and client applications.
+ * Thus, it does not rely on palloc() nor elog().
+ *
+ * It does rely on vsnprintf(); if configure finds that libc doesn't provide
+ * a usable vsnprintf(), then a copy of our own implementation of it will
+ * be linked into libpq.
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/interfaces/libpq/pqexpbuffer.c,v 1.25 2008/11/26 00:26:23 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "gtm/gtm_c.h"
+
+#include <limits.h>
+
+#include "gtm/pqexpbuffer.h"
+
+
+/* All "broken" PQExpBuffers point to this string. */
+static const char oom_buffer[1] = "";
+
+
+/*
+ * markPQExpBufferBroken
+ *
+ * Switch a PQExpBuffer into the "broken" state: release its storage (unless
+ * it is already broken) and point it at the shared oom_buffer.
+ */
+static void
+markPQExpBufferBroken(PQExpBuffer str)
+{
+	char	   *previous = str->data;
+
+	/*
+	 * oom_buffer is declared const so the compiler is encouraged to place it
+	 * in read-only storage; anyone scribbling on a broken buffer should then
+	 * fail.  The price is having to cast the const away here.
+	 */
+	str->data = (char *) oom_buffer;
+	str->maxlen = 0;
+	str->len = 0;
+
+	/* Only malloc'd storage may be released, never oom_buffer itself. */
+	if (previous != oom_buffer)
+		free(previous);
+}
+
+/*
+ * createGTMPQExpBuffer
+ *
+ * Allocate a PQExpBufferData with malloc(), initialize it as an empty
+ * buffer and return it.  Returns NULL if the struct cannot be allocated.
+ */
+PQExpBuffer
+createGTMPQExpBuffer(void)
+{
+	PQExpBuffer buffer = (PQExpBuffer) malloc(sizeof(PQExpBufferData));
+
+	if (buffer == NULL)
+		return NULL;
+
+	initGTMPQExpBuffer(buffer);
+	return buffer;
+}
+
+/*
+ * initGTMPQExpBuffer
+ *
+ * Set up a PQExpBufferData whose previous contents are undefined so that it
+ * describes an empty string.  If the initial allocation fails, the buffer
+ * is left in the "broken" state (data pointing at oom_buffer).
+ */
+void
+initGTMPQExpBuffer(PQExpBuffer str)
+{
+	char	   *storage = (char *) malloc(INITIAL_EXPBUFFER_SIZE);
+
+	str->len = 0;
+
+	if (storage != NULL)
+	{
+		storage[0] = '\0';
+		str->data = storage;
+		str->maxlen = INITIAL_EXPBUFFER_SIZE;
+		return;
+	}
+
+	/* Out of memory: casting away const, as in markPQExpBufferBroken. */
+	str->data = (char *) oom_buffer;
+	str->maxlen = 0;
+}
+
+/*
+ * destroyGTMPQExpBuffer(str);
+ *
+ * Release the data buffer and then the PQExpBufferData struct itself.
+ * A NULL argument is accepted and ignored.
+ * This undoes createGTMPQExpBuffer().
+ */
+void
+destroyGTMPQExpBuffer(PQExpBuffer str)
+{
+	if (str == NULL)
+		return;
+
+	termGTMPQExpBuffer(str);
+	free(str);
+}
+
+/*
+ * termGTMPQExpBuffer(str)
+ *		Release the data buffer, leaving the PQExpBufferData struct alone.
+ *		This undoes initGTMPQExpBuffer().
+ */
+void
+termGTMPQExpBuffer(PQExpBuffer str)
+{
+	char	   *storage = str->data;
+
+	/*
+	 * Leave the struct describing a valid (empty, zero-capacity) buffer by
+	 * pointing it at oom_buffer; the const is cast away deliberately.
+	 */
+	str->data = (char *) oom_buffer;
+	str->len = 0;
+	str->maxlen = 0;
+
+	/* oom_buffer is static storage and must never be passed to free(). */
+	if (storage != oom_buffer)
+		free(storage);
+}
+
+/*
+ * resetGTMPQExpBuffer
+ *		Make a PQExpBuffer empty again.  A NULL argument is ignored.
+ *
+ * Note: for a "broken" buffer this attempts a fresh initialization, so the
+ * buffer becomes usable again if memory is available.
+ */
+void
+resetGTMPQExpBuffer(PQExpBuffer str)
+{
+	if (str == NULL)
+		return;
+
+	if (str->data == oom_buffer)
+	{
+		/* broken buffer: try to bring it back to a valid state */
+		initGTMPQExpBuffer(str);
+		return;
+	}
+
+	/* healthy buffer: keep the storage, just truncate the contents */
+	str->data[0] = '\0';
+	str->len = 0;
+}
+
+/*
+ * enlargeGTMPQExpBuffer
+ *		Ensure the buffer can take 'needed' more bytes, not counting the
+ *		terminating null.
+ *
+ * Returns 1 on success and 0 on failure; after a failure the buffer is in
+ * the "broken" state.
+ */
+int
+enlargeGTMPQExpBuffer(PQExpBuffer str, size_t needed)
+{
+	size_t		required;
+	size_t		capacity;
+	char	   *grown;
+
+	/* A buffer that has already failed stays failed. */
+	if (PQExpBufferBroken(str))
+		return 0;
+
+	/*
+	 * Reject absurd requests, which bogus input data can produce.  Letting
+	 * them through could overflow the arithmetic below or make the doubling
+	 * loop spin forever.
+	 */
+	if (needed >= ((size_t) INT_MAX - str->len))
+	{
+		markPQExpBufferBroken(str);
+		return 0;
+	}
+
+	/* Total space required, including the trailing null; <= INT_MAX here. */
+	required = needed + str->len + 1;
+
+	if (required <= str->maxlen)
+		return 1;				/* current allocation is already big enough */
+
+	/*
+	 * Grow geometrically rather than by small increments: start at twice the
+	 * present size (or 64 for a zero-sized buffer) and keep doubling until
+	 * the request fits.
+	 */
+	capacity = (str->maxlen > 0) ? (2 * str->maxlen) : 64;
+	while (capacity < required)
+		capacity = 2 * capacity;
+
+	/*
+	 * The doubling may have overshot INT_MAX, so clamp.  This relies on
+	 * INT_MAX <= UINT_MAX/2 (otherwise the loop above could overflow), and
+	 * capacity >= required still holds afterwards.
+	 */
+	if (capacity > (size_t) INT_MAX)
+		capacity = (size_t) INT_MAX;
+
+	grown = (char *) realloc(str->data, capacity);
+	if (grown == NULL)
+	{
+		markPQExpBufferBroken(str);
+		return 0;
+	}
+
+	str->data = grown;
+	str->maxlen = capacity;
+	return 1;
+}
+
+/*
+ * printfGTMPQExpBuffer
+ *		Replace the contents of str with text formatted according to fmt
+ *		(an sprintf-style format string), growing the buffer as required.
+ *		Equivalent to resetGTMPQExpBuffer() followed by
+ *		appendGTMPQExpBuffer().
+ */
+void
+printfGTMPQExpBuffer(PQExpBuffer str, const char *fmt,...)
+{
+	va_list		ap;
+	size_t		room;
+	int			written;
+
+	resetGTMPQExpBuffer(str);
+
+	if (PQExpBufferBroken(str))
+		return;					/* already failed */
+
+	do
+	{
+		/*
+		 * Only attempt the format when a reasonable amount of space is
+		 * free; with hardly any room, go straight to enlarging the buffer.
+		 */
+		if (str->maxlen > str->len + 16)
+		{
+			room = str->maxlen - str->len - 1;
+			va_start(ap, fmt);
+			written = vsnprintf(str->data + str->len, room, fmt, ap);
+			va_end(ap);
+
+			/*
+			 * vsnprintf implementations disagree on what they return when
+			 * the output does not fit (a character count, or -1), so accept
+			 * the result only when it is clearly inside the available room.
+			 */
+			if (written >= 0 && written < (int) room - 1)
+			{
+				/* Done; 'written' excludes the trailing null. */
+				str->len += written;
+				return;
+			}
+		}
+
+		/* Otherwise double the buffer and retry; stop if that fails. */
+	} while (enlargeGTMPQExpBuffer(str, str->maxlen));
+}
+
+/*
+ * appendGTMPQExpBuffer
+ *
+ * Format text according to fmt (an sprintf-style format string) and add it
+ * after whatever str already holds, growing the buffer as required.  In
+ * effect this is sprintf combined with strcat.
+ */
+void
+appendGTMPQExpBuffer(PQExpBuffer str, const char *fmt,...)
+{
+	va_list		ap;
+	size_t		room;
+	int			written;
+
+	if (PQExpBufferBroken(str))
+		return;					/* already failed */
+
+	do
+	{
+		/*
+		 * Only attempt the format when a reasonable amount of space is
+		 * free; with hardly any room, go straight to enlarging the buffer.
+		 */
+		if (str->maxlen > str->len + 16)
+		{
+			room = str->maxlen - str->len - 1;
+			va_start(ap, fmt);
+			written = vsnprintf(str->data + str->len, room, fmt, ap);
+			va_end(ap);
+
+			/*
+			 * vsnprintf implementations disagree on what they return when
+			 * the output does not fit (a character count, or -1), so accept
+			 * the result only when it is clearly inside the available room.
+			 */
+			if (written >= 0 && written < (int) room - 1)
+			{
+				/* Done; 'written' excludes the trailing null. */
+				str->len += written;
+				return;
+			}
+		}
+
+		/* Otherwise double the buffer and retry; stop if that fails. */
+	} while (enlargeGTMPQExpBuffer(str, str->maxlen));
+}
+
+/*
+ * appendGTMPQExpBufferStr
+ *		Add a null-terminated string to the end of a PQExpBuffer, growing
+ *		the buffer when required.
+ */
+void
+appendGTMPQExpBufferStr(PQExpBuffer str, const char *data)
+{
+	size_t		nbytes = strlen(data);
+
+	appendBinaryGTMPQExpBuffer(str, data, nbytes);
+}
+
+/*
+ * appendGTMPQExpBufferChar
+ *		Add one byte to the end of str.
+ *		Same result as appendGTMPQExpBuffer(str, "%c", ch), only much faster.
+ */
+void
+appendGTMPQExpBufferChar(PQExpBuffer str, char ch)
+{
+	/* Nothing is appended when the buffer cannot be grown. */
+	if (enlargeGTMPQExpBuffer(str, 1))
+	{
+		char	   *tail = str->data + str->len;
+
+		/* Store the byte and re-establish the trailing null after it. */
+		tail[0] = ch;
+		tail[1] = '\0';
+		str->len++;
+	}
+}
+
+/*
+ * appendBinaryGTMPQExpBuffer
+ *
+ * Add datalen bytes of arbitrary binary data to the end of a PQExpBuffer,
+ * growing the buffer when required.
+ */
+void
+appendBinaryGTMPQExpBuffer(PQExpBuffer str, const char *data, size_t datalen)
+{
+	char	   *dest;
+
+	/* Nothing is appended when the buffer cannot be grown. */
+	if (!enlargeGTMPQExpBuffer(str, datalen))
+		return;
+
+	dest = str->data + str->len;
+	memcpy(dest, data, datalen);
+
+	/*
+	 * Maintain the trailing null, although it is probably of no use for
+	 * binary data.
+	 */
+	dest[datalen] = '\0';
+	str->len += datalen;
+}
diff --git a/src/gtm/client/strlcpy.c b/src/gtm/client/strlcpy.c
new file mode 100644
index 0000000000..ae031e244c
--- /dev/null
+++ b/src/gtm/client/strlcpy.c
@@ -0,0 +1,72 @@
+/*-------------------------------------------------------------------------
+ *
+ * strlcpy.c
+ * strncpy done right
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL: pgsql/src/port/strlcpy.c,v 1.5 2008/01/01 19:46:00 momjian Exp $
+ *
+ * This file was taken from OpenBSD and is used on platforms that don't
+ * provide strlcpy(). The OpenBSD copyright terms follow.
+ *-------------------------------------------------------------------------
+ */
+
+/* $OpenBSD: strlcpy.c,v 1.11 2006/05/05 15:27:38 millert Exp $ */
+
+/*
+ * Copyright (c) 1998 Todd C. Miller <Todd.Miller@courtesan.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "gtm/gtm_c.h"
+
+
+/*
+ * Copy the string src into the buffer dst, whose total size is siz bytes.
+ * No more than siz-1 characters are copied, and dst is always
+ * NUL-terminated unless siz == 0.
+ * The return value is strlen(src); a value >= siz means the copy was
+ * truncated.
+ * Function creation history: http://www.gratisoft.us/todd/papers/strlcpy.html
+ */
+size_t
+strlcpy(char *dst, const char *src, size_t siz)
+{
+	const char *cursor = src;
+	size_t		room;
+
+	if (siz != 0)
+	{
+		/* Copy up to siz-1 bytes, stopping right after the source's NUL. */
+		for (room = siz - 1; room != 0; room--)
+		{
+			if ((*dst++ = *cursor++) == '\0')
+				return (cursor - src - 1);	/* everything fit */
+		}
+
+		/* Ran out of room: terminate what was copied. */
+		*dst = '\0';
+	}
+
+	/* Walk the remainder of src so the full source length can be reported. */
+	while (*cursor++)
+		;
+
+	return (cursor - src - 1);	/* count does not include NUL */
+}
diff --git a/src/gtm/client/test/Makefile b/src/gtm/client/test/Makefile
new file mode 100644
index 0000000000..46ddbe9a6a
--- /dev/null
+++ b/src/gtm/client/test/Makefile
@@ -0,0 +1,31 @@
+# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+
+# Builds the stand-alone GTM client test programs, each linked against
+# libgtmclient.a.
+
+top_build_dir=../../../
+include $(top_build_dir)/gtm/Makefile.global
+
+override CPPFLAGS := -I$(top_build_dir)/gtm/client $(CPPFLAGS)
+
+OBJS=test_seq.o test_txn.o test_snap.o test_txnperf.o test_snapperf.o
+LIBS =-lpthread
+LOADLIBES=-lpthread
+CFLAGS=-g -O0
+
+# None of these targets produces a file of the same name; declaring them
+# phony keeps a stray file called "all" or "clean" from disabling the rule.
+.PHONY: all clean distclean maintainer-clean
+
+all:test_txn test_seq test_snap test_txnperf test_snapperf
+
+# The programs are linked by make's built-in rules; only the prerequisites
+# are listed here.
+test_txn:test_txn.o $(top_build_dir)/gtm/client/libgtmclient.a
+
+test_seq:test_seq.o $(top_build_dir)/gtm/client/libgtmclient.a
+
+test_snap:test_snap.o $(top_build_dir)/gtm/client/libgtmclient.a
+
+test_txnperf:test_txnperf.o $(top_build_dir)/gtm/client/libgtmclient.a
+
+test_snapperf:test_snapperf.o $(top_build_dir)/gtm/client/libgtmclient.a
+
+clean:
+	rm -f $(OBJS)
+	rm -f test_txn test_seq test_snap test_txnperf test_snapperf
+
+distclean: clean
+
+maintainer-clean: distclean
diff --git a/src/gtm/client/test/test_proxy.sh b/src/gtm/client/test/test_proxy.sh
new file mode 100644
index 0000000000..c0d3caec61
--- /dev/null
+++ b/src/gtm/client/test/test_proxy.sh
@@ -0,0 +1,196 @@
+#!/bin/bash
+# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+
+# Drives a GTM / GTM proxy performance run over ssh: restarts the GTM server
+# and one proxy per host, launches the test client on every proxy host, waits
+# for the clients to finish and collects logs and results into
+# output/<timestamp>/.
+
+GTM_SERVER_HOSTNAME=gtm
+GTM_SERVER_PORT=16667
+
+GTM_PROXY_HOSTNAMES=(coordinator1 coordinator2 coordinator3 coordinator4 coordinator5)
+GTM_PROXY_PORTS=(16666 16666 16666 16666 16666)
+GTM_PROXY_COUNT=${#GTM_PROXY_HOSTNAMES[*]}
+
+PGXC_BASE=$HOME/pgsql_pgxc
+
+GTM_SERVER_PROCESS=gtm
+GTM_PROXY_PROCESS=gtm_proxy
+GTM_TEST_CLIENT_PROCESS=test_txnperf
+
+GTM_SERVER=$PGXC_BASE/src/gtm/main/$GTM_SERVER_PROCESS
+GTM_PROXY=$PGXC_BASE/src/gtm/proxy/$GTM_PROXY_PROCESS
+GTM_TEST_CLIENT=$PGXC_BASE/src/gtm/client/test/$GTM_TEST_CLIENT_PROCESS
+
+GTM_SERVER_LOG_FILE=/tmp/gtmlog
+GTM_SERVER_CONTROL_FILE=/tmp/gtmcontrol
+GTM_PROXY_LOG_FILE=/tmp/gtmptoxylog
+
+
+if [ "$#" -ne "5" ];
+then
+	echo "Usage: test_proxy.sh <test_gtm_proxy> <num_clients> <num_xacts> <num_stmts> <num_worker_threads>"
+	# A usage error is a failure; report it through the exit status.
+	exit 1
+fi
+
+# <test_gtm_proxy> is the literal string "true" to point the clients at the
+# proxies; any other value points them at the GTM server directly.
+TEST_GTM_PROXY=$1
+NUM_CLIENTS=$2
+NUM_XACTS=$3
+NUM_STMTS=$4
+NUM_THREADS=$5
+
+
+# Stop and kill any gtm server or proxy processes
+#
+ssh $GTM_SERVER_HOSTNAME "killall -9 $GTM_SERVER_PROCESS"
+
+for index in ${!GTM_PROXY_HOSTNAMES[*]}
+do
+	ssh ${GTM_PROXY_HOSTNAMES[$index]} "killall -9 $GTM_PROXY_PROCESS" > /dev/null 2>&1
+done
+
+echo "Killed stale server and proxies - sleeping for 5 seconds"
+sleep 5
+
+# Remove any stale log and control files
+#
+ssh $GTM_SERVER_HOSTNAME "rm -f $GTM_SERVER_LOG_FILE $GTM_SERVER_CONTROL_FILE"
+for index in ${!GTM_PROXY_HOSTNAMES[*]}
+do
+	ssh ${GTM_PROXY_HOSTNAMES[$index]} "rm -f ${GTM_PROXY_LOG_FILE}_$index"
+done
+
+# Create an output directory to store all test related data
+#
+OUTPUT_DIR=output
+dir=`date "+%F-%H-%M-%S"`
+echo "Creating output directory $OUTPUT_DIR/$dir"
+mkdir -p $OUTPUT_DIR/$dir
+
+
+# Start the GTM server
+#
+echo "Starting GTM server at $GTM_SERVER_HOSTNAME on port $GTM_SERVER_PORT"
+ssh $GTM_SERVER_HOSTNAME "$GTM_SERVER -h $GTM_SERVER_HOSTNAME -p $GTM_SERVER_PORT -l $GTM_SERVER_LOG_FILE&"&
+
+echo "Sleeping for 3 seconds"
+sleep 3
+
+# Start the GTM proxy on all nodes
+#
+for index in ${!GTM_PROXY_HOSTNAMES[*]}
+do
+	echo "Starting GTM proxy at ${GTM_PROXY_HOSTNAMES[$index]} on port ${GTM_PROXY_PORTS[$index]} - $NUM_THREADS worker threads"
+	ssh ${GTM_PROXY_HOSTNAMES[$index]} "$GTM_PROXY -h ${GTM_PROXY_HOSTNAMES[$index]} -p ${GTM_PROXY_PORTS[$index]} -s $GTM_SERVER_HOSTNAME -t $GTM_SERVER_PORT -n $NUM_THREADS -l ${GTM_PROXY_LOG_FILE}_$index&"&
+done
+
+echo "Sleeping for 3 seconds"
+sleep 3
+
+# Kill all clients
+#
+for index in ${!GTM_PROXY_HOSTNAMES[*]}
+do
+	ssh ${GTM_PROXY_HOSTNAMES[$index]} "killall -9 $GTM_TEST_CLIENT_PROCESS" > /dev/null 2>&1
+done
+
+echo "Killed all stale clients -- sleeping for 5 seconds"
+sleep 5
+
+# Remove any stale result files
+#
+for index in ${!GTM_PROXY_HOSTNAMES[*]}
+do
+	ssh ${GTM_PROXY_HOSTNAMES[$index]} "rm -f TEST_OUTPUT_$index TEST_OUTPUT_$index.CSV TEST_END_$index"
+done
+
+# Write out some information about the test configuration
+#
+# Compare the flag as a string.  Writing "( $TEST_GTM_PROXY -eq true )" would
+# instead execute the flag's value as a command inside a subshell.
+if [ "$TEST_GTM_PROXY" = "true" ];
+then
+	echo "Testing GTM Proxy Configuration" >> $OUTPUT_DIR/$dir/TEST_SUMMARY
+	echo "" >> $OUTPUT_DIR/$dir/TEST_SUMMARY
+	echo "Number of GTM Proxy Worker Threads $NUM_THREADS" >> $OUTPUT_DIR/$dir/TEST_SUMMARY
+	echo "" >> $OUTPUT_DIR/$dir/TEST_SUMMARY
+	echo "" >> $OUTPUT_DIR/$dir/TEST_SUMMARY
+else
+	echo "Testing GTM Server Configuration" >> $OUTPUT_DIR/$dir/TEST_SUMMARY
+	echo "" >> $OUTPUT_DIR/$dir/TEST_SUMMARY
+	echo "" >> $OUTPUT_DIR/$dir/TEST_SUMMARY
+fi
+
+# Start the stats collection scripts. Kill any stale commands and remove the old files first
+#
+ssh $GTM_SERVER_HOSTNAME "killall -9 vmstat" > /dev/null 2>&1
+ssh $GTM_SERVER_HOSTNAME "rm -f TEST_VMSTATS_GTM" > /dev/null 2>&1
+ssh $GTM_SERVER_HOSTNAME "vmstat 1 > TEST_VMSTATS_GTM&"&
+
+for index in ${!GTM_PROXY_HOSTNAMES[*]}
+do
+	ssh ${GTM_PROXY_HOSTNAMES[$index]} "killall -9 vmstat" > /dev/null 2>&1
+	ssh ${GTM_PROXY_HOSTNAMES[$index]} "rm -f TEST_VMSTATS_$index" > /dev/null 2>&1
+	ssh ${GTM_PROXY_HOSTNAMES[$index]} "vmstat 1 > TEST_VMSTATS_$index&"&
+done
+
+# Start the clients
+#
+# Local copies of the end markers from an earlier run would make the wait
+# loop below finish immediately, so remove them first.
+rm -f TEST_END*
+
+echo "Starting clients"
+for index in ${!GTM_PROXY_HOSTNAMES[*]}
+do
+	if [ "$TEST_GTM_PROXY" = "true" ];
+	then
+		SERVER_HOSTNAME=${GTM_PROXY_HOSTNAMES[$index]};
+		SERVER_PORT=${GTM_PROXY_PORTS[$index]};
+	else
+		SERVER_HOSTNAME=$GTM_SERVER_HOSTNAME;
+		SERVER_PORT=$GTM_SERVER_PORT;
+	fi
+	ssh ${GTM_PROXY_HOSTNAMES[$index]} "$GTM_TEST_CLIENT -h $SERVER_HOSTNAME -p $SERVER_PORT -c $NUM_CLIENTS -n $NUM_XACTS -s $NUM_STMTS -i $index &"&
+done
+
+# Wait for all the clients to finish
+#
+# Each round tries to fetch TEST_END_<index> from every host; the run is
+# complete once all of the marker files exist locally.
+while true
+do
+	all_done=true
+	for index in ${!GTM_PROXY_HOSTNAMES[*]}
+	do
+		scp ${GTM_PROXY_HOSTNAMES[$index]}:TEST_END_$index . > /dev/null 2>&1
+		if ! [ -f TEST_END_$index ];
+		then
+			all_done=false;
+		fi;
+	done
+
+	if [ "$all_done" = "true" ]; then break; fi
+	sleep 5;
+done
+
+echo "All clients finished"
+
+# Copy GTM server log files
+#
+scp $GTM_SERVER_HOSTNAME:$GTM_SERVER_LOG_FILE $OUTPUT_DIR/$dir > /dev/null 2>&1
+
+# Copy GTM server vmstat file
+scp $GTM_SERVER_HOSTNAME:TEST_VMSTATS_GTM $OUTPUT_DIR/$dir > /dev/null 2>&1
+ssh $GTM_SERVER_HOSTNAME "killall -9 vmstat" > /dev/null 2>&1
+ssh $GTM_SERVER_HOSTNAME "rm -f TEST_VMSTATS_GTM" > /dev/null 2>&1
+
+# Copy GTM Proxy log file and the results
+for index in ${!GTM_PROXY_HOSTNAMES[*]}
+do
+	scp ${GTM_PROXY_HOSTNAMES[$index]}:TEST_OUTPUT_$index $OUTPUT_DIR/$dir/ > /dev/null 2>&1
+	scp ${GTM_PROXY_HOSTNAMES[$index]}:TEST_OUTPUT_$index.CSV $OUTPUT_DIR/$dir/ > /dev/null 2>&1
+	scp ${GTM_PROXY_HOSTNAMES[$index]}:${GTM_PROXY_LOG_FILE}_$index $OUTPUT_DIR/$dir/ > /dev/null 2>&1
+	scp ${GTM_PROXY_HOSTNAMES[$index]}:TEST_VMSTATS_$index $OUTPUT_DIR/$dir/ > /dev/null 2>&1
+	ssh ${GTM_PROXY_HOSTNAMES[$index]} "killall -9 vmstat" > /dev/null 2>&1
+	ssh ${GTM_PROXY_HOSTNAMES[$index]} "rm -f TEST_VMSTATS_$index" > /dev/null 2>&1
+done
+
+# Paste the result in the summary file
+#
+for index in ${!GTM_PROXY_HOSTNAMES[*]}
+do
+	cat $OUTPUT_DIR/$dir/TEST_OUTPUT_$index >> $OUTPUT_DIR/$dir/TEST_SUMMARY
+done
+
+echo "Done"
diff --git a/src/gtm/client/test/test_seq.c b/src/gtm/client/test/test_seq.c
new file mode 100644
index 0000000000..da0ed91ee2
--- /dev/null
+++ b/src/gtm/client/test/test_seq.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ */
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtm/gtm_c.h"
+#include "gtm/libpq-fe.h"
+#include "gtm/gtm_client.h"
+
+#define client_log(x) printf x
+
+/*
+ * Sequence test driver: opens 20 sequences on the GTM, forks so that
+ * several processes read and advance them concurrently, and finally has the
+ * original process close the sequences again.
+ */
+int
+main(int argc, char *argv[])
+{
+	int			ii;
+	pid_t		parent_pid;
+	GTM_Conn   *conn;
+
+	conn = PQconnectGTM("host=localhost port=6666 coordinator_id=1");
+	if (conn == NULL)
+	{
+		client_log(("Error in connection\n"));
+		exit(1);
+	}
+
+	parent_pid = getpid();
+
+	/*
+	 * Create sequences
+	 */
+	for (ii = 0; ii < 20; ii++)
+	{
+		char		buf[100];
+		GTM_SequenceKeyData seqkey;
+
+		snprintf(buf, sizeof(buf), "%d:%d", ii, ii);
+		seqkey.gsk_keylen = strlen(buf);
+		seqkey.gsk_key = buf;
+		if (open_sequence(conn, &seqkey, 10, 1, 10000, 100, false))
+			client_log(("Open seq failed\n"));
+		else
+			client_log(("Opened Sequence %s\n", seqkey.gsk_key));
+	}
+
+	/*
+	 * Close the GTM connection
+	 */
+	GTMPQfinish(conn);
+
+	/*
+	 * Start a few processes which independently use the sequences.  Every
+	 * existing process forks on each pass, so three passes leave eight
+	 * processes running.
+	 */
+	for (ii = 0; ii < 3; ii++)
+		fork();
+
+	/*
+	 * Each process now opens a new connection with the GTM.  The result must
+	 * be checked again: the calls below use the connection unconditionally.
+	 */
+	conn = PQconnectGTM("host=localhost port=6666 coordinator_id=1");
+	if (conn == NULL)
+	{
+		client_log(("Error in connection\n"));
+		exit(1);
+	}
+
+	/*
+	 * Try to read/increment the sequence
+	 */
+	for (ii = 0; ii < 20; ii++)
+	{
+		char		buf[100];
+		GTM_SequenceKeyData seqkey;
+		GTM_Sequence seqval;
+		int			jj;
+
+		snprintf(buf, sizeof(buf), "%d:%d", ii, ii);
+		seqkey.gsk_keylen = strlen(buf);
+		seqkey.gsk_key = buf;
+
+		/* The casts make the argument match %lld whatever GTM_Sequence is. */
+		if ((seqval = get_current(conn, &seqkey)) == InvalidSequenceValue)
+			client_log(("get_current seq failed for sequene %s\n", seqkey.gsk_key));
+		else
+			client_log(("CURRENT SEQVAL(%s): %lld\n", seqkey.gsk_key, (long long) seqval));
+
+		for (jj = 0; jj < 5; jj++)
+		{
+			if ((seqval = get_next(conn, &seqkey)) == InvalidSequenceValue)
+				client_log(("get_next seq failed for sequence %s\n", seqkey.gsk_key));
+			else
+				client_log(("NEXT SEQVAL(%s): %lld ", seqkey.gsk_key, (long long) seqval));
+		}
+		client_log(("\n"));
+	}
+
+	/*
+	 * The main process now closes the sequences. We want to call close only
+	 * once, hence this approach
+	 */
+	if (getpid() == parent_pid)
+	{
+		/*
+		 * Wait long enough so that all other processes are done
+		 */
+		sleep(20);
+		for (ii = 0; ii < 20; ii++)
+		{
+			char		buf[100];
+			GTM_SequenceKeyData seqkey;
+
+			snprintf(buf, sizeof(buf), "%d:%d", ii, ii);
+			seqkey.gsk_keylen = strlen(buf);
+			seqkey.gsk_key = buf;
+			if (close_sequence(conn, &seqkey))
+				client_log(("Close seq failed for sequence %s\n", seqkey.gsk_key));
+			else
+				client_log(("Sequene closed %s\n", seqkey.gsk_key));
+		}
+	}
+	GTMPQfinish(conn);
+	return 0;
+}
diff --git a/src/gtm/client/test/test_snap.c b/src/gtm/client/test/test_snap.c
new file mode 100644
index 0000000000..a2ce2f965a
--- /dev/null
+++ b/src/gtm/client/test/test_snap.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ */
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtm/gtm_c.h"
+#include "gtm/libpq-fe.h"
+#include "gtm/gtm_client.h"
+
+#define client_log(x) printf x
+
+/*
+ * Snapshot test driver: forks into several processes, each of which begins
+ * 20 transactions, prints a snapshot for the first five, prepares all of
+ * them and then alternately rolls back and commits them.
+ */
+int
+main(int argc, char *argv[])
+{
+	int			txn;
+	GlobalTransactionId gxid[4000];
+	GTM_Conn   *conn;
+
+	/* Every existing process forks on each of the three passes. */
+	for (txn = 0; txn < 3; txn++)
+		fork();
+
+	conn = PQconnectGTM("host=localhost port=6666 coordinator_id=1");
+	if (conn == NULL)
+	{
+		client_log(("Error in connection\n"));
+		exit(1);
+	}
+
+	/* Begin the transactions. */
+	for (txn = 0; txn < 20; txn++)
+	{
+		gxid[txn] = begin_transaction(conn, GTM_ISOLATION_RC);
+		if (gxid[txn] == InvalidGlobalTransactionId)
+			client_log(("BEGIN transaction failed for ii=%d\n", txn));
+		else
+			client_log(("Started a new transaction (GXID:%u)\n", gxid[txn]));
+	}
+
+	/* Fetch and print a snapshot for the first five transactions. */
+	for (txn = 0; txn < 5; txn++)
+	{
+		int			slot;
+		GTM_Snapshot snapshot = get_snapshot(conn, gxid[txn], true);
+
+		if (snapshot == NULL)
+			continue;
+
+		client_log(("Snapshot: GXID %u, xmin=%u, xmax=%u\n", gxid[txn],
+					snapshot->sn_xmin, snapshot->sn_xmax));
+		client_log(("xcnt=%d %s", snapshot->sn_xcnt,
+					snapshot->sn_xcnt > 0 ? "xip=(" : ""));
+
+		/* List the xip entries; the last one closes the parenthesis. */
+		for (slot = 0; slot < snapshot->sn_xcnt; slot++)
+			client_log(("%d%c ", snapshot->sn_xip[slot],
+						((slot + 1) == snapshot->sn_xcnt) ? ')' : ','));
+		client_log(("\n"));
+	}
+
+	/* Prepare every transaction on two nodes. */
+	for (txn = 0; txn < 20; txn++)
+	{
+		PGXC_NodeId nodes[5] = {1, 1};
+
+		if (prepare_transaction(conn, gxid[txn], 2, nodes))
+			client_log(("PREPARE failed (GXID:%u)\n", gxid[txn]));
+		else
+			client_log(("PREPARE successful (GXID:%u)\n", gxid[txn]));
+	}
+
+	/* Finish: odd-numbered slots commit, even-numbered slots roll back. */
+	for (txn = 0; txn < 20; txn++)
+	{
+		if (txn % 2 != 0)
+		{
+			if (commit_transaction(conn, gxid[txn]))
+				client_log(("COMMIT failed (GXID:%u)\n", gxid[txn]));
+			else
+				client_log(("COMMIT successful (GXID:%u)\n", gxid[txn]));
+		}
+		else
+		{
+			if (abort_transaction(conn, gxid[txn]))
+				client_log(("ROLLBACK failed (GXID:%u)\n", gxid[txn]));
+			else
+				client_log(("ROLLBACK successful (GXID:%u)\n", gxid[txn]));
+		}
+	}
+
+	GTMPQfinish(conn);
+	return 0;
+}
diff --git a/src/gtm/client/test/test_snapperf.c b/src/gtm/client/test/test_snapperf.c
new file mode 100644
index 0000000000..bc0e511e2b
--- /dev/null
+++ b/src/gtm/client/test/test_snapperf.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ */
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtm/gtm_c.h"
+#include "gtm/libpq-fe.h"
+#include "gtm/gtm_client.h"
+
+#define client_log(x)
+
+/*
+ * Snapshot performance driver: over a single connection, runs LOOP_COUNT
+ * rounds of TXN_COUNT transactions.  Each transaction is begun, requests one
+ * snapshot and is then rolled back (even slots) or committed (odd slots).
+ * client_log() expands to nothing in this file, so the result checks below
+ * produce no output.
+ */
+int
+main(int argc, char *argv[])
+{
+	int			ii;
+	int			jj;
+
+#define TXN_COUNT 10000
+#define LOOP_COUNT 10
+
+	GlobalTransactionId gxid[TXN_COUNT];
+	GTM_Conn   *conn;
+
+	conn = PQconnectGTM("host=localhost port=6666 coordinator_id=1");
+	if (conn == NULL)
+	{
+		client_log(("Error in connection\n"));
+		exit(1);
+	}
+
+	for (jj = 0; jj < LOOP_COUNT; jj++)
+	{
+		for (ii = 0; ii < TXN_COUNT; ii++)
+		{
+			gxid[ii] = begin_transaction(conn, GTM_ISOLATION_RC);
+			if (gxid[ii] != InvalidGlobalTransactionId)
+				client_log(("Started a new transaction (GXID:%u)\n", gxid[ii]));
+			else
+				client_log(("BEGIN transaction failed for ii=%d\n", ii));
+
+			/* Only the request itself matters here; the snapshot is unused. */
+			(void) get_snapshot(conn, gxid[ii], true);
+
+			if (ii % 2 == 0)
+			{
+				if (!abort_transaction(conn, gxid[ii]))
+					client_log(("ROLLBACK successful (GXID:%u)\n", gxid[ii]));
+				else
+					client_log(("ROLLBACK failed (GXID:%u)\n", gxid[ii]));
+			}
+			else
+			{
+				if (!commit_transaction(conn, gxid[ii]))
+					client_log(("COMMIT successful (GXID:%u)\n", gxid[ii]));
+				else
+					client_log(("COMMIT failed (GXID:%u)\n", gxid[ii]));
+			}
+		}
+	}
+
+	GTMPQfinish(conn);
+	return 0;
+}
diff --git a/src/gtm/client/test/test_txn.c b/src/gtm/client/test/test_txn.c
new file mode 100644
index 0000000000..01ed3decbd
--- /dev/null
+++ b/src/gtm/client/test/test_txn.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ */
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtm/gtm_c.h"
+#include "gtm/libpq-fe.h"
+#include "gtm/gtm_client.h"
+
+#define client_log(x) printf x
+
+/*
+ * Transaction test driver: forks into several processes, each of which
+ * begins 20 serializable transactions, prepares all of them and then
+ * alternately rolls back and commits them.
+ */
+int
+main(int argc, char *argv[])
+{
+	int			txn;
+	GlobalTransactionId gxid[4000];
+	GTM_Conn   *conn;
+
+	/* Every existing process forks on each of the three passes. */
+	for (txn = 0; txn < 3; txn++)
+		fork();
+
+	conn = PQconnectGTM("host=localhost port=6666 coordinator_id=1");
+	if (conn == NULL)
+	{
+		client_log(("Error in connection\n"));
+		exit(1);
+	}
+
+	/* Begin the transactions. */
+	for (txn = 0; txn < 20; txn++)
+	{
+		gxid[txn] = begin_transaction(conn, GTM_ISOLATION_SERIALIZABLE);
+		if (gxid[txn] == InvalidGlobalTransactionId)
+			client_log(("BEGIN transaction failed for ii=%d\n", txn));
+		else
+			client_log(("Started a new transaction (GXID:%u)\n", gxid[txn]));
+	}
+
+	/* Prepare every transaction on two nodes. */
+	for (txn = 0; txn < 20; txn++)
+	{
+		PGXC_NodeId nodes[5] = {1, 1};
+
+		if (prepare_transaction(conn, gxid[txn], 2, nodes))
+			client_log(("PREPARE failed (GXID:%u)\n", gxid[txn]));
+		else
+			client_log(("PREPARE successful (GXID:%u)\n", gxid[txn]));
+	}
+
+	/* Finish: odd-numbered slots commit, even-numbered slots roll back. */
+	for (txn = 0; txn < 20; txn++)
+	{
+		if (txn % 2 != 0)
+		{
+			if (commit_transaction(conn, gxid[txn]))
+				client_log(("COMMIT failed (GXID:%u)\n", gxid[txn]));
+			else
+				client_log(("COMMIT successful (GXID:%u)\n", gxid[txn]));
+		}
+		else
+		{
+			if (abort_transaction(conn, gxid[txn]))
+				client_log(("ROLLBACK failed (GXID:%u)\n", gxid[txn]));
+			else
+				client_log(("ROLLBACK successful (GXID:%u)\n", gxid[txn]));
+		}
+	}
+
+	GTMPQfinish(conn);
+	return 0;
+}
diff --git a/src/gtm/client/test/test_txnperf.c b/src/gtm/client/test/test_txnperf.c
new file mode 100644
index 0000000000..174f0a8bab
--- /dev/null
+++ b/src/gtm/client/test/test_txnperf.c
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ */
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtm/gtm_c.h"
+#include "gtm/libpq-fe.h"
+#include "gtm/gtm_client.h"
+#include <sys/time.h>
+#include <sys/file.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#define client_log(x)
+
+extern int optind;
+extern char *optarg;
+
+/*
+ * Store t1 - t2 in *result, normalized so that tv_usec is not negative
+ * when a second has to be borrowed.
+ */
+static void
+diffTime(struct timeval *t1, struct timeval *t2, struct timeval *result)
+{
+	int			sec_delta = t1->tv_sec - t2->tv_sec;
+	int			usec_delta = t1->tv_usec - t2->tv_usec;
+	const int	borrow = (usec_delta < 0);
+
+	/* A negative microsecond part borrows one second from the seconds. */
+	result->tv_sec = sec_delta - borrow;
+	result->tv_usec = usec_delta + borrow * 1000000;
+}
+
+/*
+ * help --- print the command-line usage summary to stdout.
+ *
+ * progname is shown as the program name.  The option list should match the
+ * option string that main() passes to getopt().
+ */
+static void
+help(const char *progname)
+{
+	printf(_("Usage:\n  %s [OPTION]...\n\n"), progname);
+	printf(_("Options:\n"));
+	printf(_("  -h hostname	GTM proxy/server hostname/IP\n"));
+	printf(_("  -p port		GTM proxy/server port number\n"));
+	printf(_("  -c count		Number of clients\n"));
+	printf(_("  -n count		Number of transactions per client\n"));
+	printf(_("  -s count		Number of statements per transaction\n"));
+	printf(_("  -i id			Coordinator ID\n"));
+}
+
+/*
+ * Transaction performance driver.
+ *
+ * Parses the options, appends a new TEST-ID section to TEST_OUTPUT_<id>,
+ * then forks so that nclients processes (the parent included) each run
+ * ntxns_per_cli transactions with nstmts_per_txn snapshot requests apiece.
+ * Every process appends one line to TEST_OUTPUT_<id>.CSV.  The parent also
+ * writes the summary and, once all output is on disk, creates
+ * TEST_END_<id>.
+ *
+ * All six options (-h -p -c -n -s -i) are required; the program exits with
+ * status 1 when one is missing or an output file cannot be opened.
+ */
+int
+main(int argc, char *argv[])
+{
+	int			ii;
+	int			jj;
+	int			kk;
+	int			len;
+	int			is_parent;
+	char		connect_string[100];
+	int			gtmport = -1;
+	int			coordinator_id = -1;
+	int			nclients = -1;
+	int			ntxns_per_cli = -1;
+	int			nstmts_per_txn = -1;
+	char	   *gtmhost = NULL;
+	int			opt;			/* int, not char: getopt() returns -1 when done */
+	struct timeval starttime, endtime, diff;
+	FILE	   *fp;
+	FILE	   *fp2;
+	char		buf[1024];
+	int			testid, this_testid, max_testid;
+	int			snapsize = 0;
+	float		avg_sanpsize = 0;
+	pid_t		parent_pid;
+
+#define TXN_COUNT 1000
+
+	GlobalTransactionId gxid[TXN_COUNT];
+	GTM_Conn   *conn;
+	char		test_output[256], test_end[256], test_output_csv[256];
+	char		system_cmd[1024];
+
+	/*
+	 * Catch standard options before doing much else
+	 */
+	if (argc > 1)
+	{
+		if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
+		{
+			help(argv[0]);
+			exit(0);
+		}
+	}
+
+	/*
+	 * Parse the command line options and set variables
+	 */
+	while ((opt = getopt(argc, argv, "h:p:c:n:s:i:")) != -1)
+	{
+		switch (opt)
+		{
+			case 'h':
+				gtmhost = strdup(optarg);
+				break;
+
+			case 'p':
+				gtmport = atoi(optarg);
+				break;
+
+			case 'c':
+				nclients = atoi(optarg);
+				break;
+
+			case 'n':
+				ntxns_per_cli = atoi(optarg);
+				break;
+
+			case 's':
+				nstmts_per_txn = atoi(optarg);
+				break;
+
+			case 'i':
+				coordinator_id = atoi(optarg);
+				break;
+
+			default:
+				fprintf(stderr, "Unrecognized option %c\n", opt);
+				help(argv[0]);
+				exit(1);
+		}
+	}
+
+	/*
+	 * Every option feeds a value used below (connection string, file names,
+	 * loop bounds), so refuse to run with any of them missing instead of
+	 * reading uninitialized variables.
+	 */
+	if (gtmhost == NULL || gtmport < 0 || coordinator_id < 0 ||
+		nclients < 0 || ntxns_per_cli < 0 || nstmts_per_txn < 0)
+	{
+		fprintf(stderr, "All of -h, -p, -c, -n, -s and -i must be specified\n");
+		help(argv[0]);
+		exit(1);
+	}
+
+	snprintf(test_output, sizeof(test_output), "TEST_OUTPUT_%d", coordinator_id);
+	snprintf(test_end, sizeof(test_end), "TEST_END_%d", coordinator_id);
+	snprintf(test_output_csv, sizeof(test_output_csv), "TEST_OUTPUT_%d.CSV", coordinator_id);
+
+	len = snprintf(connect_string, sizeof(connect_string),
+				   "host=%s port=%d coordinator_id=%d",
+				   gtmhost, gtmport, coordinator_id);
+	if (len < 0 || (size_t) len >= sizeof(connect_string))
+	{
+		fprintf(stderr, "Connection string is too long\n");
+		exit(1);
+	}
+
+	/* Stamp the output file with a separator and the current date. */
+	snprintf(system_cmd, sizeof(system_cmd), "echo -------------------------------------------------------- >> %s", test_output);
+	system(system_cmd);
+	snprintf(system_cmd, sizeof(system_cmd), "date >> %s", test_output);
+	system(system_cmd);
+	snprintf(system_cmd, sizeof(system_cmd), "echo -------------------------------------------------------- >> %s", test_output);
+	system(system_cmd);
+
+	fp = fopen(test_output, "a+");
+	if (fp == NULL)
+	{
+		perror(test_output);
+		exit(1);
+	}
+	fp2 = fopen(test_output_csv, "a+");
+	if (fp2 == NULL)
+	{
+		perror(test_output_csv);
+		fclose(fp);
+		exit(1);
+	}
+
+	/* The new test id is one more than the largest id already recorded. */
+	max_testid = 0;
+	while (fgets(buf, sizeof(buf), fp) != NULL)
+	{
+		if (sscanf(buf, "TEST-ID: %d", &testid) == 1)
+		{
+			if (max_testid < testid)
+				max_testid = testid;
+		}
+	}
+
+	this_testid = max_testid + 1;
+
+	fprintf(fp, "TEST-ID: %d", this_testid);
+	fprintf(fp, "\n\n");
+	/* Flush before fork() so no child inherits this text in its buffer. */
+	fflush(fp);
+
+	parent_pid = getpid();
+
+	gettimeofday(&starttime, NULL);
+
+	/*
+	 * Start the clients.  The parent acts as one client itself, so only
+	 * nclients - 1 children are forked; each child leaves the loop at once.
+	 */
+	for (ii = 1; ii < nclients; ii++)
+	{
+		pid_t		cpid = fork();
+
+		if (cpid == 0)
+			break;
+		if (cpid < 0)
+		{
+			perror("fork");
+			break;
+		}
+	}
+
+	is_parent = (getpid() == parent_pid);
+
+	if (is_parent)
+		fprintf(stderr, "started %d clients\n", nclients);
+
+	conn = PQconnectGTM(connect_string);
+	if (conn == NULL)
+	{
+		client_log(("Error in connection\n"));
+		exit(1);
+	}
+
+	/* Children time only their own work, starting after they connect. */
+	if (!is_parent)
+		gettimeofday(&starttime, NULL);
+
+	snapsize = 0;
+
+	for (jj = 0; jj <= ntxns_per_cli / TXN_COUNT; jj++)
+	{
+		for (ii = 0; ii < TXN_COUNT; ii++)
+		{
+			PGXC_NodeId nodes[5];
+
+			if ((jj * TXN_COUNT) + ii >= ntxns_per_cli)
+				break;
+
+			gxid[ii] = begin_transaction(conn, GTM_ISOLATION_RC);
+			if (gxid[ii] != InvalidGlobalTransactionId)
+				client_log(("Started a new transaction (GXID:%u)\n", gxid[ii]));
+			else
+				client_log(("BEGIN transaction failed for ii=%d\n", ii));
+
+			for (kk = 0; kk < nstmts_per_txn; kk++)
+			{
+				GTM_Snapshot snapshot = get_snapshot(conn, gxid[ii], true);
+
+				/* A failed request yields no snapshot to account for. */
+				if (snapshot != NULL)
+					snapsize += snapshot->sn_xcnt;
+			}
+
+			nodes[0] = 1;
+			nodes[1] = 1;
+
+			if (!prepare_transaction(conn, gxid[ii], 2, nodes))
+				client_log(("PREPARE successful (GXID:%u)\n", gxid[ii]));
+			else
+				client_log(("PREPARE failed (GXID:%u)\n", gxid[ii]));
+
+			if (ii % 2 == 0)
+			{
+				if (!abort_transaction(conn, gxid[ii]))
+					client_log(("ROLLBACK successful (GXID:%u)\n", gxid[ii]));
+				else
+					client_log(("ROLLBACK failed (GXID:%u)\n", gxid[ii]));
+			}
+			else
+			{
+				if (!commit_transaction(conn, gxid[ii]))
+					client_log(("COMMIT successful (GXID:%u)\n", gxid[ii]));
+				else
+					client_log(("COMMIT failed (GXID:%u)\n", gxid[ii]));
+			}
+		}
+
+		fprintf(stderr, "client [%d] finished %d transactions\n", (int) getpid(), (jj * TXN_COUNT) + ii);
+	}
+
+	GTMPQfinish(conn);
+
+	if (is_parent)
+	{
+		/* The parent's elapsed time covers all of the children as well. */
+		for (ii = 1; ii < nclients; ii++)
+			wait(NULL);
+
+		gettimeofday(&endtime, NULL);
+		diffTime(&endtime, &starttime, &diff);
+		avg_sanpsize = ((float) snapsize) / (ntxns_per_cli * nstmts_per_txn);
+
+		fprintf(fp, "\n");
+		fprintf(fp, "Num of client: %d\n", nclients);
+		fprintf(fp, "Num of txns/client: %d\n", ntxns_per_cli);
+		fprintf(fp, "Num of statements/txn: %d\n", nstmts_per_txn);
+		fprintf(fp, "TPS: %2f\n", (ntxns_per_cli * nclients) / ((float)((diff.tv_sec * 1000000) + diff.tv_usec)/1000000));
+		fprintf(fp, "Total snapshot size: %d\n", snapsize);
+		fprintf(fp, "Average snapshot size: %f\n", avg_sanpsize);
+
+		/* Zero-pad the microseconds so that 1s 5us prints as 1.000005. */
+		fprintf(fp, "Time: %ld.%06ld\n", (long) diff.tv_sec, (long) diff.tv_usec);
+		fprintf(fp, "\n");
+	}
+	else
+	{
+		gettimeofday(&endtime, NULL);
+		diffTime(&endtime, &starttime, &diff);
+		avg_sanpsize = ((float) snapsize) / (ntxns_per_cli * nstmts_per_txn);
+	}
+
+	/*
+	 * Append this process's CSV record; the last column is "true" for the
+	 * parent.  The line is flushed while the lock is still held.
+	 *
+	 * NOTE(review): fp2 was opened before fork(), so all clients share one
+	 * open file description and flock() may not serialize them against each
+	 * other -- confirm, or reopen the file in each process.
+	 */
+	flock(fileno(fp2), LOCK_EX);
+	fprintf(fp2, "%d,%d,%d,%d,%ld,%ld,%d,%f,%s\n", this_testid, nclients,
+			ntxns_per_cli, nstmts_per_txn, (long) diff.tv_sec,
+			(long) diff.tv_usec, snapsize, avg_sanpsize,
+			is_parent ? "true" : "false");
+	fflush(fp2);
+	flock(fileno(fp2), LOCK_UN);
+	fclose(fp2);
+
+	fclose(fp);
+
+	/*
+	 * Create the end marker only after both output files are closed, so
+	 * anything waiting for the marker finds complete results.
+	 */
+	if (is_parent)
+	{
+		snprintf(system_cmd, sizeof(system_cmd), "touch %s", test_end);
+		system(system_cmd);
+	}
+
+	return 0;
+}
diff --git a/src/gtm/common/Makefile b/src/gtm/common/Makefile
new file mode 100644
index 0000000000..104382c9c9
--- /dev/null
+++ b/src/gtm/common/Makefile
@@ -0,0 +1,25 @@
+# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+
+# Build rules for the "gtm" common library (library base name set in NAME).
+
+top_build_dir=../..
+include $(top_build_dir)/gtm/Makefile.global
+
+# Library base name; the "clean" rule below removes the matching libgtm.so* files.
+NAME=gtm
+
+
+# Shared-library version; corresponds to the libgtm.so.1 / libgtm.so.1.0
+# names listed in the "clean" rule.
+SO_MAJOR_VERSION= 1
+SO_MINOR_VERSION= 0
+
+
+# Object files of this directory (removed again by "clean").
+OBJS=aset.o mcxt.o elog.o assert.o stringinfo.o gtm_lock.o gtm_list.o
+
+# all-lib is not defined in this file; presumably it comes from the
+# Makefile.shlib included below -- TODO confirm.
+all:all-lib
+
+include $(top_build_dir)/Makefile.shlib
+
+# NOTE(review): the library file names below are hard-coded instead of being
+# derived from NAME / SO_MAJOR_VERSION / SO_MINOR_VERSION, so they must be
+# updated by hand whenever those variables change.
+clean:
+ rm -f $(OBJS)
+ rm -f libgtm.so libgtm.so.1 libgtm.so.1.0
+
+distclean: clean
+
+maintainer-clean: distclean
diff --git a/src/gtm/common/aset.c b/src/gtm/common/aset.c
new file mode 100644
index 0000000000..aa9533009a
--- /dev/null
+++ b/src/gtm/common/aset.c
@@ -0,0 +1,1261 @@
+/*-------------------------------------------------------------------------
+ *
+ * aset.c
+ * Allocation set definitions.
+ *
+ * AllocSet is our standard implementation of the abstract MemoryContext
+ * type.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * IDENTIFICATION
+ * $PostgreSQL: pgsql/src/backend/utils/mmgr/aset.c,v 1.77 2008/04/11 22:54:23 tgl Exp $
+ *
+ * NOTE:
+ * This is a new (Feb. 05, 1999) implementation of the allocation set
+ * routines. AllocSet...() does not use OrderedSet...() any more.
+ * Instead it manages allocations in a block pool by itself, combining
+ * many small allocations in a few bigger blocks. AllocSetFree() normally
+ * doesn't really free() memory. It just adds the freed area to some
+ * list for later reuse by AllocSetAlloc(). All memory blocks are free()'d
+ * at once on AllocSetReset(), which happens when the memory context gets
+ * destroyed.
+ * Jan Wieck
+ *
+ * Performance improvement from Tom Lane, 8/99: for extremely large request
+ * sizes, we do want to be able to give the memory back to free() as soon
+ * as it is pfree()'d. Otherwise we risk tying up a lot of memory in
+ * freelist entries that might never be usable. This is specially needed
+ * when the caller is repeatedly repalloc()'ing a block bigger and bigger;
+ * the previous instances of the block were guaranteed to be wasted until
+ * AllocSetReset() under the old way.
+ *
+ * Further improvement 12/00: as the code stood, request sizes in the
+ * midrange between "small" and "large" were handled very inefficiently,
+ * because any sufficiently large free chunk would be used to satisfy a
+ * request, even if it was much larger than necessary. This led to more
+ * and more wasted space in allocated chunks over time. To fix, get rid
+ * of the midrange behavior: we now handle only "small" power-of-2-size
+ * chunks as chunks. Anything "large" is passed off to malloc(). Change
+ * the number of freelists to change the small/large boundary.
+ *
+ *
+ * About CLOBBER_FREED_MEMORY:
+ *
+ * If this symbol is defined, all freed memory is overwritten with 0x7F's.
+ * This is useful for catching places that reference already-freed memory.
+ *
+ * About MEMORY_CONTEXT_CHECKING:
+ *
+ * Since we usually round request sizes up to the next power of 2, there
+ * is often some unused space immediately after a requested data area.
+ * Thus, if someone makes the common error of writing past what they've
+ * requested, the problem is likely to go unnoticed ... until the day when
+ * there *isn't* any wasted space, perhaps because of different memory
+ * alignment on a new platform, or some other effect. To catch this sort
+ * of problem, the MEMORY_CONTEXT_CHECKING option stores 0x7E just beyond
+ * the requested space whenever the request is less than the actual chunk
+ * size, and verifies that the byte is undamaged when the chunk is freed.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "gtm/gtm_c.h"
+#include "gtm/memutils.h"
+#include "gtm/elog.h"
+#include "gtm/assert.h"
+#include "gtm/gtm.h"
+
+/* Define this to detail debug alloc information */
+/* #define HAVE_ALLOCINFO */
+
+/*--------------------
+ * Chunk freelist k holds chunks of size 1 << (k + ALLOC_MINBITS),
+ * for k = 0 .. ALLOCSET_NUM_FREELISTS-1.
+ *
+ * Note that all chunks in the freelists have power-of-2 sizes. This
+ * improves recyclability: we may waste some space, but the wasted space
+ * should stay pretty constant as requests are made and released.
+ *
+ * A request too large for the last freelist is handled by allocating a
+ * dedicated block from malloc(). The block still has a block header and
+ * chunk header, but when the chunk is freed we'll return the whole block
+ * to malloc(), not put it on our freelists.
+ *
+ * CAUTION: ALLOC_MINBITS must be large enough so that
+ * 1<<ALLOC_MINBITS is at least MAXALIGN,
+ * or we may fail to align the smallest chunks adequately.
+ * 8-byte alignment is enough on all currently known machines.
+ *
+ * With the current parameters, request sizes up to 8K are treated as chunks,
+ * larger requests go into dedicated blocks. Change ALLOCSET_NUM_FREELISTS
+ * to adjust the boundary point.
+ *--------------------
+ */
+
+#define ALLOC_MINBITS 3 /* smallest chunk size is 8 bytes */
+/* Number of freelists; freelist k holds chunks of 1 << (k + ALLOC_MINBITS) bytes */
+#define ALLOCSET_NUM_FREELISTS 11
+/* With the two values above this evaluates to 1 << 13 = 8192 bytes */
+#define ALLOC_CHUNK_LIMIT (1 << (ALLOCSET_NUM_FREELISTS-1+ALLOC_MINBITS))
+/* Size of largest chunk that we use a fixed size for */
+
+/*--------------------
+ * The first block allocated for an allocset has size initBlockSize.
+ * Each time we have to allocate another block, we double the block size
+ * (if possible, and without exceeding maxBlockSize), so as to reduce
+ * the bookkeeping load on malloc().
+ *
+ * Blocks allocated to hold oversize chunks do not follow this rule, however;
+ * they are just however big they need to be to hold that single chunk.
+ *--------------------
+ */
+
+/*
+ * Header sizes rounded up with MAXALIGN. The code in this file places a
+ * block's first chunk at block + ALLOC_BLOCKHDRSZ and a chunk's user data
+ * at chunk + ALLOC_CHUNKHDRSZ.
+ */
+#define ALLOC_BLOCKHDRSZ MAXALIGN(sizeof(AllocBlockData))
+#define ALLOC_CHUNKHDRSZ MAXALIGN(sizeof(AllocChunkData))
+
+/* Pointer typedefs for the block and chunk headers defined further below */
+typedef struct AllocBlockData *AllocBlock; /* forward reference */
+typedef struct AllocChunkData *AllocChunk;
+
+/*
+ * AllocPointer
+ * Aligned pointer which may be a member of an allocation set.
+ * This is the user-visible address, i.e. the chunk header address plus
+ * ALLOC_CHUNKHDRSZ (see AllocChunkGetPointer).
+ */
+typedef void *AllocPointer;
+
+/*
+ * AllocSetContext is our standard implementation of MemoryContext.
+ *
+ * The generic header comes first so that a MemoryContext pointer can be cast
+ * to AllocSet and back, as the functions in this file do. The header also
+ * carries the is_shared flag and the lock that AllocSetContextCreate sets up
+ * for shared contexts.
+ *
+ * Note: isReset means there is nothing for AllocSetReset to do. This is
+ * different from the aset being physically empty (empty blocks list) because
+ * we may still have a keeper block. It's also different from the set being
+ * logically empty, because we don't attempt to detect pfree'ing the last
+ * active chunk.
+ */
+typedef struct AllocSetContext
+{
+ MemoryContextData header; /* Standard memory-context fields */
+ /* Info about storage allocated in this context: */
+ AllocBlock blocks; /* head of list of blocks in this set */
+ AllocChunk freelist[ALLOCSET_NUM_FREELISTS]; /* free chunk lists */
+ bool isReset; /* T = no space alloced since last reset */
+ /* Allocation parameters for this context: */
+ Size initBlockSize; /* initial block size */
+ Size maxBlockSize; /* maximum block size */
+ Size nextBlockSize; /* next block size to allocate */
+ Size allocChunkLimit; /* effective chunk size limit */
+ AllocBlock keeper; /* if not NULL, keep this block over resets */
+} AllocSetContext;
+
+/* Pointer form used throughout this file */
+typedef AllocSetContext *AllocSet;
+
+/*
+ * AllocBlock
+ * An AllocBlock is the unit of memory that is obtained by aset.c
+ * from malloc(). It contains one or more AllocChunks, which are
+ * the units requested by palloc() and freed by pfree(). AllocChunks
+ * cannot be returned to malloc() individually, instead they are put
+ * on freelists by pfree() and re-used by the next palloc() that has
+ * a matching request size.
+ *
+ * AllocBlockData is the header data for a block --- the usable space
+ * within the block begins at the next alignment boundary.
+ *
+ * For a dedicated single-chunk block (request larger than the set's
+ * allocChunkLimit) AllocSetAlloc sets freeptr == endptr, so such a block
+ * never has free space to carve further chunks from.
+ */
+typedef struct AllocBlockData
+{
+ AllocSet aset; /* aset that owns this block */
+ AllocBlock next; /* next block in aset's blocks list */
+ char *freeptr; /* start of free space in this block */
+ char *endptr; /* end of space in this block */
+} AllocBlockData;
+
+/*
+ * AllocChunk
+ * The prefix of each piece of memory in an AllocBlock
+ *
+ * NB: this MUST match StandardChunkHeader as defined by utils/memutils.h.
+ * NOTE(review): this file includes gtm/memutils.h, not utils/memutils.h;
+ * the path above looks inherited from PostgreSQL -- TODO confirm which
+ * header declares StandardChunkHeader in this tree.
+ */
+typedef struct AllocChunkData
+{
+ /* aset is the owning aset if allocated, or the freelist link if free */
+ void *aset;
+ /* size is always the size of the usable space in the chunk */
+ /* (it does not include the ALLOC_CHUNKHDRSZ header itself) */
+ Size size;
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* when debugging memory usage, also store actual requested size */
+ /* this is zero in a free chunk */
+ Size requested_size;
+#endif
+} AllocChunkData;
+
+/*
+ * AllocPointerIsValid
+ * True iff pointer is valid allocation pointer.
+ * (Only delegates to PointerIsValid; no membership check is done.)
+ */
+#define AllocPointerIsValid(pointer) PointerIsValid(pointer)
+
+/*
+ * AllocSetIsValid
+ * True iff set is valid allocation set.
+ * (Only delegates to PointerIsValid; the set's contents are not inspected.)
+ */
+#define AllocSetIsValid(set) PointerIsValid(set)
+
+/*
+ * Convert between a user pointer and its chunk header: the header sits
+ * exactly ALLOC_CHUNKHDRSZ bytes in front of the user data.
+ */
+#define AllocPointerGetChunk(ptr) \
+ ((AllocChunk)(((char *)(ptr)) - ALLOC_CHUNKHDRSZ))
+#define AllocChunkGetPointer(chk) \
+ ((AllocPointer)(((char *)(chk)) + ALLOC_CHUNKHDRSZ))
+
+/*
+ * These functions implement the MemoryContext API for AllocSet contexts.
+ * They are file-local and are only exported through the AllocSetMethods
+ * table below.
+ */
+static void *AllocSetAlloc(MemoryContext context, Size size);
+static void AllocSetFree(MemoryContext context, void *pointer);
+static void *AllocSetRealloc(MemoryContext context, void *pointer, Size size);
+static void AllocSetInit(MemoryContext context);
+static void AllocSetReset(MemoryContext context);
+static void AllocSetDelete(MemoryContext context);
+static Size AllocSetGetChunkSpace(MemoryContext context, void *pointer);
+static bool AllocSetIsEmpty(MemoryContext context);
+static void AllocSetStats(MemoryContext context, int level);
+
+/* Consistency checker, only built when memory-context checking is enabled */
+#ifdef MEMORY_CONTEXT_CHECKING
+static void AllocSetCheck(MemoryContext context);
+#endif
+
+/*
+ * This is the virtual function table for AllocSet contexts.
+ *
+ * The initializer is positional, so the entries must stay in the order of
+ * the members of MemoryContextMethods (declared outside this file --
+ * presumably in gtm/memutils.h; confirm before reordering).
+ */
+static MemoryContextMethods AllocSetMethods = {
+ AllocSetAlloc,
+ AllocSetFree,
+ AllocSetRealloc,
+ AllocSetInit,
+ AllocSetReset,
+ AllocSetDelete,
+ AllocSetGetChunkSpace,
+ AllocSetIsEmpty,
+ AllocSetStats
+#ifdef MEMORY_CONTEXT_CHECKING
+ ,AllocSetCheck
+#endif
+};
+
+
+/* ----------
+ * Debug macros
+ *
+ * When HAVE_ALLOCINFO is defined, each chunk allocation/free is traced to
+ * stderr as "<context name>: <chunk address>, <chunk size>". Otherwise
+ * both macros expand to nothing.
+ *
+ * The chunk size is a Size, so it is printed with %lu after a cast to
+ * unsigned long (the same convention the out-of-memory messages in this
+ * file use), and the chunk address is cast to void * as %p requires.
+ * ----------
+ */
+#ifdef HAVE_ALLOCINFO
+#define AllocFreeInfo(_cxt, _chunk) \
+	fprintf(stderr, "AllocFree: %s: %p, %lu\n", \
+		(_cxt)->header.name, (void *) (_chunk), \
+		(unsigned long) (_chunk)->size)
+#define AllocAllocInfo(_cxt, _chunk) \
+	fprintf(stderr, "AllocAlloc: %s: %p, %lu\n", \
+		(_cxt)->header.name, (void *) (_chunk), \
+		(unsigned long) (_chunk)->size)
+#else
+#define AllocFreeInfo(_cxt, _chunk)
+#define AllocAllocInfo(_cxt, _chunk)
+#endif
+
+/* ----------
+ * AllocSetFreeIndex -
+ *
+ *		Map a request size to the index of the freelist whose chunks are
+ *		just large enough for it: the smallest k such that
+ *		size <= 1 << (k + ALLOC_MINBITS). A size of 0 maps to list 0.
+ *		The caller must already have checked size <= ALLOC_CHUNK_LIMIT.
+ * ----------
+ */
+static inline int
+AllocSetFreeIndex(Size size)
+{
+	int			list_no = 0;
+	Size		remaining;
+
+	if (size == 0)
+		return list_no;
+
+	/* Count how many bits are needed above the minimum chunk size */
+	for (remaining = (size - 1) >> ALLOC_MINBITS;
+		 remaining != 0;
+		 remaining >>= 1)
+		list_no++;
+
+	Assert(list_no < ALLOCSET_NUM_FREELISTS);
+
+	return list_no;
+}
+
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+
+/*
+ * Overwrite "size" bytes starting at "ptr" with a repeating 1..251 byte
+ * sequence (251 is prime). The position in the sequence is remembered in
+ * a static variable across calls, so two allocations of the same length
+ * are likely to start out with different contents. This is not meant to
+ * be truly random.
+ */
+static void
+randomize_mem(char *ptr, size_t size)
+{
+	static int	save_ctr = 1;
+	int			fill = save_ctr;
+	char	   *stop = ptr + size;
+
+	for (; ptr != stop; ptr++)
+	{
+		*ptr = fill;
+		/* wrap back to 1 after 251 */
+		fill = (fill >= 251) ? 1 : fill + 1;
+	}
+
+	save_ctr = fill;
+}
+
+#endif /* RANDOMIZE_ALLOCATED_MEMORY */
+
+
+/*
+ * Public routines
+ */
+
+
+/*
+ * AllocSetContextCreate
+ *		Build a new AllocSet memory context and return it as a MemoryContext.
+ *
+ * parent: parent context, or NULL if top-level context
+ * name: name of context (for debugging --- string will be copied)
+ * minContextSize: minimum context size; when larger than the block plus
+ *		chunk header overhead, a "keeper" block of this size is malloc'd
+ *		right away and survives resets
+ * initBlockSize: initial allocation block size (raised to at least 1K)
+ * maxBlockSize: maximum allocation block size (raised to at least
+ *		initBlockSize)
+ * isShared: stored in the context header; when true the header lock is
+ *		initialized so the other routines in this file serialize on it
+ *
+ * Reports ERROR ("out of memory") if the keeper block cannot be allocated.
+ */
+MemoryContext
+AllocSetContextCreate(MemoryContext parent,
+					  const char *name,
+					  Size minContextSize,
+					  Size initBlockSize,
+					  Size maxBlockSize,
+					  bool isShared)
+{
+	AllocSet	aset;
+	Size		chunkLimit;
+
+	/* Generic, type-independent node creation */
+	aset = (AllocSet) MemoryContextCreate(sizeof(AllocSetContext),
+										  &AllocSetMethods,
+										  parent,
+										  name);
+
+	/*
+	 * Sanitize the block size parameters: both are MAXALIGN'd, the initial
+	 * size is at least 1K (a somewhat arbitrary minimum), and the maximum is
+	 * never below the initial size.
+	 */
+	initBlockSize = MAXALIGN(initBlockSize);
+	if (initBlockSize < 1024)
+		initBlockSize = 1024;
+	maxBlockSize = MAXALIGN(maxBlockSize);
+	if (maxBlockSize < initBlockSize)
+		maxBlockSize = initBlockSize;
+
+	aset->initBlockSize = initBlockSize;
+	aset->nextBlockSize = initBlockSize;
+	aset->maxBlockSize = maxBlockSize;
+
+	/*
+	 * Work out the largest request still served from the freelists. Start
+	 * from ALLOC_CHUNK_LIMIT (fixed by the number of freelists) and halve
+	 * until a chunk of that size fits into a maximum-size block. Halving
+	 * keeps the limit a power of two, so a request and its rounded-up chunk
+	 * size always fall on the same side of the limit.
+	 */
+	for (chunkLimit = ALLOC_CHUNK_LIMIT;
+		 chunkLimit > (Size) (maxBlockSize - ALLOC_BLOCKHDRSZ - ALLOC_CHUNKHDRSZ);
+		 chunkLimit >>= 1)
+		;
+	aset->allocChunkLimit = chunkLimit;
+
+	/*
+	 * Pre-allocate the always-present block when the caller asked for one.
+	 */
+	if (minContextSize > ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ)
+	{
+		Size		firstSize = MAXALIGN(minContextSize);
+		AllocBlock	first = (AllocBlock) malloc(firstSize);
+
+		if (first == NULL)
+		{
+			MemoryContextStats(TopMemoryContext);
+			ereport(ERROR,
+					(ENOMEM,
+					 errmsg("out of memory"),
+					 errdetail("Failed while creating memory context \"%s\".",
+							   name)));
+		}
+
+		first->aset = aset;
+		first->freeptr = ((char *) first) + ALLOC_BLOCKHDRSZ;
+		first->endptr = ((char *) first) + firstSize;
+		first->next = aset->blocks;
+		aset->blocks = first;
+
+		/* This block must not be released at reset time */
+		aset->keeper = first;
+	}
+
+	aset->isReset = true;
+
+	/* Record sharing mode; shared contexts need their lock set up */
+	aset->header.is_shared = isShared;
+	if (isShared)
+		GTM_RWLockInit(&aset->header.lock);
+
+	return (MemoryContext) aset;
+}
+
+/*
+ * AllocSetInit
+ * Context-type-specific initialization routine.
+ *
+ * This is called by MemoryContextCreate() after setting up the
+ * generic MemoryContext fields and before linking the new context
+ * into the context tree. We must do whatever is needed to make the
+ * new context minimally valid for deletion. We must *not* risk
+ * failure --- thus, for example, allocating more memory is not cool.
+ * (AllocSetContextCreate can allocate memory when it gets control
+ * back, however.)
+ *
+ * The function body is intentionally empty; it exists to fill the init
+ * slot of the AllocSetMethods table.
+ */
+static void
+AllocSetInit(MemoryContext context)
+{
+ /*
+ * Since MemoryContextCreate already zeroed the context node, we don't
+ * have to do anything here: it's already OK.
+ */
+}
+
+/*
+ * AllocSetReset
+ *		Release everything allocated in the set, keeping the context usable.
+ *
+ * All chunks become free. Every block is handed back to malloc() except
+ * the "keeper" block (if any), which is merely rewound so that repeated
+ * reset cycles after small allocations do not thrash malloc(). The
+ * freelists are emptied and the block-size doubling sequence restarts at
+ * initBlockSize.
+ *
+ * For a shared context the whole operation runs under the context lock.
+ * If nothing was allocated since creation or the previous reset, this is a
+ * no-op.
+ */
+static void
+AllocSetReset(MemoryContext context)
+{
+	AllocSet	set = (AllocSet) context;
+	AllocBlock	cur;
+	AllocBlock	following;
+
+	AssertArg(AllocSetIsValid(set));
+
+	if (MemoryContextIsShared(context))
+		MemoryContextLock(context);
+
+	/* Only do work if something was palloc'd since startup or last reset */
+	if (!set->isReset)
+	{
+#ifdef MEMORY_CONTEXT_CHECKING
+		/* Look for corruption and leaks while the data is still there */
+		AllocSetCheck(context);
+#endif
+
+		/* Forget all free chunks */
+		MemSetAligned(set->freelist, 0, sizeof(set->freelist));
+
+		/* Detach the old list; the new list is empty or just the keeper */
+		cur = set->blocks;
+		set->blocks = set->keeper;
+
+		for (; cur != NULL; cur = following)
+		{
+			following = cur->next;
+
+			if (cur != set->keeper)
+			{
+				/* Ordinary block: give it back to malloc */
+#ifdef CLOBBER_FREED_MEMORY
+				/* Wipe freed memory for debugging purposes */
+				memset(cur, 0x7F, cur->freeptr - ((char *) cur));
+#endif
+				free(cur);
+			}
+			else
+			{
+				/* Keeper block: rewind it instead of freeing it */
+				char	   *datastart = ((char *) cur) + ALLOC_BLOCKHDRSZ;
+
+#ifdef CLOBBER_FREED_MEMORY
+				/* Wipe freed memory for debugging purposes */
+				memset(datastart, 0x7F, cur->freeptr - datastart);
+#endif
+				cur->freeptr = datastart;
+				cur->next = NULL;
+			}
+		}
+
+		/* Restart the block size progression as well */
+		set->nextBlockSize = set->initBlockSize;
+
+		set->isReset = true;
+	}
+
+	if (MemoryContextIsShared(context))
+		MemoryContextUnlock(context);
+}
+
+/*
+ * AllocSetDelete
+ *		Frees all memory which is allocated in the given set,
+ *		in preparation for deletion of the set.
+ *
+ * Unlike AllocSetReset, this *must* free all resources of the set,
+ * including the keeper block. But note we are not responsible for
+ * deleting the context node itself.
+ *
+ * For a shared context the block list is detached and freed while holding
+ * the context lock. The list head is read only after the lock has been
+ * taken (and after the argument has been validated), in the same way
+ * AllocSetReset does it, so a block linked in by a concurrent lock holder
+ * cannot be missed.
+ */
+static void
+AllocSetDelete(MemoryContext context)
+{
+	AllocSet	set = (AllocSet) context;
+	AllocBlock	block;
+
+	AssertArg(AllocSetIsValid(set));
+
+	if (MemoryContextIsShared(context))
+		MemoryContextLock(context);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Check for corruption and leaks before freeing */
+	AllocSetCheck(context);
+#endif
+
+	/* Take over the block list now that we hold the lock */
+	block = set->blocks;
+
+	/* Make it look empty, just in case... */
+	MemSetAligned(set->freelist, 0, sizeof(set->freelist));
+	set->blocks = NULL;
+	set->keeper = NULL;
+
+	while (block != NULL)
+	{
+		AllocBlock	next = block->next;
+
+#ifdef CLOBBER_FREED_MEMORY
+		/* Wipe freed memory for debugging purposes */
+		memset(block, 0x7F, block->freeptr - ((char *) block));
+#endif
+		free(block);
+		block = next;
+	}
+
+	if (MemoryContextIsShared(context))
+		MemoryContextUnlock(context);
+}
+
+/*
+ * AllocSetAlloc
+ * Returns pointer to allocated memory of given size; memory is added
+ * to the set.
+ *
+ * Three paths, tried in this order:
+ * 1. size > allocChunkLimit: malloc a dedicated block holding exactly one
+ * chunk of MAXALIGN(size) bytes and link it *behind* the active block.
+ * 2. A chunk is available on the freelist matching the size: reuse it.
+ * 3. Otherwise carve a power-of-2 sized chunk from the active (head) block,
+ * first creating a new block if the active one lacks room.
+ *
+ * For a shared context the context lock is taken on entry and released on
+ * every exit path, including just before ereport(ERROR) on malloc failure.
+ *
+ * NOTE(review): on the out-of-memory paths MemoryContextStats() is called
+ * while the context lock is still held; whether the stats walk can try to
+ * take the same lock is not visible from here -- confirm.
+ */
+static void *
+AllocSetAlloc(MemoryContext context, Size size)
+{
+ AllocSet set = (AllocSet) context;
+ AllocBlock block;
+ AllocChunk chunk;
+ int fidx;
+ Size chunk_size;
+ Size blksize;
+
+ AssertArg(AllocSetIsValid(set));
+
+ /*
+ * If this is a shared context, make it thread safe by acquiring
+ * appropriate lock
+ */
+ if (MemoryContextIsShared(context))
+ MemoryContextLock(context);
+
+ /*
+ * If requested size exceeds maximum for chunks, allocate an entire block
+ * for this request.
+ */
+ if (size > set->allocChunkLimit)
+ {
+ chunk_size = MAXALIGN(size);
+ blksize = chunk_size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ;
+ block = (AllocBlock) malloc(blksize);
+ if (block == NULL)
+ {
+ MemoryContextStats(TopMemoryContext);
+ if (MemoryContextIsShared(context))
+ MemoryContextUnlock(context);
+ ereport(ERROR,
+ (ENOMEM,
+ errmsg("out of memory"),
+ errdetail("Failed on request of size %lu.",
+ (unsigned long) size)));
+ }
+ block->aset = set;
+ /* freeptr == endptr: the block is completely used by its one chunk */
+ block->freeptr = block->endptr = ((char *) block) + blksize;
+
+ chunk = (AllocChunk) (((char *) block) + ALLOC_BLOCKHDRSZ);
+ chunk->aset = set;
+ chunk->size = chunk_size;
+#ifdef MEMORY_CONTEXT_CHECKING
+ chunk->requested_size = size;
+ /* set mark to catch clobber of "unused" space */
+ if (size < chunk_size)
+ ((char *) AllocChunkGetPointer(chunk))[size] = 0x7E;
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+ /* fill the allocated space with junk */
+ randomize_mem((char *) AllocChunkGetPointer(chunk), size);
+#endif
+
+ /*
+ * Stick the new block underneath the active allocation block, so that
+ * we don't lose the use of the space remaining therein.
+ */
+ if (set->blocks != NULL)
+ {
+ block->next = set->blocks->next;
+ set->blocks->next = block;
+ }
+ else
+ {
+ block->next = NULL;
+ set->blocks = block;
+ }
+
+ set->isReset = false;
+
+ AllocAllocInfo(set, chunk);
+ if (MemoryContextIsShared(context))
+ MemoryContextUnlock(context);
+ return AllocChunkGetPointer(chunk);
+ }
+
+ /*
+ * Request is small enough to be treated as a chunk. Look in the
+ * corresponding free list to see if there is a free chunk we could reuse.
+ * If one is found, remove it from the free list, make it again a member
+ * of the alloc set and return its data address.
+ */
+ fidx = AllocSetFreeIndex(size);
+ chunk = set->freelist[fidx];
+ if (chunk != NULL)
+ {
+ Assert(chunk->size >= size);
+
+ /* while a chunk is free, its aset field holds the freelist link */
+ set->freelist[fidx] = (AllocChunk) chunk->aset;
+
+ chunk->aset = (void *) set;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ chunk->requested_size = size;
+ /* set mark to catch clobber of "unused" space */
+ if (size < chunk->size)
+ ((char *) AllocChunkGetPointer(chunk))[size] = 0x7E;
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+ /* fill the allocated space with junk */
+ randomize_mem((char *) AllocChunkGetPointer(chunk), size);
+#endif
+
+ /* isReset must be false already */
+ Assert(!set->isReset);
+
+ AllocAllocInfo(set, chunk);
+ if (MemoryContextIsShared(context))
+ MemoryContextUnlock(context);
+ return AllocChunkGetPointer(chunk);
+ }
+
+ /*
+ * Choose the actual chunk size to allocate.
+ */
+ chunk_size = (1 << ALLOC_MINBITS) << fidx;
+ Assert(chunk_size >= size);
+
+ /*
+ * If there is enough room in the active allocation block, we will put the
+ * chunk into that block. Else must start a new one.
+ */
+ if ((block = set->blocks) != NULL)
+ {
+ Size availspace = block->endptr - block->freeptr;
+
+ if (availspace < (chunk_size + ALLOC_CHUNKHDRSZ))
+ {
+ /*
+ * The existing active (top) block does not have enough room for
+ * the requested allocation, but it might still have a useful
+ * amount of space in it. Once we push it down in the block list,
+ * we'll never try to allocate more space from it. So, before we
+ * do that, carve up its free space into chunks that we can put on
+ * the set's freelists.
+ *
+ * Because we can only get here when there's less than
+ * ALLOC_CHUNK_LIMIT left in the block, this loop cannot iterate
+ * more than ALLOCSET_NUM_FREELISTS-1 times.
+ */
+ while (availspace >= ((1 << ALLOC_MINBITS) + ALLOC_CHUNKHDRSZ))
+ {
+ Size availchunk = availspace - ALLOC_CHUNKHDRSZ;
+ int a_fidx = AllocSetFreeIndex(availchunk);
+
+ /*
+ * In most cases, we'll get back the index of the next larger
+ * freelist than the one we need to put this chunk on. The
+ * exception is when availchunk is exactly a power of 2.
+ */
+ if (availchunk != (1 << (a_fidx + ALLOC_MINBITS)))
+ {
+ a_fidx--;
+ Assert(a_fidx >= 0);
+ availchunk = (1 << (a_fidx + ALLOC_MINBITS));
+ }
+
+ chunk = (AllocChunk) (block->freeptr);
+
+ block->freeptr += (availchunk + ALLOC_CHUNKHDRSZ);
+ availspace -= (availchunk + ALLOC_CHUNKHDRSZ);
+
+ chunk->size = availchunk;
+#ifdef MEMORY_CONTEXT_CHECKING
+ chunk->requested_size = 0; /* mark it free */
+#endif
+ /* push the carved chunk onto the head of its freelist */
+ chunk->aset = (void *) set->freelist[a_fidx];
+ set->freelist[a_fidx] = chunk;
+ }
+
+ /* Mark that we need to create a new block */
+ block = NULL;
+ }
+ }
+
+ /*
+ * Time to create a new regular (multi-chunk) block?
+ */
+ if (block == NULL)
+ {
+ Size required_size;
+
+ /*
+ * The first such block has size initBlockSize, and we double the
+ * space in each succeeding block, but not more than maxBlockSize.
+ */
+ blksize = set->nextBlockSize;
+ set->nextBlockSize <<= 1;
+ if (set->nextBlockSize > set->maxBlockSize)
+ set->nextBlockSize = set->maxBlockSize;
+
+ /*
+ * If initBlockSize is less than ALLOC_CHUNK_LIMIT, we could need more
+ * space... but try to keep it a power of 2.
+ */
+ required_size = chunk_size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ;
+ while (blksize < required_size)
+ blksize <<= 1;
+
+ /* Try to allocate it */
+ block = (AllocBlock) malloc(blksize);
+
+ /*
+ * We could be asking for pretty big blocks here, so cope if malloc
+ * fails. But give up if there's less than a meg or so available...
+ */
+ while (block == NULL && blksize > 1024 * 1024)
+ {
+ blksize >>= 1;
+ if (blksize < required_size)
+ break;
+ block = (AllocBlock) malloc(blksize);
+ }
+
+ if (block == NULL)
+ {
+ MemoryContextStats(TopMemoryContext);
+ if (MemoryContextIsShared(context))
+ MemoryContextUnlock(context);
+ ereport(ERROR,
+ (ENOMEM,
+ errmsg("out of memory"),
+ errdetail("Failed on request of size %lu.",
+ (unsigned long) size)));
+ }
+
+ block->aset = set;
+ block->freeptr = ((char *) block) + ALLOC_BLOCKHDRSZ;
+ block->endptr = ((char *) block) + blksize;
+
+ /*
+ * If this is the first block of the set, make it the "keeper" block.
+ * Formerly, a keeper block could only be created during context
+ * creation, but allowing it to happen here lets us have fast reset
+ * cycling even for contexts created with minContextSize = 0; that way
+ * we don't have to force space to be allocated in contexts that might
+ * never need any space. Don't mark an oversize block as a keeper,
+ * however.
+ */
+ if (set->keeper == NULL && blksize == set->initBlockSize)
+ set->keeper = block;
+
+ /* the new block becomes the active (head) block */
+ block->next = set->blocks;
+ set->blocks = block;
+ }
+
+ /*
+ * OK, do the allocation
+ */
+ chunk = (AllocChunk) (block->freeptr);
+
+ block->freeptr += (chunk_size + ALLOC_CHUNKHDRSZ);
+ Assert(block->freeptr <= block->endptr);
+
+ chunk->aset = (void *) set;
+ chunk->size = chunk_size;
+#ifdef MEMORY_CONTEXT_CHECKING
+ chunk->requested_size = size;
+ /* set mark to catch clobber of "unused" space */
+ if (size < chunk->size)
+ ((char *) AllocChunkGetPointer(chunk))[size] = 0x7E;
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+ /* fill the allocated space with junk */
+ randomize_mem((char *) AllocChunkGetPointer(chunk), size);
+#endif
+
+ set->isReset = false;
+
+ AllocAllocInfo(set, chunk);
+ if (MemoryContextIsShared(context))
+ MemoryContextUnlock(context);
+ return AllocChunkGetPointer(chunk);
+}
+
+/*
+ * AllocSetFree
+ * Frees allocated memory; memory is removed from the set.
+ *
+ * A chunk larger than the set's allocChunkLimit lives in its own block:
+ * that block is located by walking the block list, unlinked and returned
+ * to malloc(). Any other chunk is only pushed onto the freelist matching
+ * its size for later reuse by AllocSetAlloc().
+ *
+ * For a shared context the context lock is held for the duration; it is
+ * released before elog(ERROR) is raised when the owning block of a big
+ * chunk cannot be found.
+ */
+static void
+AllocSetFree(MemoryContext context, void *pointer)
+{
+ AllocSet set = (AllocSet) context;
+ AllocChunk chunk = AllocPointerGetChunk(pointer);
+
+ /*
+ * Acquire appropriate lock for a shared memory context
+ */
+ if (MemoryContextIsShared(context))
+ MemoryContextLock(context);
+
+ AllocFreeInfo(set, chunk);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Test for someone scribbling on unused space in chunk */
+ if (chunk->requested_size < chunk->size)
+ if (((char *) pointer)[chunk->requested_size] != 0x7E)
+ elog(WARNING, "detected write past chunk end in %s %p",
+ set->header.name, chunk);
+#endif
+
+ if (chunk->size > set->allocChunkLimit)
+ {
+ /*
+ * Big chunks are certain to have been allocated as single-chunk
+ * blocks. Find the containing block and return it to malloc().
+ */
+ AllocBlock block = set->blocks;
+ AllocBlock prevblock = NULL;
+
+ /* linear search: the chunk is the first thing after the block header */
+ while (block != NULL)
+ {
+ if (chunk == (AllocChunk) (((char *) block) + ALLOC_BLOCKHDRSZ))
+ break;
+ prevblock = block;
+ block = block->next;
+ }
+ if (block == NULL)
+ {
+ if (MemoryContextIsShared(context))
+ MemoryContextUnlock(context);
+ elog(ERROR, "could not find block containing chunk %p", chunk);
+ }
+ /* let's just make sure chunk is the only one in the block */
+ Assert(block->freeptr == ((char *) block) +
+ (chunk->size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ));
+
+ /* OK, remove block from aset's list and free it */
+ if (prevblock == NULL)
+ set->blocks = block->next;
+ else
+ prevblock->next = block->next;
+#ifdef CLOBBER_FREED_MEMORY
+ /* Wipe freed memory for debugging purposes */
+ memset(block, 0x7F, block->freeptr - ((char *) block));
+#endif
+ free(block);
+ }
+ else
+ {
+ /* Normal case, put the chunk into appropriate freelist */
+ int fidx = AllocSetFreeIndex(chunk->size);
+
+ /* the aset field doubles as the freelist link while the chunk is free */
+ chunk->aset = (void *) set->freelist[fidx];
+
+#ifdef CLOBBER_FREED_MEMORY
+ /* Wipe freed memory for debugging purposes */
+ memset(pointer, 0x7F, chunk->size);
+#endif
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Reset requested_size to 0 in chunks that are on freelist */
+ chunk->requested_size = 0;
+#endif
+ set->freelist[fidx] = chunk;
+ }
+ if (MemoryContextIsShared(context))
+ MemoryContextUnlock(context);
+}
+
+/*
+ * AllocSetRealloc
+ * Returns new pointer to allocated memory of given size; this memory
+ * is added to the set. Memory associated with given pointer is copied
+ * into the new memory, and the old memory is freed.
+ *
+ * Three cases:
+ * 1. The existing chunk is already at least "size" bytes: the same pointer
+ * is returned (only the debug bookkeeping is updated).
+ * 2. The chunk is a dedicated single-chunk block (old size above
+ * allocChunkLimit): the block is enlarged with realloc() and relinked.
+ * 3. Small chunk: a new chunk is obtained through AllocSetAlloc(), the old
+ * contents are copied and the old chunk goes through AllocSetFree().
+ *
+ * For a shared context the context lock is held for cases 1 and 2. In
+ * case 3 the lock is released first, because AllocSetAlloc() and
+ * AllocSetFree() each take it themselves. Note that the old chunk size is
+ * read before the lock is acquired.
+ */
+static void *
+AllocSetRealloc(MemoryContext context, void *pointer, Size size)
+{
+ AllocSet set = (AllocSet) context;
+ AllocChunk chunk = AllocPointerGetChunk(pointer);
+ Size oldsize = chunk->size;
+
+ if (MemoryContextIsShared(context))
+ MemoryContextLock(context);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Test for someone scribbling on unused space in chunk */
+ if (chunk->requested_size < oldsize)
+ if (((char *) pointer)[chunk->requested_size] != 0x7E)
+ elog(WARNING, "detected write past chunk end in %s %p",
+ set->header.name, chunk);
+#endif
+
+ /* isReset must be false already */
+ Assert(!set->isReset);
+
+ /*
+ * Chunk sizes are aligned to power of 2 in AllocSetAlloc(). Maybe the
+ * allocated area already is >= the new size. (In particular, we always
+ * fall out here if the requested size is a decrease.)
+ */
+ if (oldsize >= size)
+ {
+#ifdef MEMORY_CONTEXT_CHECKING
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+ /* We can only fill the extra space if we know the prior request */
+ if (size > chunk->requested_size)
+ randomize_mem((char *) AllocChunkGetPointer(chunk) + chunk->requested_size,
+ size - chunk->requested_size);
+#endif
+
+ chunk->requested_size = size;
+ /* set mark to catch clobber of "unused" space */
+ if (size < oldsize)
+ ((char *) pointer)[size] = 0x7E;
+#endif
+ if (MemoryContextIsShared(context))
+ MemoryContextUnlock(context);
+ return pointer;
+ }
+
+ if (oldsize > set->allocChunkLimit)
+ {
+ /*
+ * The chunk must have been allocated as a single-chunk block. Find
+ * the containing block and use realloc() to make it bigger with
+ * minimum space wastage.
+ */
+ AllocBlock block = set->blocks;
+ AllocBlock prevblock = NULL;
+ Size chksize;
+ Size blksize;
+
+ /* linear search: the chunk is the first thing after the block header */
+ while (block != NULL)
+ {
+ if (chunk == (AllocChunk) (((char *) block) + ALLOC_BLOCKHDRSZ))
+ break;
+ prevblock = block;
+ block = block->next;
+ }
+ if (block == NULL)
+ {
+ if (MemoryContextIsShared(context))
+ MemoryContextUnlock(context);
+ elog(ERROR, "could not find block containing chunk %p", chunk);
+ }
+ /* let's just make sure chunk is the only one in the block */
+ Assert(block->freeptr == ((char *) block) +
+ (chunk->size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ));
+
+ /* Do the realloc */
+ chksize = MAXALIGN(size);
+ blksize = chksize + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ;
+ /*
+ * If realloc() fails the old block is still linked in the set's block
+ * list (the list has not been modified yet); only the local variable
+ * is overwritten with NULL.
+ */
+ block = (AllocBlock) realloc(block, blksize);
+ if (block == NULL)
+ {
+ MemoryContextStats(TopMemoryContext);
+ if (MemoryContextIsShared(context))
+ MemoryContextUnlock(context);
+ ereport(ERROR,
+ (ENOMEM,
+ errmsg("out of memory"),
+ errdetail("Failed on request of size %lu.",
+ (unsigned long) size)));
+ }
+ block->freeptr = block->endptr = ((char *) block) + blksize;
+
+ /* Update pointers since block has likely been moved */
+ chunk = (AllocChunk) (((char *) block) + ALLOC_BLOCKHDRSZ);
+ if (prevblock == NULL)
+ set->blocks = block;
+ else
+ prevblock->next = block;
+ chunk->size = chksize;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+ /* We can only fill the extra space if we know the prior request */
+ randomize_mem((char *) AllocChunkGetPointer(chunk) + chunk->requested_size,
+ size - chunk->requested_size);
+#endif
+
+ chunk->requested_size = size;
+ /* set mark to catch clobber of "unused" space */
+ if (size < chunk->size)
+ ((char *) AllocChunkGetPointer(chunk))[size] = 0x7E;
+#endif
+
+ if (MemoryContextIsShared(context))
+ MemoryContextUnlock(context);
+ return AllocChunkGetPointer(chunk);
+ }
+ else
+ {
+ /*
+ * Small-chunk case. We just do this by brute force, ie, allocate a
+ * new chunk and copy the data. Since we know the existing data isn't
+ * huge, this won't involve any great memcpy expense, so it's not
+ * worth being smarter. (At one time we tried to avoid memcpy when it
+ * was possible to enlarge the chunk in-place, but that turns out to
+ * misbehave unpleasantly for repeated cycles of
+ * palloc/repalloc/pfree: the eventually freed chunks go into the
+ * wrong freelist for the next initial palloc request, and so we leak
+ * memory indefinitely. See pgsql-hackers archives for 2007-08-11.)
+ */
+ AllocPointer newPointer;
+
+ /* drop the lock: AllocSetAlloc/AllocSetFree below acquire it themselves */
+ if (MemoryContextIsShared(context))
+ MemoryContextUnlock(context);
+ /* allocate new chunk */
+ newPointer = AllocSetAlloc((MemoryContext) set, size);
+
+ /* transfer existing data (certain to fit) */
+ memcpy(newPointer, pointer, oldsize);
+
+ /* free old chunk */
+ AllocSetFree((MemoryContext) set, pointer);
+
+ return newPointer;
+ }
+}
+
+/*
+ * AllocSetGetChunkSpace
+ *      Report the space taken by an allocated chunk: the size recorded in
+ *      its chunk header plus ALLOC_CHUNKHDRSZ for the header itself.
+ *
+ * "pointer" is the user-visible address of the chunk; "context" is not
+ * consulted.
+ */
+static Size
+AllocSetGetChunkSpace(MemoryContext context, void *pointer)
+{
+    AllocChunk  hdr = AllocPointerGetChunk(pointer);
+    Size        payload = hdr->size;
+
+    return payload + ALLOC_CHUNKHDRSZ;
+}
+
+/*
+ * AllocSetIsEmpty
+ *      Tell whether an allocset currently holds no allocated space.
+ *
+ * Only the isReset flag is consulted, so "empty" here means "new or just
+ * reset".  Walking the freelists could detect a context whose chunks have
+ * all been freed, but that is not attempted.  Shared contexts are locked
+ * while the flag is read.
+ */
+static bool
+AllocSetIsEmpty(MemoryContext context)
+{
+    AllocSet    aset = (AllocSet) context;
+    bool        empty;
+
+    if (MemoryContextIsShared(context))
+        MemoryContextLock(context);
+
+    empty = aset->isReset ? true : false;
+
+    if (MemoryContextIsShared(context))
+        MemoryContextUnlock(context);
+
+    return empty;
+}
+
+/*
+ * AllocSetStats
+ *      Print a one-line memory usage summary for an allocset on stderr.
+ *
+ * The line is preceded by one indentation string per "level".  Totals come
+ * from walking the block list (overall size and not-yet-carved space) and
+ * every freelist (free chunks, counted together with their headers).
+ */
+static void
+AllocSetStats(MemoryContext context, int level)
+{
+    AllocSet    aset = (AllocSet) context;
+    long        block_count = 0;
+    long        free_chunk_count = 0;
+    long        bytes_total = 0;
+    long        bytes_free = 0;
+    AllocBlock  blk;
+    int         list_no;
+    int         remaining;
+
+    /*
+     * XXX No locking is done here: the caller is assumed to already hold the
+     * lock when the context is shared.  Revisit if that assumption breaks.
+     */
+    blk = aset->blocks;
+    while (blk != NULL)
+    {
+        block_count++;
+        bytes_total += blk->endptr - ((char *) blk);
+        bytes_free += blk->endptr - blk->freeptr;
+        blk = blk->next;
+    }
+
+    for (list_no = 0; list_no < ALLOCSET_NUM_FREELISTS; list_no++)
+    {
+        AllocChunk  free_chunk = aset->freelist[list_no];
+
+        /* free chunks are chained through their aset field */
+        while (free_chunk != NULL)
+        {
+            free_chunk_count++;
+            bytes_free += free_chunk->size + ALLOC_CHUNKHDRSZ;
+            free_chunk = (AllocChunk) free_chunk->aset;
+        }
+    }
+
+    for (remaining = level; remaining > 0; remaining--)
+        fprintf(stderr, " ");
+
+    fprintf(stderr,
+            "%s: %lu total in %ld blocks; %lu free (%ld chunks); %lu used\n",
+            aset->header.name, bytes_total, block_count, bytes_free,
+            free_chunk_count, bytes_total - bytes_free);
+}
+
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+/*
+ * AllocSetCheck
+ * Walk through chunks and check consistency of memory.
+ *
+ * Every block of the set is scanned chunk by chunk from the end of the block
+ * header up to the block's freeptr. Each inconsistency found is reported
+ * separately and the scan continues.
+ *
+ * NOTE: report errors as WARNING, *not* ERROR or FATAL. Otherwise you'll
+ * find yourself in an infinite loop when trouble occurs, because this
+ * routine will be entered again when elog cleanup tries to release memory!
+ */
+static void
+AllocSetCheck(MemoryContext context)
+{
+ AllocSet set = (AllocSet) context;
+ char *name = set->header.name;
+ AllocBlock block;
+
+ /*
+ * XXX The caller is most likely holding a lock for shared contexts. So
+ * don't bother to lock it again (this might cause problems some time, so
+ * revisit this later)
+ */
+ for (block = set->blocks; block != NULL; block = block->next)
+ {
+ /* bpoz: scan position, starting just past the block header */
+ char *bpoz = ((char *) block) + ALLOC_BLOCKHDRSZ;
+ /* bytes handed out from this block so far */
+ long blk_used = block->freeptr - bpoz;
+ /* running sum of chunk payload sizes seen in this block */
+ long blk_data = 0;
+ long nchunks = 0;
+
+ /*
+ * Empty block - empty can be keeper-block only
+ */
+ if (!blk_used)
+ {
+ if (set->keeper != block)
+ elog(WARNING, "problem in alloc set %s: empty block %p",
+ name, block);
+ }
+
+ /*
+ * Chunk walker
+ */
+ while (bpoz < block->freeptr)
+ {
+ AllocChunk chunk = (AllocChunk) bpoz;
+ Size chsize,
+ dsize;
+ char *chdata_end;
+
+ chsize = chunk->size; /* aligned chunk size */
+ dsize = chunk->requested_size; /* real data */
+ /* first byte after the requested data, where the 0x7E mark is expected */
+ chdata_end = ((char *) chunk) + (ALLOC_CHUNKHDRSZ + dsize);
+
+ /*
+ * Check chunk size
+ */
+ if (dsize > chsize)
+ elog(WARNING, "problem in alloc set %s: req size > alloc size for chunk %p in block %p",
+ name, chunk, block);
+ if (chsize < (1 << ALLOC_MINBITS))
+ elog(WARNING, "problem in alloc set %s: bad size %lu for chunk %p in block %p",
+ name, (unsigned long) chsize, chunk, block);
+
+ /* single-chunk block? then the chunk must account for all used space */
+ if (chsize > set->allocChunkLimit &&
+ chsize + ALLOC_CHUNKHDRSZ != blk_used)
+ elog(WARNING, "problem in alloc set %s: bad single-chunk %p in block %p",
+ name, chunk, block);
+
+ /*
+ * If chunk is allocated, check for correct aset pointer. (If it's
+ * free, the aset is the freelist pointer, which we can't check as
+ * easily...) A nonzero requested_size is what is treated as
+ * "allocated" here.
+ */
+ if (dsize > 0 && chunk->aset != (void *) set)
+ elog(WARNING, "problem in alloc set %s: bogus aset link in block %p, chunk %p",
+ name, block, chunk);
+
+ /*
+ * Check for overwrite of "unallocated" space in chunk
+ */
+ if (dsize > 0 && dsize < chsize && *chdata_end != 0x7E)
+ elog(WARNING, "problem in alloc set %s: detected write past chunk end in block %p, chunk %p",
+ name, block, chunk);
+
+ blk_data += chsize;
+ nchunks++;
+
+ /* step to the next chunk header */
+ bpoz += ALLOC_CHUNKHDRSZ + chsize;
+ }
+
+ /* payload plus per-chunk headers must add up to the used part of the block */
+ if ((blk_data + (nchunks * ALLOC_CHUNKHDRSZ)) != blk_used)
+ elog(WARNING, "problem in alloc set %s: found inconsistent memory block %p",
+ name, block);
+ }
+}
+
+#endif /* MEMORY_CONTEXT_CHECKING */
diff --git a/src/gtm/common/assert.c b/src/gtm/common/assert.c
new file mode 100644
index 0000000000..58b94481b3
--- /dev/null
+++ b/src/gtm/common/assert.c
@@ -0,0 +1,54 @@
+/*-------------------------------------------------------------------------
+ *
+ * assert.c
+ * Assert code.
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL: pgsql/src/backend/utils/error/assert.c,v 1.35 2008/01/01 19:45:53 momjian Exp $
+ *
+ * NOTE
+ * This should eventually work with elog()
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "gtm/gtm_c.h"
+#include "gtm/assert.h"
+
+#include <unistd.h>
+
+/*
+ * Run-time assertion switch, off by default.
+ * NOTE(review): presumably consulted by the Assert machinery declared in
+ * gtm/assert.h -- confirm; nothing in this file reads it.
+ */
+bool assert_enabled = false;
+
+/*
+ * ExceptionalCondition - report a failed Assert() and abort
+ *
+ * Writes a "TRAP:" line describing the failed condition to stderr (or a
+ * generic complaint if any of the string arguments is not a valid pointer),
+ * flushes stderr and calls abort().
+ *
+ * Note: control never comes back from here; the int return type exists
+ * only because the TrapMacro() macro might get wonky otherwise.
+ */
+int
+ExceptionalCondition(const char *conditionName,
+                     const char *errorType,
+                     const char *fileName,
+                     int lineNumber)
+{
+    bool        args_usable;
+
+    args_usable = PointerIsValid(conditionName)
+        && PointerIsValid(fileName)
+        && PointerIsValid(errorType);
+
+    if (args_usable)
+        fprintf(stderr, "TRAP: %s(\"%s\", File: \"%s\", Line: %d)\n",
+                errorType, conditionName,
+                fileName, lineNumber);
+    else
+        fprintf(stderr, "TRAP: ExceptionalCondition: bad arguments\n");
+
+    /* Normally redundant, but make certain the message is out before dying */
+    fflush(stderr);
+
+    abort();
+    return 0;
+}
diff --git a/src/gtm/common/elog.c b/src/gtm/common/elog.c
new file mode 100644
index 0000000000..626dc36925
--- /dev/null
+++ b/src/gtm/common/elog.c
@@ -0,0 +1,1117 @@
+/*-------------------------------------------------------------------------
+ *
+ * elog.c
+ * error logging and reporting
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL: pgsql/src/backend/utils/error/elog.c,v 1.212 2009/01/19 15:34:23 mha Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <fcntl.h>
+#include <time.h>
+#include <unistd.h>
+#include <signal.h>
+#include <ctype.h>
+#include "gtm/gtm_c.h"
+#include "gtm/gtm.h"
+#include "gtm/gtm_msg.h"
+#include "gtm/stringinfo.h"
+#include "gtm/memutils.h"
+#include "gtm/elog.h"
+#include "gtm/assert.h"
+#include "gtm/gtm_ext.h"
+#include "gtm/libpq.h"
+#include "gtm/pqformat.h"
+
+/* Message "translation" is a no-op here: _() just yields its argument */
+#undef _
+#define _(x) x
+
+/*
+ * Name of the log file used by DebugFileOpen(); starts out NULL.
+ *
+ * Change this to something which is more appropriate.
+ *
+ * XXX The GTM should take a command-line argument to set the log file
+ */
+char *GTMLogFile = NULL;
+
+/* GUC parameters */
+int Log_destination = LOG_DESTINATION_STDERR;
+
+/*
+ * Macro for checking errordata_stack_depth is reasonable.
+ *
+ * If no error frame is active, the depth is forced back to -1 and an ERROR
+ * is raised instead of letting the caller touch a nonexistent stack entry.
+ */
+#define CHECK_STACK_DEPTH() \
+ do { \
+ if (errordata_stack_depth < 0) \
+ { \
+ errordata_stack_depth = -1; \
+ ereport(ERROR, (errmsg_internal("errstart was not called"))); \
+ } \
+ } while (0)
+
+
+/* Forward declarations of file-local helpers */
+static void send_message_to_server_log(ErrorData *edata);
+static void send_message_to_frontend(Port *myport, ErrorData *edata);
+static char *expand_fmt_string(const char *fmt, ErrorData *edata);
+static const char *useful_strerror(int errnum);
+static const char *error_severity(int elevel);
+static void append_with_tabs(StringInfo buf, const char *str);
+static bool is_log_level_output(int elevel, int log_min_level);
+
+/* Threshold passed to is_log_level_output() when deciding on server-log output */
+int log_min_messages = WARNING;
+char *Log_line_prefix = "%l:%p:%m -"; /* format for extra log line info */
+
+/* Scratch buffers for formatted timestamps, shared by all callers in this file */
+#define FORMATTED_TS_LEN 128
+static char formatted_start_time[FORMATTED_TS_LEN];
+static char formatted_log_time[FORMATTED_TS_LEN];
+
+static void log_line_prefix(StringInfo buf);
+static void setup_formatted_log_time(void);
+/*
+ * setup_formatted_log_time --- fill formatted_log_time with the current
+ * local time as "YYYY-MM-DD HH:MM:SS.mmm TZ".
+ *
+ * The stamp is assembled in three pieces (seconds, milliseconds, time zone)
+ * rather than by pasting the milliseconds over a placeholder inside a single
+ * strftime() result: that approach silently overwrites the time zone if the
+ * placeholder in the format string is not exactly four characters wide.
+ *
+ * On any formatting failure the buffer is left as a valid (possibly shorter
+ * or empty) NUL-terminated string.
+ */
+static void
+setup_formatted_log_time(void)
+{
+    struct timeval tv;
+    time_t      stamp_time;
+    struct tm  *tm;
+    size_t      used;
+    int         n;
+
+    gettimeofday(&tv, NULL);
+    stamp_time = (time_t) tv.tv_sec;
+
+    /* localtime() may fail; never hand a NULL struct tm to strftime() */
+    tm = localtime(&stamp_time);
+    if (tm == NULL)
+    {
+        formatted_log_time[0] = '\0';
+        return;
+    }
+
+    /* date and time down to the second: always 19 characters */
+    used = strftime(formatted_log_time, FORMATTED_TS_LEN,
+                    "%Y-%m-%d %H:%M:%S", tm);
+    if (used == 0)
+    {
+        formatted_log_time[0] = '\0';
+        return;
+    }
+
+    /* milliseconds */
+    n = snprintf(formatted_log_time + used, FORMATTED_TS_LEN - used,
+                 ".%03d", (int) (tv.tv_usec / 1000));
+    if (n < 0 || (size_t) n >= FORMATTED_TS_LEN - used)
+    {
+        /* drop the fractional part; keep the seconds-resolution stamp */
+        formatted_log_time[used] = '\0';
+        return;
+    }
+    used += (size_t) n;
+
+    /* time zone name; strftime() leaves the buffer undefined on failure */
+    if (strftime(formatted_log_time + used, FORMATTED_TS_LEN - used,
+                 " %Z", tm) == 0)
+        formatted_log_time[used] = '\0';
+}
+
+/*
+ * log_line_prefix --- append the per-line log tag to "buf".
+ *
+ * Log_line_prefix is scanned once: ordinary characters are copied through,
+ * and the escapes %p (MyThreadID), %l (per-thread-id line counter) and %m
+ * (current timestamp with milliseconds) are expanded.  Unknown escapes and
+ * a trailing lone '%' are dropped silently.
+ */
+static void
+log_line_prefix(StringInfo buf)
+{
+    /* number of log lines emitted under the current thread id */
+    static long log_line_number = 0;
+
+    /* thread id the counter above belongs to */
+    static int  log_my_pid = 0;
+
+    const char *cursor;
+    const char *stop;
+
+    /*
+     * Start counting afresh whenever the caller's id differs from the one
+     * remembered from the previous call, and forget the cached start time.
+     */
+    if (log_my_pid != MyThreadID)
+    {
+        log_line_number = 0;
+        log_my_pid = MyThreadID;
+        formatted_start_time[0] = '\0';
+    }
+    log_line_number++;
+
+    /* no prefix configured: the line still counts, but nothing is emitted */
+    if (Log_line_prefix == NULL)
+        return;
+
+    stop = Log_line_prefix + strlen(Log_line_prefix);
+
+    for (cursor = Log_line_prefix; cursor < stop; cursor++)
+    {
+        if (*cursor != '%')
+        {
+            /* plain text is copied as is */
+            appendStringInfoChar(buf, *cursor);
+            continue;
+        }
+
+        /* move onto the escape letter; a '%' at the very end is ignored */
+        cursor++;
+        if (cursor >= stop)
+            break;
+
+        switch (*cursor)
+        {
+            case 'p':
+                appendStringInfo(buf, "%lu", MyThreadID);
+                break;
+            case 'l':
+                appendStringInfo(buf, "%ld", log_line_number);
+                break;
+            case 'm':
+                setup_formatted_log_time();
+                appendStringInfoString(buf, formatted_log_time);
+                break;
+            default:
+                /* unrecognised escape: skip it */
+                break;
+        }
+    }
+}
+
+/*
+ * errstart --- begin an error-reporting cycle
+ *
+ * Create a stack entry and store the given parameters in it. Subsequently,
+ * errmsg() and perhaps other routines will be called to further populate
+ * the stack entry. Finally, errfinish() will be called to actually process
+ * the error report.
+ *
+ * The "domain" argument is accepted but not used by this implementation.
+ *
+ * Returns TRUE in normal case. Returns FALSE to short-circuit the error
+ * report (if it's a warning or lower and not to be reported anywhere).
+ */
+bool
+errstart(int elevel, const char *filename, int lineno,
+ const char *funcname, const char *domain)
+{
+ ErrorData *edata;
+ bool output_to_server;
+ bool output_to_client = false;
+ int i;
+
+ /*
+ * Check some cases in which we want to promote an error into a more
+ * severe error. None of this logic applies for non-error messages.
+ */
+ if (elevel >= ERROR)
+ {
+ /*
+ * If we are inside a critical section, all errors become PANIC
+ * errors.
+ */
+ if (CritSectionCount > 0)
+ elevel = PANIC;
+
+ /*
+ * Treat ERROR as FATAL when we have no handler to pass the error to,
+ * i.e. no exception stack has been set up. (This is the only
+ * promotion reason checked here.)
+ */
+ if (elevel == ERROR)
+ {
+ if (PG_exception_stack == NULL)
+ elevel = FATAL;
+ }
+
+ /*
+ * If the error level is ERROR or more, errfinish is not going to
+ * return to caller; therefore, if there is any stacked error already
+ * in progress it will be lost. This is more or less okay, except we
+ * do not want to have a FATAL or PANIC error downgraded because the
+ * reporting process was interrupted by a lower-grade error. So check
+ * the stack and make sure we panic if panic is warranted.
+ */
+ for (i = 0; i <= errordata_stack_depth; i++)
+ elevel = Max(elevel, errordata[i].elevel);
+ }
+
+ /* Server log output follows log_min_messages; clients only get ERROR and up */
+ output_to_server = is_log_level_output(elevel, log_min_messages);
+ output_to_client = (elevel >= ERROR);
+
+ /* Skip processing effort if non-error message will not be output */
+ if (elevel < ERROR && !output_to_server && !output_to_client)
+ return false;
+
+ /*
+ * Okay, crank up a stack entry to store the info in.
+ */
+
+ if (recursion_depth++ > 0 && elevel >= ERROR)
+ {
+ /*
+ * Ooops, error during error processing. Clear ErrorContext: we will
+ * not return to the original error's reporter or handler, so we
+ * don't need its contents.
+ */
+ MemoryContextReset(ErrorContext);
+ }
+
+ if (++errordata_stack_depth >= ERRORDATA_STACK_SIZE)
+ {
+ /*
+ * Wups, stack not big enough. We treat this as a PANIC condition
+ * because it suggests an infinite loop of errors during error
+ * recovery.
+ */
+ errordata_stack_depth = -1; /* make room on stack */
+ ereport(PANIC, (errmsg_internal("ERRORDATA_STACK_SIZE exceeded")));
+ }
+ /* Initialize data for this error frame */
+ edata = &errordata[errordata_stack_depth];
+ MemSet(edata, 0, sizeof(ErrorData));
+ edata->elevel = elevel;
+ edata->output_to_server = output_to_server;
+ edata->output_to_client = output_to_client;
+ edata->filename = filename;
+ edata->lineno = lineno;
+ edata->funcname = funcname;
+ /* errno is saved here so that error parameter eval can't change it */
+ edata->saved_errno = errno;
+
+ recursion_depth--;
+ return true;
+}
+
+/*
+ * errfinish --- end an error-reporting cycle
+ *
+ * Produce the appropriate error report(s) and pop the error stack.
+ *
+ * The "dummy" argument and the variable arguments are ignored; they exist
+ * only so the calls that fill in the report can be written as arguments.
+ *
+ * Behaviour by level of the top stack entry:
+ *  - ERROR: nothing is emitted here; the error is re-thrown with
+ *    PG_RE_THROW() and the handler must print it and pop the stack.
+ *  - FATAL: the report is emitted, then the calling thread is terminated
+ *    with pthread_exit().
+ *  - PANIC or above: the report is emitted, then abort() is called.
+ *  - anything lower: the report is emitted and control returns.
+ */
+void
+errfinish(int dummy,...)
+{
+    ErrorData  *edata;
+    int         elevel;
+    MemoryContext oldcontext;
+
+    recursion_depth++;
+    CHECK_STACK_DEPTH();
+
+    /*
+     * Only look at the stack entry once we know there is one; reading it
+     * before the depth check would access errordata[-1] when errstart() was
+     * never called.
+     */
+    edata = &errordata[errordata_stack_depth];
+    elevel = edata->elevel;
+
+    /*
+     * Do processing in ErrorContext, which we hope has enough reserved space
+     * to report an error.
+     */
+    oldcontext = MemoryContextSwitchTo(ErrorContext);
+
+    /*
+     * If ERROR (not more nor less) we pass it off to the current handler.
+     * Printing it and popping the stack is the responsibility of the handler.
+     */
+    if (elevel == ERROR)
+    {
+        /*
+         * We do some minimal cleanup before longjmp'ing so that handlers can
+         * execute in a reasonably sane state.
+         */
+        CritSectionCount = 0;   /* should be unnecessary, but... */
+
+        /*
+         * Note that we leave CurrentMemoryContext set to ErrorContext. The
+         * handler should reset it to something else soon.
+         */
+
+        recursion_depth--;
+        PG_RE_THROW();
+    }
+
+    /* Emit the message to the right places */
+    EmitErrorReport(MyPort);
+
+    /* Now free up subsidiary data attached to stack entry, and release it */
+    if (edata->message)
+        pfree(edata->message);
+    if (edata->detail)
+        pfree(edata->detail);
+    if (edata->detail_log)
+        pfree(edata->detail_log);
+    if (edata->hint)
+        pfree(edata->hint);
+    if (edata->context)
+        pfree(edata->context);
+    errordata_stack_depth--;
+
+    /* Exit error-handling context */
+    MemoryContextSwitchTo(oldcontext);
+    recursion_depth--;
+
+    /*
+     * Perform error recovery action as specified by elevel.
+     */
+    if (elevel == FATAL)
+    {
+        /*
+         * Flush first to improve the odds that the message is actually seen
+         * even if thread teardown goes wrong.
+         */
+        fflush(stdout);
+        fflush(stderr);
+
+        /*
+         * FATAL ends the calling thread only; no exit status is passed on
+         * (pthread_exit is given NULL).
+         */
+        pthread_exit(NULL);
+    }
+
+    if (elevel >= PANIC)
+    {
+        fflush(stdout);
+        fflush(stderr);
+        abort();
+    }
+
+    /*
+     * We reach here if elevel <= WARNING. OK to return to caller.
+     */
+}
+
+/*
+ * This macro handles expansion of a format string and associated parameters;
+ * it's common code for errmsg(), errdetail(), etc. Must be called inside
+ * a routine that is declared like "const char *fmt, ..." and has an edata
+ * pointer set up. The message is assigned to edata->targetfield, or
+ * appended to it (separated by a newline) if appendval is true.
+ *
+ * The translateit argument is accepted for call-site compatibility, but the
+ * macro body never references it: no translation is performed here.
+ *
+ * The output buffer is grown and the formatting retried until
+ * appendStringInfoVA() reports success.
+ *
+ * Note: we pstrdup the buffer rather than just transferring its storage
+ * to the edata field because the buffer might be considerably larger than
+ * really necessary.
+ */
+#define EVALUATE_MESSAGE(targetfield, appendval, translateit) \
+ { \
+ char *fmtbuf; \
+ StringInfoData buf; \
+ /* Expand %m in format string */ \
+ fmtbuf = expand_fmt_string(fmt, edata); \
+ initStringInfo(&buf); \
+ if ((appendval) && edata->targetfield) \
+ appendStringInfo(&buf, "%s\n", edata->targetfield); \
+ /* Generate actual output --- have to use appendStringInfoVA */ \
+ for (;;) \
+ { \
+ va_list args; \
+ bool success; \
+ va_start(args, fmt); \
+ success = appendStringInfoVA(&buf, fmtbuf, args); \
+ va_end(args); \
+ if (success) \
+ break; \
+ enlargeStringInfo(&buf, buf.maxlen); \
+ } \
+ /* Done with expanded fmt */ \
+ pfree(fmtbuf); \
+ /* Save the completed message into the stack item */ \
+ if (edata->targetfield) \
+ pfree(edata->targetfield); \
+ edata->targetfield = pstrdup(buf.data); \
+ pfree(buf.data); \
+ }
+
+
+/*
+ * errmsg --- set the primary message text of the error being built
+ *
+ * "fmt" is a printf-style format; in addition, "%m" is replaced by the
+ * error string for the errno value saved when the report was started.
+ * The text is built in ErrorContext and stored in the top stack entry.
+ *
+ * Note: do not end fmt with a newline; the output routines add one where
+ * it is needed.
+ */
+int
+errmsg(const char *fmt,...)
+{
+    ErrorData  *edata = &errordata[errordata_stack_depth];
+    MemoryContext caller_cxt;
+
+    recursion_depth++;
+    CHECK_STACK_DEPTH();
+
+    caller_cxt = MemoryContextSwitchTo(ErrorContext);
+    EVALUATE_MESSAGE(message, false, true);
+    MemoryContextSwitchTo(caller_cxt);
+
+    recursion_depth--;
+
+    /* the result is meaningless; callers only need an expression */
+    return 0;
+}
+
+
+/*
+ * errmsg_internal --- set the primary message text of the error being built
+ *
+ * Same job as errmsg(), but meant for messages that should never be
+ * translated: "can't happen" cases not worth a translator's time, and
+ * cases where attempting a translation could itself fail and lead to
+ * endless error recursion.
+ */
+int
+errmsg_internal(const char *fmt,...)
+{
+    ErrorData  *edata = &errordata[errordata_stack_depth];
+    MemoryContext caller_cxt;
+
+    recursion_depth++;
+    CHECK_STACK_DEPTH();
+
+    caller_cxt = MemoryContextSwitchTo(ErrorContext);
+    EVALUATE_MESSAGE(message, false, false);
+    MemoryContextSwitchTo(caller_cxt);
+
+    recursion_depth--;
+
+    /* the result is meaningless; callers only need an expression */
+    return 0;
+}
+
+
+/*
+ * errdetail --- set the detail text of the error being built
+ *
+ * "fmt" is handled exactly as in errmsg(); the result is stored in the
+ * detail field of the top stack entry.
+ */
+int
+errdetail(const char *fmt,...)
+{
+    ErrorData  *edata = &errordata[errordata_stack_depth];
+    MemoryContext caller_cxt;
+
+    recursion_depth++;
+    CHECK_STACK_DEPTH();
+
+    caller_cxt = MemoryContextSwitchTo(ErrorContext);
+    EVALUATE_MESSAGE(detail, false, true);
+    MemoryContextSwitchTo(caller_cxt);
+
+    recursion_depth--;
+
+    /* the result is meaningless; callers only need an expression */
+    return 0;
+}
+
+
+/*
+ * errdetail_log --- set the log-only detail text of the error being built
+ *
+ * "fmt" is handled exactly as in errmsg(); the result is stored in the
+ * detail_log field of the top stack entry.
+ */
+int
+errdetail_log(const char *fmt,...)
+{
+    ErrorData  *edata = &errordata[errordata_stack_depth];
+    MemoryContext caller_cxt;
+
+    recursion_depth++;
+    CHECK_STACK_DEPTH();
+
+    caller_cxt = MemoryContextSwitchTo(ErrorContext);
+    EVALUATE_MESSAGE(detail_log, false, true);
+    MemoryContextSwitchTo(caller_cxt);
+
+    recursion_depth--;
+
+    /* the result is meaningless; callers only need an expression */
+    return 0;
+}
+
+
+/*
+ * errhint --- set the hint text of the error being built
+ *
+ * "fmt" is handled exactly as in errmsg(); the result is stored in the
+ * hint field of the top stack entry.
+ */
+int
+errhint(const char *fmt,...)
+{
+    ErrorData  *edata = &errordata[errordata_stack_depth];
+    MemoryContext caller_cxt;
+
+    recursion_depth++;
+    CHECK_STACK_DEPTH();
+
+    caller_cxt = MemoryContextSwitchTo(ErrorContext);
+    EVALUATE_MESSAGE(hint, false, true);
+    MemoryContextSwitchTo(caller_cxt);
+
+    recursion_depth--;
+
+    /* the result is meaningless; callers only need an expression */
+    return 0;
+}
+
+
+
+/*
+ * errfunction --- add reporting function name to the current error
+ *
+ * Records "funcname" in the top stack entry and flags it to be shown.
+ * The string is stored by pointer, not copied, so it is expected to be a
+ * non-freeable constant string.
+ *
+ * Like the other err*() routines, this must only be called between
+ * errstart() and errfinish(); the stack depth is verified before the entry
+ * is written so that a stray call cannot scribble on errordata[-1].
+ */
+int
+errfunction(const char *funcname)
+{
+    ErrorData  *edata = &errordata[errordata_stack_depth];
+
+    /* we don't bother incrementing recursion_depth */
+    CHECK_STACK_DEPTH();
+
+    edata->funcname = funcname;
+    edata->show_funcname = true;
+
+    return 0;                   /* return value does not matter */
+}
+
+
+/*
+ * elog_start --- first half of an old-style elog() call
+ *
+ * Pushes a new entry on the error stack and records in it the source
+ * location supplied by the elog macro, plus the current errno.  Nothing
+ * else in the entry is touched here.
+ *
+ * This has to be a separate step from elog_finish() because there is no
+ * other portable way to slip the hidden location arguments into an elog
+ * call without relying on variadic macros.
+ */
+void
+elog_start(const char *filename, int lineno, const char *funcname)
+{
+    ErrorData  *slot;
+
+    errordata_stack_depth++;
+    if (errordata_stack_depth >= ERRORDATA_STACK_SIZE)
+    {
+        /*
+         * The stack is full, which points at errors being raised endlessly
+         * while recovering from errors; escalate to PANIC.  The message is
+         * deliberately left unlocalized so that reporting it cannot trigger
+         * yet more recursion.
+         */
+        errordata_stack_depth = -1;     /* make room on stack */
+        ereport(PANIC, (errmsg_internal("ERRORDATA_STACK_SIZE exceeded")));
+    }
+
+    slot = &errordata[errordata_stack_depth];
+
+    /* capture errno first thing, before argument evaluation can clobber it */
+    slot->saved_errno = errno;
+    slot->filename = filename;
+    slot->lineno = lineno;
+    slot->funcname = funcname;
+}
+
+/*
+ * elog_finish --- finish up for old-style API
+ *
+ * Second half of an elog() call: picks up the location that elog_start()
+ * stashed in the top stack entry, then runs the usual errstart() /
+ * message formatting / errfinish() sequence for the given level and
+ * printf-style message.
+ */
+void
+elog_finish(int elevel, const char *fmt,...)
+{
+ ErrorData *edata = &errordata[errordata_stack_depth];
+ MemoryContext oldcontext;
+
+ CHECK_STACK_DEPTH();
+
+ /*
+ * Do errstart() to see if we actually want to report the message.
+ *
+ * The entry pushed by elog_start() is popped first so that errstart() can
+ * push it again; in the normal case edata still points at that same slot,
+ * and its saved location fields are read as arguments before errstart()
+ * reinitializes the entry. errno is restored to the value captured by
+ * elog_start() so that errstart() records the right one.
+ */
+ errordata_stack_depth--;
+ errno = edata->saved_errno;
+ if (!errstart(elevel, edata->filename, edata->lineno, edata->funcname, NULL))
+ return; /* nothing to do */
+
+ /*
+ * Format error message just like errmsg_internal().
+ */
+ recursion_depth++;
+ oldcontext = MemoryContextSwitchTo(ErrorContext);
+
+ EVALUATE_MESSAGE(message, false, false);
+
+ MemoryContextSwitchTo(oldcontext);
+ recursion_depth--;
+
+ /*
+ * And let errfinish() finish up.
+ */
+ errfinish(0);
+}
+
+/*
+ * EmitErrorReport --- output the error message on top of the stack
+ *
+ * "argp" is the Port of the connection the report concerns, or NULL when
+ * there is no client to notify.  The report goes to the server log and/or
+ * to the client according to the flags stored in the stack entry.
+ *
+ * errfinish() calls this for every level other than ERROR; for ERROR it
+ * is up to whoever catches the error to call it (or not).
+ */
+void
+EmitErrorReport(void *argp)
+{
+    ErrorData  *edata = &errordata[errordata_stack_depth];
+    Port       *client = (Port *) argp;
+    MemoryContext caller_cxt;
+
+    recursion_depth++;
+    CHECK_STACK_DEPTH();
+    caller_cxt = MemoryContextSwitchTo(ErrorContext);
+
+    /* server log first */
+    if (edata->output_to_server)
+        send_message_to_server_log(edata);
+
+    /* then the client, provided there is one */
+    if (edata->output_to_client)
+    {
+        if (client != NULL)
+            send_message_to_frontend(client, edata);
+    }
+
+    MemoryContextSwitchTo(caller_cxt);
+    recursion_depth--;
+}
+
+/*
+ * FlushErrorState --- discard all error state once recovery is done
+ *
+ * An error handler calls this when it has finished with the error, or right
+ * after CopyErrorData if it is about to do something that could raise
+ * another error.  Until this has run, the caller is still "inside" the
+ * error subsystem.
+ */
+void
+FlushErrorState(void)
+{
+    /* no error processing is in progress any more */
+    recursion_depth = 0;
+
+    /*
+     * Empty the stack outright.  More than one entry can only be present if
+     * the serviced error interrupted the construction of another message;
+     * control is assumed to have left that construction for good.
+     */
+    errordata_stack_depth = -1;
+
+    /* Throw away everything that was allocated in ErrorContext */
+    MemoryContextResetAndDeleteChildren(ErrorContext);
+}
+
+
+
+/*
+ * pg_re_throw --- out-of-line implementation of PG_RE_THROW() macro
+ *
+ * Jumps to the handler on top of PG_exception_stack if there is one;
+ * otherwise the pending ERROR is promoted to FATAL and handed to
+ * errfinish(). Either way this function is not expected to return.
+ */
+void
+pg_re_throw(void)
+{
+ /* If possible, throw the error to the next outer setjmp handler */
+ if (PG_exception_stack != NULL)
+ siglongjmp(*PG_exception_stack, 1);
+ else
+ {
+ /*
+ * If we get here, elog(ERROR) was thrown inside a PG_TRY block, which
+ * we have now exited only to discover that there is no outer setjmp
+ * handler to pass the error to. Had the error been thrown outside
+ * the block to begin with, we'd have promoted the error to FATAL, so
+ * the correct behavior is to make it FATAL now; that is, emit it and
+ * let errfinish() carry out the FATAL action.
+ */
+ ErrorData *edata = &errordata[errordata_stack_depth];
+
+ Assert(errordata_stack_depth >= 0);
+ Assert(edata->elevel == ERROR);
+ edata->elevel = FATAL;
+
+ /*
+ * At least in principle, the increase in severity could have changed
+ * where-to-output decisions, so recalculate. This should stay in
+ * sync with errstart(), which see for comments.
+ */
+ edata->output_to_server = is_log_level_output(FATAL,
+ log_min_messages);
+ edata->output_to_client = true;
+ errfinish(0);
+ }
+
+ /* We mustn't return... */
+ ExceptionalCondition("pg_re_throw tried to return", "FailedAssertion",
+ __FILE__, __LINE__);
+
+ /*
+ * Since ExceptionalCondition isn't declared noreturn because of
+ * TrapMacro(), we need this to keep gcc from complaining.
+ */
+ abort();
+}
+
+
+/*
+ * DebugFileOpen --- redirect error output to the GTM log file
+ *
+ * When GTMLogFile names a file, verify that it can be opened for appending,
+ * then reopen stderr on it; if the file turns out to be a tty, stdout is
+ * reopened on it as well.  Any failure is reported as FATAL.
+ *
+ * Nothing is done when GTMLogFile is NULL (its initial value) or empty.
+ */
+void
+DebugFileOpen(void)
+{
+    int         fd;
+    int         istty;
+
+    /* GTMLogFile starts out NULL, so it must be checked before indexing */
+    if (GTMLogFile == NULL || GTMLogFile[0] == '\0')
+        return;
+
+    /*
+     * A debug-output file name was given.
+     *
+     * Make sure we can write the file, and find out if it's a tty.
+     */
+    if ((fd = open(GTMLogFile, O_CREAT | O_APPEND | O_WRONLY,
+                   0666)) < 0)
+        ereport(FATAL,
+                (errno,
+                 errmsg("could not open file \"%s\": %m", GTMLogFile)));
+    istty = isatty(fd);
+    close(fd);
+
+    /*
+     * Redirect our stderr to the debug output file.
+     */
+    if (!freopen(GTMLogFile, "a", stderr))
+        ereport(FATAL,
+                (errno,
+                 errmsg("could not reopen file \"%s\" as stderr: %m",
+                        GTMLogFile)));
+
+    /*
+     * If the file is a tty, try to send stdout there as well (if it isn't a
+     * tty then stderr will block out stdout, so we may as well let stdout go
+     * wherever it was going before).
+     */
+    if (istty)
+        if (!freopen(GTMLogFile, "a", stdout))
+            ereport(FATAL,
+                    (errno,
+                     errmsg("could not reopen file \"%s\" as stdout: %m",
+                            GTMLogFile)));
+}
+
+/*
+ * emit_log_field --- append one prefixed "LABEL text" line to a log buffer
+ *
+ * Writes the line prefix, the label, the tab-expanded text and a newline.
+ */
+static void
+emit_log_field(StringInfo out, const char *label, const char *text)
+{
+    log_line_prefix(out);
+    appendStringInfoString(out, label);
+    append_with_tabs(out, text);
+    appendStringInfoChar(out, '\n');
+}
+
+/*
+ * send_message_to_server_log --- format an error report and log it
+ *
+ * The report consists of the prefixed severity/message line followed by
+ * optional DETAIL (detail_log is preferred over detail), HINT, CONTEXT and
+ * LOCATION lines.  The finished text is written to stderr when
+ * Log_destination includes LOG_DESTINATION_STDERR.
+ */
+static void
+send_message_to_server_log(ErrorData *edata)
+{
+    StringInfoData logbuf;
+    const char *detail_text;
+
+    initStringInfo(&logbuf);
+
+    formatted_log_time[0] = '\0';
+
+    /* primary line: prefix, severity, message */
+    log_line_prefix(&logbuf);
+    appendStringInfo(&logbuf, "%s: ", error_severity(edata->elevel));
+    append_with_tabs(&logbuf,
+                     edata->message ? edata->message : _("missing error text"));
+    appendStringInfoChar(&logbuf, '\n');
+
+    /* at most one DETAIL line; the log-only variant wins if present */
+    detail_text = edata->detail_log ? edata->detail_log : edata->detail;
+    if (detail_text)
+        emit_log_field(&logbuf, _("DETAIL: "), detail_text);
+
+    if (edata->hint)
+        emit_log_field(&logbuf, _("HINT: "), edata->hint);
+
+    if (edata->context)
+        emit_log_field(&logbuf, _("CONTEXT: "), edata->context);
+
+    /* source location; funcname and filename are assumed newline-free */
+    if (edata->filename)
+    {
+        if (edata->funcname)
+            appendStringInfo(&logbuf, _("LOCATION: %s, %s:%d\n"),
+                             edata->funcname, edata->filename,
+                             edata->lineno);
+        else
+            appendStringInfo(&logbuf, _("LOCATION: %s:%d\n"),
+                             edata->filename, edata->lineno);
+    }
+
+    /* Write to stderr, if enabled */
+    if (Log_destination & LOG_DESTINATION_STDERR)
+        write(fileno(stderr), logbuf.data, logbuf.len);
+
+    pfree(logbuf.data);
+}
+
+/*
+ * send_message_to_frontend --- send an error or notice message to a client
+ *
+ * Builds one protocol message on "myport": type 'E' for ERROR and above,
+ * 'N' otherwise; a GTM proxy header first when the port belongs to a
+ * proxy; then the severity, primary message, optional detail and hint
+ * fields, closed by a zero byte.  The message is flushed immediately.
+ */
+static void
+send_message_to_frontend(Port *myport, ErrorData *edata)
+{
+    StringInfoData out;
+    char        msgtype;
+    const char *primary;
+
+    /* 'N' (Notice) is for nonfatal conditions, 'E' is for errors */
+    if (edata->elevel < ERROR)
+        msgtype = 'N';
+    else
+        msgtype = 'E';
+    pq_beginmessage(&out, msgtype);
+
+    /* proxies expect their header, carrying the connection id, up front */
+    if (myport->is_proxy)
+    {
+        GTM_ProxyMsgHeader hdr;
+
+        hdr.ph_conid = myport->conn_id;
+        pq_sendbytes(&out, (char *) &hdr, sizeof (GTM_ProxyMsgHeader));
+    }
+
+    pq_sendbyte(&out, PG_DIAG_SEVERITY);
+    pq_sendstring(&out, error_severity(edata->elevel));
+
+    /* the primary message field is mandatory, so send a stand-in if unset */
+    primary = edata->message ? edata->message : _("missing error text");
+    pq_sendbyte(&out, PG_DIAG_MESSAGE_PRIMARY);
+    pq_sendstring(&out, primary);
+
+    if (edata->detail)
+    {
+        pq_sendbyte(&out, PG_DIAG_MESSAGE_DETAIL);
+        pq_sendstring(&out, edata->detail);
+    }
+
+    /* detail_log is deliberately never sent to the client */
+
+    if (edata->hint)
+    {
+        pq_sendbyte(&out, PG_DIAG_MESSAGE_HINT);
+        pq_sendstring(&out, edata->hint);
+    }
+
+    pq_sendbyte(&out, '\0');    /* terminator */
+
+    pq_endmessage(myport, &out);
+
+    /*
+     * Flush right away so the client learns what happened even if this
+     * thread dies before it would otherwise flush; error and notice
+     * messages are not a performance-critical path.
+     */
+    pq_flush(myport);
+}
+
+/*
+ * Support routines for formatting error messages.
+ */
+
+
+/*
+ * expand_fmt_string --- process special format codes in a format string
+ *
+ * vsnprintf does not understand %m, so every %m in 'fmt' is replaced here
+ * by the error text for edata->saved_errno.  Any '%' inside that text is
+ * doubled so that a later vsnprintf pass treats it literally.  All other
+ * two-character %-sequences (including %%) are copied through untouched,
+ * as is a lone '%' at the very end of the string.
+ *
+ * The result is a palloc'd string.
+ */
+static char *
+expand_fmt_string(const char *fmt, ErrorData *edata)
+{
+	StringInfoData out;
+	const char *p = fmt;
+
+	initStringInfo(&out);
+
+	while (*p != '\0')
+	{
+		char		c = *p++;
+
+		/* Ordinary character, or a '%' with nothing after it: copy as-is */
+		if (c != '%' || *p == '\0')
+		{
+			appendStringInfoCharMacro(&out, c);
+			continue;
+		}
+
+		/* Here c is '%' and *p is the character that follows it */
+		if (*p != 'm')
+		{
+			/* copy % and next char --- this avoids trouble with %%m */
+			appendStringInfoCharMacro(&out, '%');
+			appendStringInfoCharMacro(&out, *p);
+		}
+		else
+		{
+			/* Substitute the system error string, doubling any %'s */
+			const char *errtext;
+
+			errtext = useful_strerror(edata->saved_errno);
+			while (*errtext != '\0')
+			{
+				if (*errtext == '%')
+					appendStringInfoCharMacro(&out, '%');
+				appendStringInfoCharMacro(&out, *errtext);
+				errtext++;
+			}
+		}
+
+		/* step past the character that followed the '%' */
+		p++;
+	}
+
+	return out.data;
+}
+
+
+/*
+ * useful_strerror --- strerror() with a readable fallback
+ *
+ * Returns strerror(errnum) when that yields a non-empty string.  Otherwise
+ * a generic "operating system error N" text is formatted into a
+ * function-static buffer and that buffer is returned.
+ *
+ * NOTE(review): because the fallback buffer is static, a pointer returned
+ * for one bogus errno is overwritten by the next fallback call.
+ */
+static const char *
+useful_strerror(int errnum)
+{
+	/* this buffer is only used if errno has a bogus value */
+	static char errorstr_buf[48];
+	const char *text = strerror(errnum);
+
+	/* The common case: the C library knows this error number */
+	if (text != NULL && *text != '\0')
+		return text;
+
+	/*
+	 * Some strerror()s return an empty string for out-of-range errno.  This
+	 * is ANSI C spec compliant, but not exactly useful.
+	 */
+	snprintf(errorstr_buf, sizeof(errorstr_buf),
+	/*------
+	  translator: This string will be truncated at 47
+	  characters expanded. */
+			 _("operating system error %d"), errnum);
+
+	return errorstr_buf;
+}
+
+
+/*
+ * error_severity --- get localized string representing elevel
+ *
+ * All DEBUGn levels map to "DEBUG"; LOG and COMMERROR both map to "LOG".
+ * A level that matches none of the known constants yields the
+ * untranslated string "???".
+ */
+static const char *
+error_severity(int elevel)
+{
+	switch (elevel)
+	{
+		case DEBUG1:
+		case DEBUG2:
+		case DEBUG3:
+		case DEBUG4:
+		case DEBUG5:
+			return _("DEBUG");
+
+		case LOG:
+		case COMMERROR:
+			return _("LOG");
+
+		case INFO:
+			return _("INFO");
+
+		case NOTICE:
+			return _("NOTICE");
+
+		case WARNING:
+			return _("WARNING");
+
+		case ERROR:
+			return _("ERROR");
+
+		case ERROR2:
+			return _("ERROR2");
+
+		case FATAL:
+			return _("FATAL");
+
+		case PANIC:
+			return _("PANIC");
+
+		default:
+			break;
+	}
+
+	/* unrecognized level */
+	return "???";
+}
+
+
+/*
+ * append_with_tabs
+ *
+ * Copy 'str' onto the end of the StringInfo buffer 'buf', emitting a tab
+ * right after every newline that is copied.
+ */
+static void
+append_with_tabs(StringInfo buf, const char *str)
+{
+	const char *p;
+	char		ch;
+
+	for (p = str; (ch = *p) != '\0'; p++)
+	{
+		appendStringInfoCharMacro(buf, ch);
+		if (ch != '\n')
+			continue;
+		/* continuation lines are indented by one tab */
+		appendStringInfoCharMacro(buf, '\t');
+	}
+}
+
+
+/*
+ * write_stderr --- printf-style output straight to stderr
+ *
+ * Meant for use before ereport/elog can be used safely (memory context,
+ * GUC load etc).  The format string is passed through _() first, then the
+ * formatted text is written to stderr and the stream is flushed.
+ */
+void
+write_stderr(const char *fmt,...)
+{
+	const char *translated = _(fmt);
+	va_list		args;
+
+	va_start(args, fmt);
+
+	/* On Unix, we just fprintf to stderr */
+	vfprintf(stderr, translated, args);
+	fflush(stderr);
+
+	va_end(args);
+}
+
+
+/*
+ * is_log_level_output -- is elevel logically >= log_min_level?
+ *
+ * For this comparison LOG (and COMMERROR, which is treated the same) sorts
+ * out of numeric order, between ERROR and FATAL.  Generally this is the
+ * right thing for testing whether a message should go to the server log,
+ * whereas a simple >= test is correct for testing whether the message
+ * should go to the client.
+ */
+static bool
+is_log_level_output(int elevel, int log_min_level)
+{
+	/* A LOG-class message passes a LOG threshold or anything up to ERROR */
+	if (elevel == LOG || elevel == COMMERROR)
+		return (log_min_level == LOG || log_min_level <= ERROR);
+
+	/* Threshold is LOG but the message is not: only FATAL and up pass */
+	if (log_min_level == LOG)
+		return (elevel >= FATAL);
+
+	/* Neither is LOG: plain numeric comparison */
+	return (elevel >= log_min_level);
+}
diff --git a/src/gtm/common/gtm_list.c b/src/gtm/common/gtm_list.c
new file mode 100644
index 0000000000..3ea2ce76cb
--- /dev/null
+++ b/src/gtm/common/gtm_list.c
@@ -0,0 +1,863 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_list.c
+ * implementation for PostgreSQL generic linked list package
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL: pgsql/src/backend/nodes/list.c,v 1.70 2008/08/14 18:47:58 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "gtm/gtm_c.h"
+#include "gtm/gtm.h"
+#include "gtm/gtm_list.h"
+#include "gtm/memutils.h"
+#include "gtm/assert.h"
+
+#define equal(a, b) ((a) == (b)) /* plain == comparison of the two arguments */
+
+
+#ifdef USE_ASSERT_CHECKING
+/*
+ * check_list_invariants
+ *		Sanity-check the header of 'list' (assert-enabled builds only).
+ *
+ * NIL is always acceptable.  Otherwise the length must be positive, head
+ * and tail must both be set, lists of one or two cells must have head and
+ * tail linked as the length implies, and the tail must end the chain.
+ * When USE_ASSERT_CHECKING is not defined, calls expand to nothing.
+ */
+static void
+check_list_invariants(List *list)
+{
+	if (list != NIL)
+	{
+		Assert(list->length > 0);
+		Assert(list->head != NULL);
+		Assert(list->tail != NULL);
+
+		switch (list->length)
+		{
+			case 1:
+				Assert(list->head == list->tail);
+				break;
+			case 2:
+				Assert(list->head->next == list->tail);
+				break;
+			default:
+				break;
+		}
+
+		Assert(list->tail->next == NULL);
+	}
+}
+#else
+#define check_list_invariants(l)
+#endif   /* USE_ASSERT_CHECKING */
+
+/*
+ * new_list
+ *		Return a freshly allocated one-element List.
+ *
+ * Since empty non-NIL lists are invalid, the head cell is allocated here
+ * as well; head and tail both point at it and length is 1.  The cell's
+ * data is left undefined: the caller must fill it in.
+ *
+ * The parameter list is spelled (void) so that the definition is a real
+ * prototype, and the local variable no longer shadows the function name.
+ */
+static List *
+new_list(void)
+{
+	List	   *list;
+	ListCell   *head_cell;
+
+	head_cell = (ListCell *) palloc(sizeof(*head_cell));
+	head_cell->next = NULL;
+	/* head_cell->data is left undefined! */
+
+	list = (List *) palloc(sizeof(*list));
+	list->length = 1;
+	list->head = head_cell;
+	list->tail = head_cell;
+
+	return list;
+}
+
+/*
+ * new_head_cell
+ *		Push a newly allocated cell onto the front of 'list'.
+ *
+ * 'list' must be non-NIL.  The new cell's data is undefined on return;
+ * the caller is expected to fill it in.
+ */
+static void
+new_head_cell(List *list)
+{
+	ListCell   *cell = (ListCell *) palloc(sizeof(*cell));
+
+	/* link in front of the current head, then make it the head */
+	cell->next = list->head;
+	list->head = cell;
+
+	list->length += 1;
+}
+
+/*
+ * new_tail_cell
+ *		Attach a newly allocated cell at the end of 'list'.
+ *
+ * 'list' must be non-NIL.  The new cell's data is undefined on return;
+ * the caller is expected to fill it in.
+ */
+static void
+new_tail_cell(List *list)
+{
+	ListCell   *cell = (ListCell *) palloc(sizeof(*cell));
+
+	cell->next = NULL;
+
+	/* hang it off the old tail, then make it the tail */
+	list->tail->next = cell;
+	list->tail = cell;
+
+	list->length += 1;
+}
+
+/*
+ * lappend
+ *		Append the pointer 'datum' to 'list' and return the resulting list.
+ *
+ * A NIL input produces a brand-new list; otherwise the given list is
+ * extended in place.  Callers should always continue with the returned
+ * pointer rather than the one they passed in.
+ */
+List *
+lappend(List *list, void *datum)
+{
+	if (list != NIL)
+		new_tail_cell(list);
+	else
+		list = new_list();
+
+	/* in either case the tail cell is the one still awaiting its datum */
+	lfirst(list->tail) = datum;
+
+	check_list_invariants(list);
+	return list;
+}
+
+/*
+ * add_new_cell
+ *		Allocate a cell and splice it into 'list' right after 'prev_cell'.
+ *
+ * 'list' is assumed to be non-NIL and 'prev_cell' to be a non-NULL member
+ * of it.  The new cell's data is left undefined for the caller to fill in.
+ * Returns the new cell.
+ */
+static ListCell *
+add_new_cell(List *list, ListCell *prev_cell)
+{
+	ListCell   *cell = (ListCell *) palloc(sizeof(*cell));
+
+	/* cell->data is left undefined! */
+	cell->next = prev_cell->next;
+	prev_cell->next = cell;
+
+	list->length++;
+
+	/* inserting after the old tail makes the new cell the tail */
+	if (prev_cell == list->tail)
+		list->tail = cell;
+
+	return cell;
+}
+
+/*
+ * lappend_cell
+ *		Insert 'datum' into 'list' immediately after the cell 'prev'.
+ *
+ * 'list' must be non-NIL and 'prev' must be a non-NULL member of it.
+ * Returns the newly constructed cell.
+ */
+ListCell *
+lappend_cell(List *list, ListCell *prev, void *datum)
+{
+	ListCell   *inserted = add_new_cell(list, prev);
+
+	lfirst(inserted) = datum;
+
+	check_list_invariants(list);
+	return inserted;
+}
+
+/*
+ * lcons
+ *		Prepend 'datum' to 'list' and return the resulting list.
+ *
+ * A NIL input produces a brand-new list; otherwise the given list is
+ * modified in place.  Callers should always continue with the returned
+ * pointer rather than the one they passed as the second argument.
+ */
+List *
+lcons(void *datum, List *list)
+{
+	if (list != NIL)
+		new_head_cell(list);
+	else
+		list = new_list();
+
+	/* in either case the head cell is the one still awaiting its datum */
+	lfirst(list->head) = datum;
+
+	check_list_invariants(list);
+	return list;
+}
+
+/*
+ * list_concat
+ *		Destructively attach list2 to the end of list1.
+ *
+ * If either input is NIL the other one is returned unchanged.  Otherwise
+ * list1's header is updated and returned; callers should use the return
+ * value, which may or may not be the 'list1' pointer they passed.
+ *
+ * The cells of list2 are linked in, not copied, so the two lists share
+ * storage afterwards: list_free() on list2 also invalidates part of list1.
+ * Concatenating a list to itself is rejected with an error.
+ */
+List *
+list_concat(List *list1, List *list2)
+{
+	if (list1 == NIL)
+		return list2;
+	if (list2 == NIL)
+		return list1;
+	if (list1 == list2)
+		elog(ERROR, "cannot list_concat() a list to itself");
+
+	/* chain list2's cells onto list1's tail and adopt list2's tail */
+	list1->tail->next = list2->head;
+	list1->tail = list2->tail;
+	list1->length += list2->length;
+
+	check_list_invariants(list1);
+	return list1;
+}
+
+/*
+ * list_truncate
+ *		Cut 'list' down to at most 'new_size' elements, in place.
+ *
+ * A new_size of zero or less yields NIL; a new_size at or beyond the
+ * current length leaves the list untouched.  Callers should use the
+ * returned pointer, which may or may not equal the one passed in.
+ *
+ * Note that cells dropped by the truncation are NOT pfree'd.
+ */
+List *
+list_truncate(List *list, int new_size)
+{
+	ListCell   *cell;
+	int			pos;
+
+	if (new_size <= 0)
+		return NIL;				/* truncate to zero length */
+
+	/* If asked to effectively extend the list, do nothing */
+	if (new_size >= list_length(list))
+		return list;
+
+	/* Walk to the cell that becomes the new tail (positions count from 1) */
+	for (cell = list_head(list), pos = 1;
+		 cell != NULL;
+		 cell = lnext(cell), pos++)
+	{
+		if (pos != new_size)
+			continue;
+
+		cell->next = NULL;
+		list->tail = cell;
+		list->length = new_size;
+		check_list_invariants(list);
+		return list;
+	}
+
+	/* keep the compiler quiet; never reached */
+	Assert(false);
+	return list;
+}
+
+/*
+ * list_nth_cell
+ *		Return the n'th cell of 'list', counting from 0.
+ *
+ * It is an assertion failure if the list is NIL or has no such cell.
+ */
+static ListCell *
+list_nth_cell(List *list, int n)
+{
+	ListCell   *cell;
+
+	Assert(list != NIL);
+	Assert(n >= 0);
+	Assert(n < list->length);
+	check_list_invariants(list);
+
+	/* The last cell is reachable directly through the tail pointer */
+	if (n == list->length - 1)
+		return list->tail;
+
+	/* Otherwise step forward n times from the head */
+	cell = list->head;
+	while (n > 0)
+	{
+		cell = cell->next;
+		n--;
+	}
+
+	return cell;
+}
+
+/*
+ * list_nth
+ *		Return the pointer stored in the n'th element of 'list'.
+ *		(List elements begin at 0.)
+ */
+void *
+list_nth(List *list, int n)
+{
+	ListCell   *cell = list_nth_cell(list, n);
+
+	return lfirst(cell);
+}
+
+/*
+ * list_member
+ *		Return true iff 'datum' is a member of 'list'.
+ *
+ * Membership is tested with equal(), which this file defines as a plain
+ * == comparison of the stored pointer and 'datum'.
+ */
+bool
+list_member(List *list, void *datum)
+{
+	ListCell   *lc;
+
+	check_list_invariants(list);
+
+	for (lc = list_head(list); lc != NULL; lc = lnext(lc))
+	{
+		if (equal(lfirst(lc), datum))
+			return true;
+	}
+
+	/* walked the whole list without a match */
+	return false;
+}
+
+/*
+ * list_member_ptr
+ *		Return true iff some cell of 'list' holds exactly the pointer
+ *		'datum' (simple pointer comparison).
+ */
+bool
+list_member_ptr(List *list, void *datum)
+{
+	ListCell   *lc = NULL;
+	bool		found = false;
+
+	check_list_invariants(list);
+
+	for (lc = list_head(list); lc != NULL && !found; lc = lnext(lc))
+	{
+		if (lfirst(lc) == datum)
+			found = true;
+	}
+
+	return found;
+}
+
+/*
+ * list_delete_cell
+ *		Unlink and free 'cell' from 'list'.
+ *
+ * 'prev' is the cell preceding 'cell' in 'list', or NULL when 'cell' is
+ * the list head.  The cell is pfree'd.  If it was the only member, the
+ * whole list is freed and NIL (the only valid representation of an empty
+ * list) is returned; otherwise the same list pointer is returned.
+ */
+List *
+list_delete_cell(List *list, ListCell *cell, ListCell *prev)
+{
+	check_list_invariants(list);
+	Assert(prev != NULL ? lnext(prev) == cell : list_head(list) == cell);
+
+	/* Deleting the sole member: release everything and report NIL */
+	if (list->length == 1)
+	{
+		list_free(list);
+		return NIL;
+	}
+
+	/* Route the chain around the doomed cell */
+	if (prev != NULL)
+		prev->next = cell->next;
+	else
+		list->head = cell->next;
+
+	/* If the tail is going away, its predecessor becomes the tail */
+	if (cell == list->tail)
+		list->tail = prev;
+
+	list->length--;
+
+	pfree(cell);
+	return list;
+}
+
+/*
+ * list_delete
+ *		Delete the first cell of 'list' whose value matches 'datum', if any.
+ *
+ * Matching uses equal(), which this file defines as a plain == comparison.
+ * Returns the (possibly NIL) resulting list; the input is returned
+ * unmodified when nothing matches.
+ */
+List *
+list_delete(List *list, void *datum)
+{
+	ListCell   *before = NULL;
+	ListCell   *cur;
+
+	check_list_invariants(list);
+
+	for (cur = list_head(list); cur != NULL; before = cur, cur = lnext(cur))
+	{
+		if (equal(lfirst(cur), datum))
+			return list_delete_cell(list, cur, before);
+	}
+
+	/* Didn't find a match: return the list unmodified */
+	return list;
+}
+
+/*
+ * list_delete_ptr
+ *		Delete the first cell of 'list' holding exactly the pointer
+ *		'datum', if any (simple pointer equality).  The input list is
+ *		returned unmodified when no cell matches.
+ */
+List *
+list_delete_ptr(List *list, void *datum)
+{
+	ListCell   *cur;
+	ListCell   *before;
+
+	check_list_invariants(list);
+
+	before = NULL;
+	cur = list_head(list);
+	while (cur != NULL)
+	{
+		if (lfirst(cur) == datum)
+			return list_delete_cell(list, cur, before);
+
+		before = cur;
+		cur = lnext(cur);
+	}
+
+	/* Didn't find a match: return the list unmodified */
+	return list;
+}
+
+
+/*
+ * list_delete_first
+ *		Remove the first element of 'list' and return the remaining list.
+ *
+ * This replaces the Lisp-y idiom "list = lnext(list);" where the intent is
+ * to alter the list rather than just traverse it.  Beware that the removed
+ * cell is freed here, whereas the lnext() coding leaves the original head
+ * intact for any other pointer to it.  A NIL input simply returns NIL.
+ */
+List *
+list_delete_first(List *list)
+{
+	check_list_invariants(list);
+
+	if (list != NIL)
+		return list_delete_cell(list, list_head(list), NULL);
+
+	return NIL;					/* would an error be better? */
+}
+
+/*
+ * list_union
+ *		Build the union of two lists: a copy of list1 (via list_copy())
+ *		followed by every member of list2 not already present.
+ *
+ * Membership is tested with list_member(), i.e. via equal().
+ *
+ * The returned list is newly allocated, but its cells carry the same
+ * pointers as the inputs (pointed-to objects are not copied).
+ *
+ * NB: duplicates already present in list1 are NOT removed, so this is a
+ * true union only if list1 is known unique to start with.  Also, if you
+ * are about to write "x = list_union(x, y)" you probably want
+ * list_concat_unique() instead, to avoid wasting the cells of the old x.
+ *
+ * This function could probably be implemented a lot faster if it is a
+ * performance bottleneck.
+ */
+List *
+list_union(List *list1, List *list2)
+{
+	List	   *merged = list_copy(list1);
+	ListCell   *lc;
+
+	for (lc = list_head(list2); lc != NULL; lc = lnext(lc))
+	{
+		if (list_member(merged, lfirst(lc)))
+			continue;			/* already present */
+
+		merged = lappend(merged, lfirst(lc));
+	}
+
+	check_list_invariants(merged);
+	return merged;
+}
+
+/*
+ * list_union_ptr
+ *		Same as list_union(), except that duplicates are detected with
+ *		list_member_ptr(), i.e. by simple pointer comparison.
+ */
+List *
+list_union_ptr(List *list1, List *list2)
+{
+	List	   *merged = list_copy(list1);
+	ListCell   *lc;
+
+	for (lc = list_head(list2); lc != NULL; lc = lnext(lc))
+	{
+		if (list_member_ptr(merged, lfirst(lc)))
+			continue;			/* already present */
+
+		merged = lappend(merged, lfirst(lc));
+	}
+
+	check_list_invariants(merged);
+	return merged;
+}
+
+/*
+ * list_intersection
+ *		Return a new list of the list1 members that also appear in list2.
+ *
+ * The result is freshly allocated, but its cells hold the same pointers
+ * as the corresponding list1 cells.  Membership in list2 is tested with
+ * list_member(), i.e. via equal().
+ *
+ * Duplicate entries in list1 are not suppressed, so this is a true
+ * "intersection" only if list1 is known unique beforehand.
+ */
+List *
+list_intersection(List *list1, List *list2)
+{
+	List	   *common = NIL;
+	ListCell   *lc;
+
+	/* An empty operand means an empty intersection */
+	if (list1 == NIL || list2 == NIL)
+		return NIL;
+
+	for (lc = list_head(list1); lc != NULL; lc = lnext(lc))
+	{
+		if (!list_member(list2, lfirst(lc)))
+			continue;
+
+		common = lappend(common, lfirst(lc));
+	}
+
+	check_list_invariants(common);
+	return common;
+}
+
+/*
+ * list_difference
+ *		Return a new list of the list1 members that do not appear in list2.
+ *
+ * The result is freshly allocated, but its cells hold the same pointers
+ * as the corresponding list1 cells.  Membership in list2 is tested with
+ * list_member(), i.e. via equal().  When list2 is NIL the result is just
+ * a copy of list1.
+ */
+List *
+list_difference(List *list1, List *list2)
+{
+	List	   *remaining = NIL;
+	ListCell   *lc;
+
+	/* Nothing to subtract */
+	if (list2 == NIL)
+		return list_copy(list1);
+
+	for (lc = list_head(list1); lc != NULL; lc = lnext(lc))
+	{
+		if (list_member(list2, lfirst(lc)))
+			continue;
+
+		remaining = lappend(remaining, lfirst(lc));
+	}
+
+	check_list_invariants(remaining);
+	return remaining;
+}
+
+/*
+ * list_difference_ptr
+ *		Same as list_difference(), except that membership in list2 is
+ *		tested with list_member_ptr(), i.e. by simple pointer equality.
+ */
+List *
+list_difference_ptr(List *list1, List *list2)
+{
+	List	   *remaining = NIL;
+	ListCell   *lc;
+
+	/* Nothing to subtract */
+	if (list2 == NIL)
+		return list_copy(list1);
+
+	for (lc = list_head(list1); lc != NULL; lc = lnext(lc))
+	{
+		if (list_member_ptr(list2, lfirst(lc)))
+			continue;
+
+		remaining = lappend(remaining, lfirst(lc));
+	}
+
+	check_list_invariants(remaining);
+	return remaining;
+}
+
+/*
+ * list_append_unique
+ *		Append 'datum' to 'list' unless it is already a member.
+ *
+ * Membership is tested with list_member(), i.e. via equal().  Returns the
+ * resulting list, which is the input itself when nothing was added.
+ */
+List *
+list_append_unique(List *list, void *datum)
+{
+	if (!list_member(list, datum))
+		return lappend(list, datum);
+
+	return list;
+}
+
+/*
+ * list_append_unique_ptr
+ *		Same as list_append_unique(), except that membership is tested
+ *		with list_member_ptr(), i.e. by simple pointer equality.
+ */
+List *
+list_append_unique_ptr(List *list, void *datum)
+{
+	if (!list_member_ptr(list, datum))
+		return lappend(list, datum);
+
+	return list;
+}
+
+/*
+ * list_concat_unique
+ *		Append to list1 each member of list2 that isn't already in list1.
+ *
+ * Membership is tested with list_member(), i.e. via equal().
+ *
+ * This is almost the same functionality as list_union(), but list1 is
+ * modified in place rather than being copied.  Note also that new cells
+ * are allocated for the appended values; list2's own cells are not linked
+ * into list1, so the analogy to list_concat() isn't perfect.
+ */
+List *
+list_concat_unique(List *list1, List *list2)
+{
+	ListCell   *lc;
+
+	for (lc = list_head(list2); lc != NULL; lc = lnext(lc))
+	{
+		if (list_member(list1, lfirst(lc)))
+			continue;			/* already present */
+
+		list1 = lappend(list1, lfirst(lc));
+	}
+
+	check_list_invariants(list1);
+	return list1;
+}
+
+/*
+ * list_concat_unique_ptr
+ *		Same as list_concat_unique(), except that membership is tested
+ *		with list_member_ptr(), i.e. by simple pointer equality.
+ */
+List *
+list_concat_unique_ptr(List *list1, List *list2)
+{
+	ListCell   *lc;
+
+	for (lc = list_head(list2); lc != NULL; lc = lnext(lc))
+	{
+		if (list_member_ptr(list1, lfirst(lc)))
+			continue;			/* already present */
+
+		list1 = lappend(list1, lfirst(lc));
+	}
+
+	check_list_invariants(list1);
+	return list1;
+}
+
+/*
+ * list_free_private
+ *		Release every cell of 'list' and then the list header itself.
+ *
+ * When 'deep' is true the pointer stored in each cell is pfree'd as well.
+ * A NIL list is accepted and nothing is freed for it.
+ */
+static void
+list_free_private(List *list, bool deep)
+{
+	ListCell   *cell;
+	ListCell   *following;
+
+	check_list_invariants(list);
+
+	for (cell = list_head(list); cell != NULL; cell = following)
+	{
+		/* fetch the successor before this cell's memory is released */
+		following = lnext(cell);
+
+		if (deep)
+			pfree(lfirst(cell));
+		pfree(cell);
+	}
+
+	if (list)
+		pfree(list);
+}
+
+/*
+ * list_free
+ * Free all the cells of the list, as well as the list itself. Any
+ * objects that are pointed-to by the cells of the list are NOT
+ * free'd.
+ *
+ * This is list_free_private() with deep = false.
+ *
+ * On return, the argument to this function has been freed, so the
+ * caller would be wise to set it to NIL for safety's sake.
+ */
+void
+list_free(List *list)
+{
+ list_free_private(list, false);
+}
+
+/*
+ * list_free_deep
+ * Free all the cells of the list, the list itself, and all the
+ * objects pointed-to by the cells of the list (each element in the
+ * list must contain a pointer to a palloc()'d region of memory!)
+ *
+ * This is list_free_private() with deep = true.
+ *
+ * On return, the argument to this function has been freed, so the
+ * caller would be wise to set it to NIL for safety's sake.
+ */
+void
+list_free_deep(List *list)
+{
+ /*
+ * A "deep" free operation only makes sense on a list of pointers.
+ */
+ list_free_private(list, true);
+}
+
+/*
+ * list_copy
+ *		Return a shallow copy of 'oldlist'.
+ *
+ * New cells and a new header are allocated; each new cell receives the
+ * same data as the corresponding old cell, so pointed-to objects are
+ * shared between the two lists.  A NIL input yields NIL.
+ */
+List *
+list_copy(List *oldlist)
+{
+	List	   *copy;
+	ListCell   *last;
+	ListCell   *src;
+
+	if (oldlist == NIL)
+		return NIL;
+
+	copy = new_list();
+	copy->length = oldlist->length;
+
+	/* new_list() already allocated the head cell; just fill in its data */
+	copy->head->data = oldlist->head->data;
+
+	/* Duplicate the remaining cells, always appending after 'last' */
+	last = copy->head;
+	for (src = oldlist->head->next; src != NULL; src = src->next)
+	{
+		ListCell   *dup = (ListCell *) palloc(sizeof(*dup));
+
+		dup->data = src->data;
+		last->next = dup;
+		last = dup;
+	}
+
+	last->next = NULL;
+	copy->tail = last;
+
+	check_list_invariants(copy);
+	return copy;
+}
+
+/*
+ * list_copy_tail
+ *		Return a shallow copy of 'oldlist' without its first 'nskip'
+ *		elements.
+ *
+ * A negative nskip is treated as zero.  NIL is returned when the input is
+ * NIL or when nskip is at least the list length.  As with list_copy(),
+ * only the cells are duplicated; the data they carry is shared.
+ */
+List *
+list_copy_tail(List *oldlist, int nskip)
+{
+	List	   *copy;
+	ListCell   *last;
+	ListCell   *src;
+	int			i;
+
+	if (nskip < 0)
+		nskip = 0;				/* would it be better to elog? */
+
+	if (oldlist == NIL || nskip >= oldlist->length)
+		return NIL;
+
+	copy = new_list();
+	copy->length = oldlist->length - nskip;
+
+	/* Step over the unwanted leading elements */
+	src = oldlist->head;
+	for (i = 0; i < nskip; i++)
+		src = src->next;
+
+	/* new_list() already allocated the head cell; just fill in its data */
+	copy->head->data = src->data;
+
+	/* Duplicate the remaining cells, always appending after 'last' */
+	last = copy->head;
+	for (src = src->next; src != NULL; src = src->next)
+	{
+		ListCell   *dup = (ListCell *) palloc(sizeof(*dup));
+
+		dup->data = src->data;
+		last->next = dup;
+		last = dup;
+	}
+
+	last->next = NULL;
+	copy->tail = last;
+
+	check_list_invariants(copy);
+	return copy;
+}
+
+/*
+ * When using non-GCC compilers, we can't define these as inline
+ * functions in pg_list.h, so they are defined here.
+ *
+ * TODO: investigate supporting inlining for some non-GCC compilers.
+ */
+#ifndef __GNUC__
+
+ListCell *
+list_head(List *l)
+{
+	/* an empty (NIL) list has no head cell */
+	if (!l)
+		return NULL;
+
+	return l->head;
+}
+
+ListCell *
+list_tail(List *l)
+{
+	/* an empty (NIL) list has no tail cell */
+	if (!l)
+		return NULL;
+
+	return l->tail;
+}
+
+int
+list_length(List *l)
+{
+	/* an empty (NIL) list has length zero */
+	if (!l)
+		return 0;
+
+	return l->length;
+}
+#endif /* ! __GNUC__ */
+
+/*
+ * Temporary compatibility functions
+ *
+ * In order to avoid warnings for these function definitions, we need
+ * to include a prototype here as well as in pg_list.h. That's because
+ * we don't enable list API compatibility in list.c, so we
+ * don't see the prototypes for these functions.
+ */
+
+/*
+ * length
+ * Given a list, return its length by calling list_length(). This is
+ * merely defined for the sake of backward compatibility: we can't
+ * afford to define a macro called "length", so it must be a function.
+ * New code should use list_length() directly in order to avoid the
+ * overhead of an extra function call.
+ *
+ * The prototype is repeated just below so that this definition does not
+ * draw a missing-prototype warning.
+ */
+int length(List *list);
+
+int
+length(List *list)
+{
+ return list_length(list);
+}
diff --git a/src/gtm/common/gtm_lock.c b/src/gtm/common/gtm_lock.c
new file mode 100644
index 0000000000..c919730c90
--- /dev/null
+++ b/src/gtm/common/gtm_lock.c
@@ -0,0 +1,206 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_lock.c
+ * Handling for locks in GTM
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "gtm/gtm_c.h"
+#include "gtm/gtm_lock.h"
+#include "gtm/elog.h"
+
+/*
+ * Acquire the requested lock. Block if the lock is not available.
+ *
+ * 'mode' selects a write (GTM_LOCKMODE_WRITE) or read (GTM_LOCKMODE_READ)
+ * acquisition of lock->lk_lock.  Returns true when the underlying pthread
+ * call reports success (zero) and false otherwise.  Any other mode is
+ * reported through elog(ERROR); should that call ever return, false is
+ * returned rather than examining an uninitialized status.
+ *
+ * TODO We should track the locks acquired in the thread specific context. If an
+ * error is thrown and caught, we don't want to keep holding on to those locks
+ * since that would lead to a deadlock. Right now, we assume that the caller
+ * will appropriately catch errors and release the locks sanely.
+ */
+bool
+GTM_RWLockAcquire(GTM_RWLock *lock, GTM_LockMode mode)
+{
+	int			status;
+
+	switch (mode)
+	{
+		case GTM_LOCKMODE_WRITE:
+			status = pthread_rwlock_wrlock(&lock->lk_lock);
+			break;
+
+		case GTM_LOCKMODE_READ:
+			status = pthread_rwlock_rdlock(&lock->lk_lock);
+			break;
+
+		default:
+			elog(ERROR, "Invalid lockmode");
+			/* no lock was taken; never fall through with status unset */
+			return false;
+	}
+
+	return status ? false : true;
+}
+
+/*
+ * Release a previously acquired read/write lock.
+ *
+ * Returns true when pthread_rwlock_unlock() reports success (zero),
+ * false otherwise.
+ */
+bool
+GTM_RWLockRelease(GTM_RWLock *lock)
+{
+	int			rc = pthread_rwlock_unlock(&lock->lk_lock);
+
+	if (rc != 0)
+		return false;
+
+	return true;
+}
+
+/*
+ * Initialize a read/write lock
+ *
+ * lock->lk_lock is initialized with default attributes (NULL attribute
+ * argument). The pthread_rwlock_init() result is returned unchanged, so
+ * unlike the acquire/release routines this returns an int status (per
+ * POSIX, zero on success or an error number), not a bool.
+ */
+int
+GTM_RWLockInit(GTM_RWLock *lock)
+{
+ return pthread_rwlock_init(&lock->lk_lock, NULL);
+}
+
+/*
+ * Destroy a read/write lock
+ *
+ * The pthread_rwlock_destroy() result for lock->lk_lock is returned
+ * unchanged (per POSIX, zero on success or an error number).
+ */
+int
+GTM_RWLockDestroy(GTM_RWLock *lock)
+{
+ return pthread_rwlock_destroy(&lock->lk_lock);
+}
+
+/*
+ * Conditionally acquire a lock. If the lock is not available, the function
+ * immediately returns without blocking.
+ *
+ * 'mode' selects a write (GTM_LOCKMODE_WRITE) or read (GTM_LOCKMODE_READ)
+ * attempt on lock->lk_lock.
+ *
+ * Returns true if lock is successfully acquired. Otherwise returns false.
+ * Any other mode is reported through elog(ERROR); should that call ever
+ * return, false is returned rather than examining an uninitialized status.
+ */
+bool
+GTM_RWLockConditionalAcquire(GTM_RWLock *lock, GTM_LockMode mode)
+{
+	int			status;
+
+	switch (mode)
+	{
+		case GTM_LOCKMODE_WRITE:
+			status = pthread_rwlock_trywrlock(&lock->lk_lock);
+			break;
+
+		case GTM_LOCKMODE_READ:
+			status = pthread_rwlock_tryrdlock(&lock->lk_lock);
+			break;
+
+		default:
+			elog(ERROR, "Invalid lockmode");
+			/* no lock was taken; never fall through with status unset */
+			return false;
+	}
+
+	return status ? false : true;
+}
+
+/*
+ * Initialize a mutex lock
+ *
+ * lock->lk_lock is initialized with default attributes (NULL attribute
+ * argument). The pthread_mutex_init() result is returned unchanged (per
+ * POSIX, zero on success or an error number).
+ */
+int
+GTM_MutexLockInit(GTM_MutexLock *lock)
+{
+ return pthread_mutex_init(&lock->lk_lock, NULL);
+}
+
+/*
+ * Destroy a mutex lock
+ *
+ * The pthread_mutex_destroy() result for lock->lk_lock is returned
+ * unchanged (per POSIX, zero on success or an error number).
+ */
+int
+GTM_MutexLockDestroy(GTM_MutexLock *lock)
+{
+ return pthread_mutex_destroy(&lock->lk_lock);
+}
+
+/*
+ * Acquire a mutex lock, blocking until it is available.
+ *
+ * Returns true when pthread_mutex_lock() reports success (zero),
+ * false otherwise.
+ */
+bool
+GTM_MutexLockAcquire(GTM_MutexLock *lock)
+{
+	if (pthread_mutex_lock(&lock->lk_lock) != 0)
+		return false;
+
+	return true;
+}
+
+/*
+ * Release a previously acquired mutex lock.
+ *
+ * Returns true when pthread_mutex_unlock() reports success (zero) and
+ * false otherwise, matching the convention of the other acquire/release
+ * routines in this file.  The raw pthread status must not be returned
+ * directly: it is zero on success, which would read as "false" to a
+ * caller testing the bool result.
+ */
+bool
+GTM_MutexLockRelease(GTM_MutexLock *lock)
+{
+	int			status = pthread_mutex_unlock(&lock->lk_lock);
+
+	return status ? false : true;
+}
+
+/*
+ * Try to acquire a mutex lock without blocking.
+ *
+ * If the lock is not available the function returns immediately.
+ * Returns true when pthread_mutex_trylock() reports success (zero),
+ * i.e. the lock was acquired; returns false otherwise.
+ */
+bool
+GTM_MutexLockConditionalAcquire(GTM_MutexLock *lock)
+{
+	if (pthread_mutex_trylock(&lock->lk_lock) != 0)
+		return false;
+
+	return true;
+}
+
+/*
+ * Initialize a condition variable
+ *
+ * cv->cv_condvar is initialized with default attributes (NULL attribute
+ * argument). The pthread_cond_init() result is returned unchanged (per
+ * POSIX, zero on success or an error number).
+ */
+int
+GTM_CVInit(GTM_CV *cv)
+{
+ return pthread_cond_init(&cv->cv_condvar, NULL);
+}
+
+/*
+ * Destroy the condition variable
+ *
+ * The pthread_cond_destroy() result for cv->cv_condvar is returned
+ * unchanged (per POSIX, zero on success or an error number).
+ */
+int
+GTM_CVDestroy(GTM_CV *cv)
+{
+ return pthread_cond_destroy(&cv->cv_condvar);
+}
+
+/*
+ * Wake up all the threads waiting on this condition variable
+ *
+ * The pthread_cond_broadcast() result for cv->cv_condvar is returned
+ * unchanged (per POSIX, zero on success or an error number).
+ */
+int
+GTM_CVBcast(GTM_CV *cv)
+{
+ return pthread_cond_broadcast(&cv->cv_condvar);
+}
+
+/*
+ * Wake up only one thread waiting on this condition variable
+ *
+ * The pthread_cond_signal() result for cv->cv_condvar is returned
+ * unchanged (per POSIX, zero on success or an error number).
+ */
+int
+GTM_CVSignal(GTM_CV *cv)
+{
+ return pthread_cond_signal(&cv->cv_condvar);
+}
+
+/*
+ * Wait on a condition variable. The caller must have acquired the mutex lock
+ * 'lock' already; its lk_lock is the mutex handed to pthread_cond_wait()
+ * together with cv->cv_condvar.
+ *
+ * The pthread_cond_wait() result is returned unchanged (per POSIX, zero on
+ * success or an error number).
+ */
+int
+GTM_CVWait(GTM_CV *cv, GTM_MutexLock *lock)
+{
+ return pthread_cond_wait(&cv->cv_condvar, &lock->lk_lock);
+}
+
diff --git a/src/gtm/common/mcxt.c b/src/gtm/common/mcxt.c
new file mode 100644
index 0000000000..9325ae3c82
--- /dev/null
+++ b/src/gtm/common/mcxt.c
@@ -0,0 +1,763 @@
+/*-------------------------------------------------------------------------
+ *
+ * mcxt.c
+ * POSTGRES memory context management code.
+ *
+ * This module handles context management operations that are independent
+ * of the particular kind of context being operated on. It calls
+ * context-type-specific operations via the function pointers in a
+ * context's MemoryContextMethods struct.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL: pgsql/src/backend/utils/mmgr/mcxt.c,v 1.65 2008/06/28 16:45:22 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+
+#include "gtm/gtm_c.h"
+#include "gtm/memutils.h"
+#include "gtm/elog.h"
+#include "gtm/assert.h"
+#include "gtm/gtm.h"
+
+
+/*****************************************************************************
+ * GLOBAL MEMORY *
+ *****************************************************************************/
+
+/*
+ * Standard top-level contexts. For a description of the purpose of each
+ * of these contexts, refer to src/backend/utils/mmgr/README
+ */
+
+/* Forward declarations of the file-local recursive helpers defined below */
+static void MemoryContextStatsInternal(MemoryContext context, int level);
+static void MemoryContextDeleteInternal(MemoryContext context, bool parent_locked);
+
+/*
+ * Set by MemoryContextInit() to the context it creates as TopMemoryContext.
+ */
+MemoryContext TopMostMemoryContext;
+
+/*****************************************************************************
+ * EXPORTED ROUTINES *
+ *****************************************************************************/
+
+
+/*
+ * MemoryContextInit
+ *      Bring up the memory-context subsystem.
+ *
+ * No context may be created and no memory may be allocated from a context
+ * until this has run.  It creates TopMemoryContext and ErrorContext,
+ * records the former in TopMostMemoryContext and installs it as the
+ * current context; all other contexts are up to the caller.
+ *
+ * TopMemoryContext must still be NULL on entry.
+ */
+void
+MemoryContextInit(void)
+{
+    AssertState(TopMemoryContext == NULL);
+
+    /*
+     * TopMemoryContext is an AllocSetContext with a slow growth rate, since
+     * little is expected to be allocated in it.  MemoryContextCreate() has
+     * special-case code for this very call.
+     *
+     * This context is shared between different threads and must be made
+     * thread-safe.
+     */
+    TopMemoryContext = AllocSetContextCreate((MemoryContext) NULL,
+                                             "TopMemoryContext",
+                                             0,
+                                             8 * 1024,
+                                             8 * 1024,
+                                             true);
+
+    /*
+     * There is nowhere better to point CurrentMemoryContext yet, so aim it
+     * at TopMemoryContext; the caller should change this soon.
+     */
+    CurrentMemoryContext = TopMemoryContext;
+
+    TopMostMemoryContext = TopMemoryContext;
+
+    /*
+     * ErrorContext is likewise a slow-growing AllocSetContext, but it is
+     * required to hold at least 8K at all times.  This is the one case
+     * where retained memory is essential: ErrorContext must still have
+     * some memory even if we have run out everywhere else.
+     *
+     * Like the top context, it may as well be shared between threads.
+     */
+    ErrorContext = AllocSetContextCreate(TopMemoryContext,
+                                         "ErrorContext",
+                                         8 * 1024,
+                                         8 * 1024,
+                                         8 * 1024,
+                                         true);
+}
+
+/*
+ * MemoryContextReset
+ *      Release all space allocated within a context and its descendants,
+ *      but don't delete the contexts themselves.
+ *
+ * The type-specific reset routine handles the context itself, but we
+ * have to do the recursion for the children.
+ *
+ * For a shared context the child list is walked under the context's lock.
+ * The walk is done here rather than by calling
+ * MemoryContextResetChildren(), because that routine acquires the same
+ * lock itself; calling it with the lock already held would relock the
+ * context and self-deadlock.
+ */
+void
+MemoryContextReset(MemoryContext context)
+{
+    MemoryContext child;
+    bool        is_shared;
+
+    AssertArg(MemoryContextIsValid(context));
+
+    is_shared = MemoryContextIsShared(context);
+
+    if (is_shared)
+        MemoryContextLock(context);
+
+    /* Each child handles its own lock and its own descendants */
+    for (child = context->firstchild; child != NULL; child = child->nextchild)
+        MemoryContextReset(child);
+
+    if (is_shared)
+        MemoryContextUnlock(context);
+
+    /* The context's own reset method is called after the lock is dropped */
+    (*context->methods->reset) (context);
+}
+
+/*
+ * MemoryContextResetChildren
+ *      Reset every descendant of the given context without deleting any
+ *      of them.  The given context's own allocations are left alone.
+ *
+ * When the context is shared, its lock is held for the whole walk over
+ * the child list.
+ */
+void
+MemoryContextResetChildren(MemoryContext context)
+{
+    MemoryContext kid;
+    bool        shared;
+
+    AssertArg(MemoryContextIsValid(context));
+
+    shared = MemoryContextIsShared(context);
+
+    /* Guard the parent's child list while we iterate over it */
+    if (shared)
+        MemoryContextLock(context);
+
+    kid = context->firstchild;
+    while (kid != NULL)
+    {
+        MemoryContextReset(kid);
+        kid = kid->nextchild;
+    }
+
+    if (shared)
+        MemoryContextUnlock(context);
+}
+
+/*
+ * MemoryContextDeleteInternal
+ *      Delete a context together with all of its descendants and release
+ *      everything allocated in them.
+ *
+ * The type-specific delete routine frees the subsidiary storage; this
+ * routine recurses into the children, unhooks the node from its parent
+ * (if any) and finally frees the node itself.
+ *
+ * parent_locked tells us that the caller already holds the lock of a
+ * shared parent, in which case it must not be taken again here.
+ */
+static void
+MemoryContextDeleteInternal(MemoryContext context, bool parent_locked)
+{
+    MemoryContext parent;
+
+    AssertArg(MemoryContextIsValid(context));
+    /* Deleting TopMemoryContext would be a very bad idea ... */
+    Assert(context != TopMemoryContext);
+    /* ... and so would deleting CurrentMemoryContext */
+    Assert(context != CurrentMemoryContext);
+
+    MemoryContextDeleteChildren(context);
+
+    /*
+     * Unhook the node from its parent before destroying it.  Should an
+     * error occur during deletion, no half-destroyed context remains
+     * reachable through the tree: better a leak than a crash.
+     */
+    parent = context->parent;
+    if (parent != NULL)
+    {
+        MemoryContext *link;
+        bool        take_lock;
+
+        /*
+         * Relocking a shared parent that the caller already holds would
+         * self-deadlock, so only lock when the caller has not.
+         */
+        take_lock = MemoryContextIsShared(parent) && (!parent_locked);
+
+        if (take_lock)
+            MemoryContextLock(parent);
+
+        /* Find the pointer that refers to us and route it past us */
+        for (link = &parent->firstchild; *link != NULL; link = &(*link)->nextchild)
+        {
+            if (*link == context)
+            {
+                *link = context->nextchild;
+                break;
+            }
+        }
+
+        if (take_lock)
+            MemoryContextUnlock(parent);
+    }
+
+    (*context->methods->delete) (context);
+    pfree(context);
+}
+
+/*
+ * MemoryContextDelete
+ *      Public entry point for deleting a context and its descendants.
+ *
+ * Forwards to MemoryContextDeleteInternal() with parent_locked = false, so
+ * that routine takes the parent's lock itself when the parent is shared.
+ */
+void
+MemoryContextDelete(MemoryContext context)
+{
+ MemoryContextDeleteInternal(context, false);
+}
+
+/*
+ * MemoryContextDeleteChildren
+ *      Delete every descendant of the given context and release all the
+ *      space allocated in them.  The given context itself is left alone.
+ *
+ * When the context is shared, its lock is held while the children are
+ * deleted.
+ */
+void
+MemoryContextDeleteChildren(MemoryContext context)
+{
+    bool        shared;
+
+    AssertArg(MemoryContextIsValid(context));
+
+    shared = MemoryContextIsShared(context);
+
+    if (shared)
+        MemoryContextLock(context);
+
+    /*
+     * Each deletion unlinks the child from this context, so keep taking
+     * the head of the list until it is empty.  We pass parent_locked =
+     * true so the child does not try to lock this context again.
+     */
+    for (;;)
+    {
+        MemoryContext kid = context->firstchild;
+
+        if (kid == NULL)
+            break;
+        MemoryContextDeleteInternal(kid, true);
+    }
+
+    if (shared)
+        MemoryContextUnlock(context);
+}
+
+/*
+ * MemoryContextResetAndDeleteChildren
+ *      Keep the given context but get rid of everything under it: all
+ *      descendants are deleted and the context's own space is released.
+ */
+void
+MemoryContextResetAndDeleteChildren(MemoryContext context)
+{
+    AssertArg(MemoryContextIsValid(context));
+
+    /* Descendants go first, then the context's own allocations */
+    MemoryContextDeleteChildren(context);
+    context->methods->reset(context);
+}
+
+/*
+ * GetMemoryChunkSpace
+ *      Report the total space taken up by a currently-allocated chunk,
+ *      allocation overhead included.
+ *
+ * Handy for adding up the space occupied by a set of chunks.
+ */
+Size
+GetMemoryChunkSpace(void *pointer)
+{
+    StandardChunkHeader *hdr;
+    MemoryContext owner;
+
+    /*
+     * Weed out obviously bogus pointers as best we can: anything that is
+     * not MAXALIGNED presumably does not point at an allocated chunk.
+     */
+    Assert(pointer != NULL);
+    Assert(pointer == (void *) MAXALIGN(pointer));
+
+    /* The standard header sits immediately in front of the chunk */
+    hdr = (StandardChunkHeader *)
+        ((char *) pointer - STANDARDCHUNKHEADERSIZE);
+    owner = hdr->context;
+
+    AssertArg(MemoryContextIsValid(owner));
+
+    return owner->methods->get_chunk_space(owner, pointer);
+}
+
+/*
+ * GetMemoryChunkContext
+ *      Report which context a currently-allocated chunk belongs to.
+ */
+MemoryContext
+GetMemoryChunkContext(void *pointer)
+{
+    StandardChunkHeader *hdr;
+    MemoryContext owner;
+
+    /*
+     * Weed out obviously bogus pointers as best we can: anything that is
+     * not MAXALIGNED presumably does not point at an allocated chunk.
+     */
+    Assert(pointer != NULL);
+    Assert(pointer == (void *) MAXALIGN(pointer));
+
+    /* The standard header sits immediately in front of the chunk */
+    hdr = (StandardChunkHeader *)
+        ((char *) pointer - STANDARDCHUNKHEADERSIZE);
+    owner = hdr->context;
+
+    AssertArg(MemoryContextIsValid(owner));
+
+    return owner;
+}
+
+/*
+ * MemoryContextIsEmpty
+ *      Does the context hold no allocated space at all?
+ *
+ * For the time being a context with any child counts as non-empty
+ * (perhaps this should change later); otherwise the answer comes from
+ * the type-specific is_empty method.
+ */
+bool
+MemoryContextIsEmpty(MemoryContext context)
+{
+    AssertArg(MemoryContextIsValid(context));
+
+    return (context->firstchild != NULL)
+        ? false
+        : (*context->methods->is_empty) (context);
+}
+
+/*
+ * MemoryContextStats
+ * Print statistics about the named context and all its descendants.
+ *
+ * This is just a debugging utility, so it's not fancy. The statistics
+ * are merely sent to stderr.
+ *
+ * Starts the recursive walk in MemoryContextStatsInternal() at nesting
+ * level 0.
+ *
+ * NOTE(review): the walk follows firstchild/nextchild without taking the
+ * lock of shared contexts - confirm that callers only use this when no
+ * other thread can be modifying the tree.
+ */
+void
+MemoryContextStats(MemoryContext context)
+{
+ MemoryContextStatsInternal(context, 0);
+}
+
+/*
+ * Recursive worker for MemoryContextStats: invoke the stats method of the
+ * given context at the given nesting level, then visit each child one
+ * level deeper.
+ */
+static void
+MemoryContextStatsInternal(MemoryContext context, int level)
+{
+    MemoryContext kid;
+
+    AssertArg(MemoryContextIsValid(context));
+
+    context->methods->stats(context, level);
+
+    kid = context->firstchild;
+    while (kid != NULL)
+    {
+        MemoryContextStatsInternal(kid, level + 1);
+        kid = kid->nextchild;
+    }
+}
+
+/*
+ * MemoryContextCheck
+ *      Run the type-specific check method on the given context and then,
+ *      recursively, on each of its descendants.
+ *
+ * Debugging aid only; compiled in when MEMORY_CONTEXT_CHECKING is defined.
+ */
+#ifdef MEMORY_CONTEXT_CHECKING
+void
+MemoryContextCheck(MemoryContext context)
+{
+    MemoryContext kid;
+
+    AssertArg(MemoryContextIsValid(context));
+
+    context->methods->check(context);
+
+    kid = context->firstchild;
+    while (kid != NULL)
+    {
+        MemoryContextCheck(kid);
+        kid = kid->nextchild;
+    }
+}
+#endif
+
+/*
+ * MemoryContextContains
+ *      Tell whether an allocated chunk belongs to the given context.
+ *
+ * Caution: the answer is reliable only if 'pointer' really refers to a
+ * chunk allocated from *some* context.  For memory obtained any other
+ * way there is a small chance of a false positive, because the bytes in
+ * front of it may happen to look like a valid chunk header.
+ */
+bool
+MemoryContextContains(MemoryContext context, void *pointer)
+{
+    StandardChunkHeader *hdr;
+
+    /*
+     * Weed out obviously bogus pointers as best we can: anything that is
+     * NULL or not MAXALIGNED presumably is not an allocated chunk.
+     */
+    if (pointer == NULL)
+        return false;
+    if (pointer != (void *) MAXALIGN(pointer))
+        return false;
+
+    /* The standard header sits immediately in front of the chunk */
+    hdr = (StandardChunkHeader *)
+        ((char *) pointer - STANDARDCHUNKHEADERSIZE);
+
+    /* A different context link certainly means a non-member chunk */
+    if (hdr->context != context)
+        return false;
+
+    /* A sane-looking size is an extra guard against bogus pointers */
+    return AllocSizeIsValid(hdr->size) ? true : false;
+}
+
+/*--------------------
+ * MemoryContextCreate
+ * Context-type-independent part of context creation.
+ *
+ * This is only intended to be called by context-type-specific
+ * context creation routines, not by the unwashed masses.
+ *
+ * The context creation procedure is a little bit tricky because
+ * we want to be sure that we don't leave the context tree invalid
+ * in case of failure (such as insufficient memory to allocate the
+ * context node itself). The procedure goes like this:
+ * 1. Context-type-specific routine first calls MemoryContextCreate(),
+ * passing the appropriate tag/size/methods values (the methods
+ * pointer will ordinarily point to statically allocated data).
+ * The parent and name parameters usually come from the caller.
+ * 2. MemoryContextCreate() attempts to allocate the context node,
+ * plus space for the name. If this fails we can ereport() with no
+ * damage done.
+ * 3. We fill in all of the type-independent MemoryContext fields.
+ * 4. We call the type-specific init routine (using the methods pointer).
+ * The init routine is required to make the node minimally valid
+ * with zero chance of failure --- it can't allocate more memory,
+ * for example.
+ * 5. Now we have a minimally valid node that can behave correctly
+ * when told to reset or delete itself. We link the node to its
+ * parent (if any), making the node part of the context tree.
+ * 6. We return to the context-type-specific routine, which finishes
+ * up type-specific initialization. This routine can now do things
+ * that might fail (like allocate more memory), so long as it's
+ * sure the node is left in a state that delete will handle.
+ *
+ * This protocol doesn't prevent us from leaking memory if step 6 fails
+ * during creation of a top-level context, since there's no parent link
+ * in that case. However, if you run out of memory while you're building
+ * a top-level context, you might as well go home anyway...
+ *
+ * Normally, the context node and the name are allocated from
+ * TopMemoryContext (NOT from the parent context, since the node must
+ * survive resets of its parent context!). However, this routine is itself
+ * used to create TopMemoryContext! If we see that TopMemoryContext is NULL,
+ * we assume we are creating TopMemoryContext and use malloc() to allocate
+ * the node.
+ *
+ * Note that the name field of a MemoryContext does not point to
+ * separately-allocated storage, so it should not be freed at context
+ * deletion.
+ *--------------------
+ */
+MemoryContext
+MemoryContextCreate(Size size,
+                    MemoryContextMethods *methods,
+                    MemoryContext parent,
+                    const char *name)
+{
+    Size        namelen = strlen(name) + 1;     /* terminator included */
+    Size        needed = size + namelen;
+    MemoryContext node;
+
+    /* The node and its name share a single allocation */
+    if (TopMemoryContext == NULL)
+    {
+        /* Startup: TopMemoryContext itself is being built, use malloc */
+        node = (MemoryContext) malloc(needed);
+        Assert(node != NULL);
+    }
+    else
+    {
+        /* Usual case: the node lives in TopMemoryContext */
+        node = (MemoryContext) MemoryContextAlloc(TopMemoryContext,
+                                                  needed);
+    }
+
+    /* Fill in the type-independent fields; the name follows the node */
+    MemSet(node, 0, size);
+    node->methods = methods;
+    node->parent = NULL;        /* linked in further down */
+    node->firstchild = NULL;
+    node->nextchild = NULL;
+    node->name = ((char *) node) + size;
+    strcpy(node->name, name);
+
+    /* Let the type-specific code finish the essential initialization */
+    (*node->methods->init) (node);
+
+    /* Push the node onto the front of the parent's child list, if any */
+    if (parent != NULL)
+    {
+        bool        shared = MemoryContextIsShared(parent);
+
+        /* A shared parent's child list is only changed under its lock */
+        if (shared)
+            MemoryContextLock(parent);
+
+        node->parent = parent;
+        node->nextchild = parent->firstchild;
+        parent->firstchild = node;
+
+        if (shared)
+            MemoryContextUnlock(parent);
+    }
+
+    /* Back to the type-specific creation routine to finish up */
+    return node;
+}
+
+/*
+ * MemoryContextAlloc
+ *      Hand out 'size' bytes from the given context.
+ *
+ * A size rejected by AllocSizeIsValid() is reported through elog(ERROR).
+ */
+void *
+MemoryContextAlloc(MemoryContext context, Size size)
+{
+    void       *chunk;
+
+    AssertArg(MemoryContextIsValid(context));
+
+    if (!AllocSizeIsValid(size))
+        elog(ERROR, "invalid memory alloc request size %lu",
+             (unsigned long) size);
+
+    chunk = context->methods->alloc(context, size);
+
+    return chunk;
+}
+
+/*
+ * MemoryContextAllocZero
+ *      Same as MemoryContextAlloc, except that the returned memory is
+ *      cleared to zeroes.
+ *
+ * Allocate-then-clear is common enough to deserve a combined routine.
+ */
+void *
+MemoryContextAllocZero(MemoryContext context, Size size)
+{
+    void       *chunk;
+
+    AssertArg(MemoryContextIsValid(context));
+
+    if (!AllocSizeIsValid(size))
+        elog(ERROR, "invalid memory alloc request size %lu",
+             (unsigned long) size);
+
+    chunk = context->methods->alloc(context, size);
+    MemSetAligned(chunk, 0, size);
+
+    return chunk;
+}
+
+/*
+ * MemoryContextAllocZeroAligned
+ *      MemoryContextAllocZero for lengths that are suitable for
+ *      MemSetLoop.
+ *
+ * Less specialized than it looks: newNode() is very often called with
+ * sizes that are compile-time constants.
+ */
+void *
+MemoryContextAllocZeroAligned(MemoryContext context, Size size)
+{
+    void       *chunk;
+
+    AssertArg(MemoryContextIsValid(context));
+
+    if (!AllocSizeIsValid(size))
+        elog(ERROR, "invalid memory alloc request size %lu",
+             (unsigned long) size);
+
+    chunk = context->methods->alloc(context, size);
+    MemSetLoop(chunk, 0, size);
+
+    return chunk;
+}
+
+/*
+ * pfree
+ *      Give an allocated chunk back to the context it came from.
+ */
+void
+pfree(void *pointer)
+{
+    StandardChunkHeader *hdr;
+    MemoryContext owner;
+
+    /*
+     * Weed out obviously bogus pointers as best we can: anything that is
+     * not MAXALIGNED presumably does not point at an allocated chunk.
+     */
+    Assert(pointer != NULL);
+    Assert(pointer == (void *) MAXALIGN(pointer));
+
+    /* The standard header sits immediately in front of the chunk */
+    hdr = (StandardChunkHeader *)
+        ((char *) pointer - STANDARDCHUNKHEADERSIZE);
+    owner = hdr->context;
+
+    AssertArg(MemoryContextIsValid(owner));
+
+    owner->methods->free_p(owner, pointer);
+}
+
+/*
+ * repalloc
+ *      Change the size of a previously allocated chunk, using the realloc
+ *      method of the context recorded in the chunk's header.
+ *
+ * A size rejected by AllocSizeIsValid() is reported through elog(ERROR).
+ */
+void *
+repalloc(void *pointer, Size size)
+{
+    StandardChunkHeader *hdr;
+    MemoryContext owner;
+
+    /*
+     * Weed out obviously bogus pointers as best we can: anything that is
+     * not MAXALIGNED presumably does not point at an allocated chunk.
+     */
+    Assert(pointer != NULL);
+    Assert(pointer == (void *) MAXALIGN(pointer));
+
+    /* The standard header sits immediately in front of the chunk */
+    hdr = (StandardChunkHeader *)
+        ((char *) pointer - STANDARDCHUNKHEADERSIZE);
+    owner = hdr->context;
+
+    AssertArg(MemoryContextIsValid(owner));
+
+    if (!AllocSizeIsValid(size))
+        elog(ERROR, "invalid memory alloc request size %lu",
+             (unsigned long) size);
+
+    return owner->methods->realloc(owner, pointer, size);
+}
+
+/*
+ * MemoryContextSwitchTo
+ *      Make the given context current and return the one that was
+ *      current before the call.
+ *
+ * This is inlined when using GCC.
+ *
+ * TODO: investigate supporting inlining for some non-GCC compilers.
+ */
+MemoryContext
+MemoryContextSwitchTo(MemoryContext context)
+{
+    MemoryContext previous;
+
+    AssertArg(MemoryContextIsValid(context));
+
+    previous = CurrentMemoryContext;
+    CurrentMemoryContext = context;
+
+    return previous;
+}
+
+/*
+ * MemoryContextStrdup
+ *      strdup() work-alike whose copy is allocated from the given context.
+ */
+char *
+MemoryContextStrdup(MemoryContext context, const char *string)
+{
+    Size        nbytes = strlen(string) + 1;    /* terminator included */
+    char       *copy;
+
+    copy = (char *) MemoryContextAlloc(context, nbytes);
+
+    return (char *) memcpy(copy, string, nbytes);
+}
+
+/*
+ * pnstrdup
+ *      Copy the first 'len' bytes of 'in' into freshly palloc'd memory
+ *      and append a null byte.  The input does not have to be
+ *      null-terminated.
+ */
+char *
+pnstrdup(const char *in, Size len)
+{
+    char       *copy;
+
+    copy = palloc(len + 1);
+    memcpy(copy, in, len);
+    copy[len] = '\0';
+
+    return copy;
+}
+
+
+#if defined(WIN32) || defined(__CYGWIN__)
+/*
+ * Memory support routines for libpgport on Win32
+ *
+ * Win32 can't load a library that PGDLLIMPORTs a variable
+ * if the link object files also PGDLLIMPORT the same variable.
+ * For this reason, libpgport can't reference CurrentMemoryContext
+ * in the palloc macro calls.
+ *
+ * To fix this, we create several functions here that allow us to
+ * manage memory without doing the inline in libpgport.
+ */
+
+/* Out-of-line wrapper around palloc(sz). */
+void *
+pgport_palloc(Size sz)
+{
+ return palloc(sz);
+}
+
+
+/* Out-of-line wrapper around pstrdup(str). */
+char *
+pgport_pstrdup(const char *str)
+{
+ return pstrdup(str);
+}
+
+
+/* Doesn't reference a PGDLLIMPORT variable, but here for completeness. */
+void
+pgport_pfree(void *pointer)
+{
+ pfree(pointer);
+}
+
+#endif
diff --git a/src/gtm/common/stringinfo.c b/src/gtm/common/stringinfo.c
new file mode 100644
index 0000000000..5023bd9893
--- /dev/null
+++ b/src/gtm/common/stringinfo.c
@@ -0,0 +1,280 @@
+/*-------------------------------------------------------------------------
+ *
+ * stringinfo.c
+ *
+ * StringInfo provides an indefinitely-extensible string data type.
+ * It can be used to buffer either ordinary C strings (null-terminated text)
+ * or arbitrary binary data. All storage is allocated with palloc().
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/backend/lib/stringinfo.c,v 1.49 2008/01/01 19:45:49 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "gtm/gtm_c.h"
+#include "gtm/stringinfo.h"
+#include "gtm/memutils.h"
+#include "gtm/elog.h"
+#include "gtm/assert.h"
+#include "gtm/gtm.h"
+
+
+/*
+ * makeStringInfo
+ *
+ * palloc a StringInfoData, initialize it to the empty string and return a
+ * pointer to it.
+ */
+StringInfo
+makeStringInfo(void)
+{
+    StringInfo  fresh = (StringInfo) palloc(sizeof(StringInfoData));
+
+    initStringInfo(fresh);
+    return fresh;
+}
+
+/*
+ * initStringInfo
+ *
+ * Turn a StringInfoData whose contents are so far undefined into a valid
+ * empty string backed by a freshly palloc'd 1024-byte buffer.
+ */
+void
+initStringInfo(StringInfo str)
+{
+    const int   initial_size = 1024;    /* default starting capacity */
+
+    str->data = (char *) palloc(initial_size);
+    str->maxlen = initial_size;
+
+    /* Terminate the buffer and zero the length and cursor */
+    resetStringInfo(str);
+}
+
+/*
+ * resetStringInfo
+ *
+ * Empty the StringInfo.  The data buffer is kept and stays valid; only its
+ * previous content, if any, is discarded.
+ */
+void
+resetStringInfo(StringInfo str)
+{
+    str->cursor = 0;
+    str->len = 0;
+    str->data[0] = '\0';
+}
+
+/*
+ * appendStringInfo
+ *
+ * Format text under the control of fmt (an sprintf-style format string)
+ * and append the result to whatever str already holds, growing the buffer
+ * as needed.  Roughly sprintf and strcat rolled into one.
+ */
+void
+appendStringInfo(StringInfo str, const char *fmt,...)
+{
+    bool        done;
+
+    do
+    {
+        va_list     args;
+
+        /* The argument list has to be restarted for every attempt */
+        va_start(args, fmt);
+        done = appendStringInfoVA(str, fmt, args);
+        va_end(args);
+
+        /* Did not fit: double the buffer size before trying again */
+        if (!done)
+            enlargeStringInfo(str, str->maxlen);
+    } while (!done);
+}
+
+/*
+ * appendStringInfoVA
+ *
+ * Attempt to format text data under the control of fmt (an sprintf-style
+ * format string) and append it to whatever is already in str. If successful
+ * return true; if not (because there's not enough space), return false
+ * without modifying str. Typically the caller would enlarge str and retry
+ * on false return --- see appendStringInfo for standard usage pattern.
+ *
+ * XXX This API is ugly, but there seems no alternative given the C spec's
+ * restrictions on what can portably be done with va_list arguments: you have
+ * to redo va_start before you can rescan the argument list, and we can't do
+ * that from here.
+ */
+bool
+appendStringInfoVA(StringInfo str, const char *fmt, va_list args)
+{
+ int avail,
+ nprinted;
+
+ Assert(str != NULL);
+
+ /*
+ * If there's hardly any space, don't bother trying, just fail to make the
+ * caller enlarge the buffer first.
+ */
+ /* avail is the unused part of the buffer less one byte */
+ avail = str->maxlen - str->len - 1;
+ if (avail < 16)
+ return false;
+
+ /*
+ * Assert check here is to catch buggy vsnprintf that overruns the
+ * specified buffer length. Solaris 7 in 64-bit mode is an example of a
+ * platform with such a bug.
+ */
+#ifdef USE_ASSERT_CHECKING
+ /* sentinel byte examined by the Assert after the vsnprintf call */
+ str->data[str->maxlen - 1] = '\0';
+#endif
+
+ /* format directly behind the existing content */
+ nprinted = vsnprintf(str->data + str->len, avail, fmt, args);
+
+ Assert(str->data[str->maxlen - 1] == '\0');
+
+ /*
+ * Note: some versions of vsnprintf return the number of chars actually
+ * stored, but at least one returns -1 on failure. Be conservative about
+ * believing whether the print worked.
+ */
+ if (nprinted >= 0 && nprinted < avail - 1)
+ {
+ /* Success. Note nprinted does not include trailing null. */
+ str->len += nprinted;
+ return true;
+ }
+
+ /* Restore the trailing null so that str is unmodified. */
+ str->data[str->len] = '\0';
+ return false;
+}
+
+/*
+ * appendStringInfoString
+ *
+ * Append the null-terminated string s to str.  Same effect as
+ * appendStringInfo(str, "%s", s), only faster.
+ */
+void
+appendStringInfoString(StringInfo str, const char *s)
+{
+    int         slen = strlen(s);
+
+    appendBinaryStringInfo(str, s, slen);
+}
+
+/*
+ * appendStringInfoChar
+ *
+ * Append the single byte ch to str.  Same effect as
+ * appendStringInfo(str, "%c", ch), only much faster.
+ */
+void
+appendStringInfoChar(StringInfo str, char ch)
+{
+    /* Grow the buffer unless the byte and a terminator both still fit */
+    if (str->maxlen <= str->len + 1)
+        enlargeStringInfo(str, 1);
+
+    /* Store the byte, then put the terminator back behind it */
+    str->data[str->len++] = ch;
+    str->data[str->len] = '\0';
+}
+
+/*
+ * appendBinaryStringInfo
+ *
+ * Append datalen bytes of arbitrary binary data to a StringInfo, growing
+ * the buffer if that is necessary.
+ */
+void
+appendBinaryStringInfo(StringInfo str, const char *data, int datalen)
+{
+    char       *dest;
+
+    Assert(str != NULL);
+
+    /* Make sure the new bytes fit */
+    enlargeStringInfo(str, datalen);
+
+    /* Compute the target only after enlarging, since str->data may change */
+    dest = str->data + str->len;
+    memcpy(dest, data, datalen);
+    str->len += datalen;
+
+    /*
+     * Maintain the trailing null, even though it is probably of no use
+     * for binary data.
+     */
+    str->data[str->len] = '\0';
+}
+
+/*
+ * enlargeStringInfo
+ *
+ * Make sure there is enough space for 'needed' more bytes
+ * ('needed' does not include the terminating null).
+ *
+ * External callers usually need not concern themselves with this, since
+ * all stringinfo.c routines do it automatically. However, if a caller
+ * knows that a StringInfo will eventually become X bytes large, it
+ * can save some palloc overhead by enlarging the buffer before starting
+ * to store data in it.
+ *
+ * NB: because we use repalloc() to enlarge the buffer, the string buffer
+ * will remain allocated in the same memory context that was current when
+ * initStringInfo was called, even if another context is now current.
+ * This is the desired and indeed critical behavior!
+ */
+void
+enlargeStringInfo(StringInfo str, int needed)
+{
+ int newlen;
+
+ /*
+ * Guard against out-of-range "needed" values. Without this, we can get
+ * an overflow or infinite loop in the following.
+ */
+ if (needed < 0) /* should not happen */
+ elog(ERROR, "invalid string enlargement request size: %d", needed);
+ /* reject requests that would take the total to MaxAllocSize or beyond */
+ if (((Size) needed) >= (MaxAllocSize - (Size) str->len))
+ ereport(ERROR,
+ (ENOSPC,
+ errmsg("out of memory"),
+ errdetail("Cannot enlarge string buffer containing %d bytes by %d more bytes.",
+ str->len, needed)));
+
+ /* from here on 'needed' is the total size, terminator included */
+ needed += str->len + 1; /* total space required now */
+
+ /* Because of the above test, we now have needed <= MaxAllocSize */
+
+ if (needed <= str->maxlen)
+ return; /* got enough space already */
+
+ /*
+ * We don't want to allocate just a little more space with each append;
+ * for efficiency, double the buffer size each time it overflows.
+ * Actually, we might need to more than double it if 'needed' is big...
+ */
+ newlen = 2 * str->maxlen;
+ while (needed > newlen)
+ newlen = 2 * newlen;
+
+ /*
+ * Clamp to MaxAllocSize in case we went past it. Note we are assuming
+ * here that MaxAllocSize <= INT_MAX/2, else the above loop could
+ * overflow. We will still have newlen >= needed.
+ */
+ if (newlen > (int) MaxAllocSize)
+ newlen = (int) MaxAllocSize;
+
+ /* str->data may point somewhere else after this call */
+ str->data = (char *) repalloc(str->data, newlen);
+
+ str->maxlen = newlen;
+}
diff --git a/src/gtm/gtm_ctl/Makefile b/src/gtm/gtm_ctl/Makefile
new file mode 100644
index 0000000000..eddcc9aebe
--- /dev/null
+++ b/src/gtm/gtm_ctl/Makefile
@@ -0,0 +1,22 @@
+# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+
+top_build_dir=../..
+include $(top_build_dir)/gtm/Makefile.global
+
+# gtm_ctl.o plus the static archives it is linked against
+OBJS=gtm_ctl.o ../common/libgtm.a ../libpq/libpqcomm.a ../client/libgtmclient.a ../path/libgtmpath.a
+LDFLAGS=-L$(top_build_dir)/common -L$(top_build_dir)/libpq
+
+LIBS=-lpthread
+
+# None of these targets names a file that the rule produces
+.PHONY: all clean distclean maintainer-clean
+
+# Libraries are listed after the objects and archives that reference them.
+# A linker that resolves symbols left to right, or that drops libraries
+# nothing has referenced yet (--as-needed), would otherwise discard
+# -lpthread before any user of it has been seen.
+gtm_ctl:$(OBJS)
+	$(CC) $(CFLAGS) $(LDFLAGS) $^ $(LIBS) -o gtm_ctl
+
+all:gtm_ctl
+
+clean:
+	rm -f $(OBJS)
+	rm -f gtm_ctl
+
+distclean: clean
+
+maintainer-clean: distclean
diff --git a/src/gtm/gtm_ctl/gtm_ctl.c b/src/gtm/gtm_ctl/gtm_ctl.c
new file mode 100644
index 0000000000..3b01796484
--- /dev/null
+++ b/src/gtm/gtm_ctl/gtm_ctl.c
@@ -0,0 +1,918 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_ctl --- starts/stops/restarts the GTM server/proxy
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "gtm/gtm_c.h"
+#include "gtm/libpq-fe.h"
+
+#include <locale.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#ifdef HAVE_SYS_RESOURCE_H
+#include <sys/time.h>
+#include <sys/resource.h>
+#endif
+
+#include "libpq/pqsignal.h"
+
+/* PID can be negative for standalone backend */
+typedef long pgpid_t;
+
+/* Shutdown flavor chosen with -m; set_mode() pairs each value with a signal. */
+typedef enum
+{
+ SMART_MODE,
+ FAST_MODE,
+ IMMEDIATE_MODE
+} ShutdownMode;
+
+
+/* Operation requested on the command line ("start", "stop" or "restart"). */
+typedef enum
+{
+ NO_COMMAND = 0,
+ START_COMMAND,
+ STOP_COMMAND,
+ RESTART_COMMAND,
+} CtlCommand;
+
+/* Default number of one-second polls used by the wait loops. */
+#define DEFAULT_WAIT 60
+
+/* do_wait: wait for the operation to complete; wait_set: -w/-W was given. */
+static bool do_wait = false;
+static bool wait_set = false;
+/* Upper bound on one-second polls; overridden by -t. */
+static int wait_seconds = DEFAULT_WAIT;
+/* When true, print_msg() prints nothing. */
+static bool silent_mode = false;
+/* Both are updated together by set_mode(); sig is what do_stop/do_restart send. */
+static ShutdownMode shutdown_mode = SMART_MODE;
+static int sig = SIGTERM; /* default */
+static CtlCommand ctl_command = NO_COMMAND;
+/* Data directory taken from the GTMDATA environment variable in main(). */
+static char *gtm_data = NULL;
+/* "-D \"dir\" " fragment built from the -D option, passed on the command line. */
+static char *gtmdata_opt = NULL;
+/* Extra server options: from -o, or filled in by read_gtm_opts(). */
+static char *gtm_opts = NULL;
+static const char *progname;
+/* -l argument: file the started server's output is appended to. */
+static char *log_file = NULL;
+/* -p argument: directory holding the gtm / gtm_proxy executables. */
+static char *gtm_path = NULL;
+/* -S argument: "gtm" or "gtm_proxy". */
+static char *gtm_app = NULL;
+static char *argv0 = NULL;
+
+/* Forward declarations for the file-local helpers defined below. */
+static void
+write_stderr(const char *fmt,...)
+/* This extension allows gcc to check the format string for consistency with
+ the supplied arguments. */
+__attribute__((format(printf, 1, 2)));
+static void *pg_malloc(size_t size);
+static char *xstrdup(const char *s);
+static void do_advice(void);
+static void do_help(void);
+static void set_mode(char *modeopt);
+static void do_start(void);
+static void do_stop(void);
+static void do_restart(void);
+static void print_msg(const char *msg);
+
+static pgpid_t get_pgpid(void);
+static char **readfile(const char *path);
+static int start_gtm(void);
+static void read_gtm_opts(void);
+
+static bool test_gtm_connection();
+static bool gtm_is_alive(pid_t pid);
+
+/*
+ * Paths of the options file and PID file; main() fills both in from the data
+ * directory and the -S launch name before running any command.
+ */
+static char gtmopts_file[MAXPGPATH];
+static char pid_file[MAXPGPATH];
+
+/*
+ * printf-style error reporting: format the arguments and send the result
+ * to stderr.
+ */
+static void
+write_stderr(const char *fmt,...)
+{
+	va_list		args;
+
+	/* On Unix, we just fprintf to stderr */
+	va_start(args, fmt);
+	vfprintf(stderr, fmt, args);
+	va_end(args);
+}
+
+/*
+ * routines to check memory allocations and fail noisily.
+ */
+
+/*
+ * malloc() wrapper that never returns NULL: when the allocation fails it
+ * reports "out of memory" on stderr and terminates with exit(1).
+ */
+static void *
+pg_malloc(size_t size)
+{
+	void	   *mem = malloc(size);
+
+	if (mem != NULL)
+		return mem;
+
+	write_stderr(_("%s: out of memory\n"), progname);
+	exit(1);
+}
+
+
+/*
+ * strdup() wrapper that never returns NULL: when the copy cannot be
+ * allocated it reports "out of memory" on stderr and terminates with exit(1).
+ */
+static char *
+xstrdup(const char *s)
+{
+	char	   *copy = strdup(s);
+
+	if (copy != NULL)
+		return copy;
+
+	write_stderr(_("%s: out of memory\n"), progname);
+	exit(1);
+}
+
+/*
+ * Write an already-localized informational message to stdout and flush it
+ * right away.  Nothing is printed while silent_mode is set.
+ */
+static void
+print_msg(const char *msg)
+{
+	if (silent_mode)
+		return;
+
+	fputs(msg, stdout);
+	fflush(stdout);
+}
+
+/*
+ * Read the PID stored in pid_file.
+ *
+ * Returns 0 when the file does not exist.  Any other failure to open it, or
+ * contents that do not start with an integer, is reported on stderr and
+ * terminates gtm_ctl with exit(1).
+ */
+static pgpid_t
+get_pgpid(void)
+{
+	FILE	   *fp = fopen(pid_file, "r");
+	long		value;
+	int			nfields;
+
+	if (fp == NULL)
+	{
+		if (errno != ENOENT)
+		{
+			write_stderr(_("%s: could not open PID file \"%s\": %s\n"),
+						 progname, pid_file, strerror(errno));
+			exit(1);
+		}
+
+		/* No pid file, not an error on startup */
+		return 0;
+	}
+
+	nfields = fscanf(fp, "%ld", &value);
+	if (nfields != 1)
+	{
+		write_stderr(_("%s: invalid data in PID file \"%s\"\n"),
+					 progname, pid_file);
+		exit(1);
+	}
+
+	fclose(fp);
+	return (pgpid_t) value;
+}
+
+
+/*
+ * Load a text file as a NULL-terminated array of lines.
+ *
+ * Each element is a separately allocated copy of one line, still carrying
+ * its trailing newline when the file had one.  Returns NULL if the file
+ * cannot be opened.
+ */
+static char **
+readfile(const char *path)
+{
+	FILE	   *fp;
+	char	  **lines;
+	char	   *linebuf;
+	int			longest = 0;
+	int			curlen = 0;
+	int			count = 0;
+	int			ch;
+
+	fp = fopen(path, "r");
+	if (fp == NULL)
+		return NULL;
+
+	/* first pass: count the lines and find the longest one */
+	for (ch = fgetc(fp); ch != EOF; ch = fgetc(fp))
+	{
+		curlen++;
+		if (ch != '\n')
+			continue;
+
+		count++;
+		if (curlen > longest)
+			longest = curlen;
+		curlen = 0;
+	}
+
+	/* a final line without a terminating newline still counts */
+	if (curlen > 0)
+	{
+		count++;
+		if (curlen > longest)
+			longest = curlen;
+	}
+
+	/* one extra slot for the NULL terminator, one extra byte for '\0' */
+	lines = (char **) pg_malloc((count + 1) * sizeof(char *));
+	linebuf = (char *) pg_malloc(longest + 1);
+
+	/* second pass: copy every line into the result array */
+	rewind(fp);
+	count = 0;
+	while (fgets(linebuf, longest + 1, fp) != NULL)
+	{
+		lines[count] = xstrdup(linebuf);
+		count++;
+	}
+
+	fclose(fp);
+	free(linebuf);
+	lines[count] = NULL;
+
+	return lines;
+}
+
+
+
+/*
+ * start/test/stop routines
+ */
+
+/*
+ * Launch the server selected with -S in the background through the shell.
+ *
+ * The executable is "<gtm_path>/<gtm_app>" when -p was given, otherwise just
+ * gtm_app.  Its stdin is redirected from DEVNULL and, when -l was given, its
+ * output is appended to log_file.
+ *
+ * Returns the value of system(), or -1 without running anything when the
+ * executable path or the whole command does not fit in MAXPGPATH bytes.
+ */
+static int
+start_gtm(void)
+{
+	char		exe[MAXPGPATH];
+	char		cmd[MAXPGPATH];
+	/* either may still be unset; never hand NULL to a %s conversion */
+	const char *datadir_opt = (gtmdata_opt != NULL) ? gtmdata_opt : "";
+	const char *extra_opts = (gtm_opts != NULL) ? gtm_opts : "";
+	int			len;
+
+	/*
+	 * Build the executable path in a local buffer.  gtm_path comes from
+	 * xstrdup(), so it has no spare room to append to in place.
+	 */
+	if (gtm_path != NULL)
+		len = snprintf(exe, sizeof(exe), "%s/%s", gtm_path, gtm_app);
+	else
+		len = snprintf(exe, sizeof(exe), "%s", gtm_app);
+
+	if (len < 0 || (size_t) len >= sizeof(exe))
+	{
+		write_stderr(_("%s: path to server executable is too long\n"),
+					 progname);
+		return -1;
+	}
+
+	/*
+	 * Since there might be quotes to handle here, it is easier simply to pass
+	 * everything to a shell to process them.
+	 */
+	if (log_file != NULL)
+		len = snprintf(cmd, sizeof(cmd),
+					   SYSTEMQUOTE "\"%s\" %s%s < \"%s\" >> \"%s\" 2>&1 &" SYSTEMQUOTE,
+					   exe, datadir_opt, extra_opts,
+					   DEVNULL, log_file);
+	else
+		len = snprintf(cmd, sizeof(cmd),
+					   SYSTEMQUOTE "\"%s\" %s%s < \"%s\" 2>&1 &" SYSTEMQUOTE,
+					   exe, datadir_opt, extra_opts, DEVNULL);
+
+	/* refuse to run a command that was cut off in the middle */
+	if (len < 0 || (size_t) len >= sizeof(cmd))
+	{
+		write_stderr(_("%s: command line to start server is too long\n"),
+					 progname);
+		return -1;
+	}
+
+	return system(cmd);
+}
+
+
+
+/*
+ * Poll the freshly started server until it accepts a connection.
+ *
+ * The port is taken from the last "-p" switch found in gtm_opts.  Up to
+ * wait_seconds connection attempts are made against localhost, one second
+ * apart, printing a dot after each failure.  Returns true as soon as one
+ * attempt succeeds, false if none does.
+ */
+static bool
+test_gtm_connection()
+{
+	GTM_Conn   *gtm_conn;
+	char		port[32];
+	char		conninfo[128];	/* Should be way more than enough! */
+	char	   *scan;
+	char	   *end;
+	int			attempt;
+
+	port[0] = '\0';
+
+	/*
+	 * Look in gtm_opts for a -p switch.
+	 *
+	 * This scan is deliberately simple and can be fooled, for instance by
+	 * ' -p' inside a quoted argument value.  Complicated settings in
+	 * gtm_opts are rare, so that is accepted.
+	 */
+	scan = gtm_opts;
+	while (*scan)
+	{
+		/* skip leading whitespace */
+		while (isspace((unsigned char) *scan))
+			scan++;
+
+		if (strncmp(scan, "-p", 2) == 0)
+		{
+			scan += 2;
+
+			/* skip whitespace and opening quotes before the value */
+			while (isspace((unsigned char) *scan) || *scan == '\'' || *scan == '"')
+				scan++;
+
+			/* the value ends at whitespace, a quote, or end of string */
+			for (end = scan; *end; end++)
+			{
+				if (isspace((unsigned char) *end) || *end == '\'' || *end == '"')
+					break;
+			}
+
+			strlcpy(port, scan, Min((end - scan) + 1, sizeof(port)));
+
+			/* keep scanning: a later -p overrides this one */
+			scan = end;
+		}
+
+		/* move on to the next whitespace */
+		while (*scan && !isspace((unsigned char) *scan))
+			scan++;
+	}
+
+	/*
+	 * We need to set a connect timeout otherwise on Windows the SCM will
+	 * probably timeout first
+	 */
+	snprintf(conninfo, sizeof(conninfo),
+			 "host=localhost port=%s connect_timeout=5", port);
+
+	for (attempt = 0; attempt < wait_seconds; attempt++)
+	{
+		gtm_conn = PQconnectGTM(conninfo);
+		if (gtm_conn != NULL && GTMPQstatus(gtm_conn) == CONNECTION_OK)
+		{
+			GTMPQfinish(gtm_conn);
+			return true;
+		}
+
+		GTMPQfinish(gtm_conn);
+		print_msg(".");
+		sleep(1);				/* 1 sec */
+	}
+
+	return false;
+}
+
+/*
+ * Make sure gtm_opts is set.
+ *
+ * Options given with -o are left alone.  Otherwise gtm_opts defaults to the
+ * empty string and, for "restart" only, is reloaded from gtmopts_file.  That
+ * file must contain exactly one line; an unreadable or malformed file is
+ * reported on stderr and terminates gtm_ctl with exit(1).
+ */
+static void
+read_gtm_opts(void)
+{
+	char	  **optlines;
+	char	   *optline;
+	char	   *first_arg;
+
+	if (gtm_opts != NULL)
+		return;
+
+	gtm_opts = "";				/* default */
+	if (ctl_command != RESTART_COMMAND)
+		return;
+
+	optlines = readfile(gtmopts_file);
+	if (optlines == NULL)
+	{
+		write_stderr(_("%s: could not read file \"%s\"\n"), progname, gtmopts_file);
+		exit(1);
+	}
+	if (optlines[0] == NULL || optlines[1] != NULL)
+	{
+		write_stderr(_("%s: option file \"%s\" must have exactly one line\n"),
+					 progname, gtmopts_file);
+		exit(1);
+	}
+
+	optline = optlines[0];
+	/* trim off line endings */
+	optline[strcspn(optline, "\r\n")] = '\0';
+
+	/*
+	 * NOTE(review): presumably the file uses the postmaster.opts layout,
+	 * i.e. the program name followed by each argument written as ` "arg"`;
+	 * confirm against the code that writes the opts file.  Under that
+	 * assumption the options start at the first space-plus-double-quote.
+	 * Without one, gtm_opts keeps its empty default.
+	 */
+	first_arg = strstr(optline, " \"");
+	if (first_arg != NULL)
+		gtm_opts = first_arg + 1;	/* point past the whitespace */
+}
+
+/*
+ * Implement the "start" command; also the second half of "restart".
+ *
+ * Launches the server and, when waiting was requested, polls it until it
+ * accepts connections.  Every failure terminates gtm_ctl with exit(1).
+ */
+static void
+do_start(void)
+{
+	pgpid_t		prev_pid = 0;
+	pgpid_t		cur_pid;
+	int			rc;
+
+	/* a leftover PID file only earns a warning, never a refusal */
+	if (ctl_command != RESTART_COMMAND)
+	{
+		prev_pid = get_pgpid();
+		if (prev_pid != 0)
+			write_stderr(_("%s: another server might be running; "
+						   "trying to start server anyway\n"),
+						 progname);
+	}
+
+	read_gtm_opts();
+
+	rc = start_gtm();
+	if (rc != 0)
+	{
+		write_stderr(_("%s: could not start server: exit code was %d\n"),
+					 progname, rc);
+		exit(1);
+	}
+
+	/* an unchanged PID file after one second means nothing new started */
+	if (prev_pid != 0)
+	{
+		sleep(1);
+		cur_pid = get_pgpid();
+		if (cur_pid == prev_pid)
+		{
+			write_stderr(_("%s: could not start server\n"
+						   "Examine the log output.\n"),
+						 progname);
+			exit(1);
+		}
+	}
+
+	if (!do_wait)
+	{
+		print_msg(_("server starting\n"));
+		return;
+	}
+
+	print_msg(_("waiting for server to start..."));
+
+	if (!test_gtm_connection())
+	{
+		printf(_("could not start server\n"));
+		exit(1);
+	}
+
+	print_msg(_(" done\n"));
+	print_msg(_("server started\n"));
+}
+
+
+/*
+ * Implement the "stop" command.
+ *
+ * Sends the signal chosen by the shutdown mode to the PID found in the PID
+ * file.  When waiting is enabled, it polls once per second, up to
+ * wait_seconds times, until the PID file is gone.  Every failure terminates
+ * gtm_ctl with exit(1).
+ */
+static void
+do_stop(void)
+{
+	pgpid_t		pid = get_pgpid();
+	int			tries;
+
+	if (pid == 0)				/* no pid file */
+	{
+		write_stderr(_("%s: PID file \"%s\" does not exist\n"), progname, pid_file);
+		write_stderr(_("Is server running?\n"));
+		exit(1);
+	}
+
+	if (pid < 0)				/* standalone backend, not gtm */
+	{
+		pid = -pid;
+		write_stderr(_("%s: cannot stop server; "
+					   "single-user server is running (PID: %ld)\n"),
+					 progname, pid);
+		exit(1);
+	}
+
+	if (kill((pid_t) pid, sig) != 0)
+	{
+		write_stderr(_("%s: could not send stop signal (PID: %ld): %s\n"), progname, pid,
+					 strerror(errno));
+		exit(1);
+	}
+
+	if (!do_wait)
+	{
+		print_msg(_("server shutting down\n"));
+		return;
+	}
+
+	print_msg(_("waiting for server to shut down..."));
+
+	for (tries = 0; tries < wait_seconds; tries++)
+	{
+		pid = get_pgpid();
+		if (pid == 0)
+			break;
+
+		print_msg(".");
+		sleep(1);				/* 1 sec */
+	}
+
+	if (pid != 0)				/* pid file still exists */
+	{
+		print_msg(_(" failed\n"));
+
+		write_stderr(_("%s: server does not shut down\n"), progname);
+		exit(1);
+	}
+
+	print_msg(_(" done\n"));
+
+	printf(_("server stopped\n"));
+}
+
+
+/*
+ * restart/reload routines
+ */
+
+/*
+ * Implement the "restart" command.
+ *
+ * If the PID file names a live process, it is signalled and gtm_ctl always
+ * waits, up to wait_seconds polls, for the PID file to disappear.  The
+ * server is then started again through do_start().  A missing PID file or a
+ * dead process only produces a warning before the start is attempted.
+ */
+static void
+do_restart(void)
+{
+	pgpid_t		pid = get_pgpid();
+	int			tries;
+
+	if (pid == 0)				/* no pid file */
+	{
+		write_stderr(_("%s: PID file \"%s\" does not exist\n"),
+					 progname, pid_file);
+		write_stderr(_("Is server running?\n"));
+		write_stderr(_("starting server anyway\n"));
+		do_start();
+		return;
+	}
+
+	if (pid < 0)				/* standalone backend, not gtm */
+	{
+		pid = -pid;
+		if (gtm_is_alive((pid_t) pid))
+		{
+			write_stderr(_("%s: cannot restart server; "
+						   "single-user server is running (PID: %ld)\n"),
+						 progname, pid);
+			write_stderr(_("Please terminate the single-user server and try again.\n"));
+			exit(1);
+		}
+	}
+
+	if (!gtm_is_alive((pid_t) pid))
+	{
+		write_stderr(_("%s: old server process (PID: %ld) seems to be gone\n"),
+					 progname, pid);
+		write_stderr(_("starting server anyway\n"));
+	}
+	else
+	{
+		if (kill((pid_t) pid, sig) != 0)
+		{
+			write_stderr(_("%s: could not send stop signal (PID: %ld): %s\n"), progname, pid,
+						 strerror(errno));
+			exit(1);
+		}
+
+		print_msg(_("waiting for server to shut down..."));
+
+		/* always wait for restart */
+		for (tries = 0; tries < wait_seconds; tries++)
+		{
+			pid = get_pgpid();
+			if (pid == 0)
+				break;
+
+			print_msg(".");
+			sleep(1);			/* 1 sec */
+		}
+
+		if (pid != 0)			/* pid file still exists */
+		{
+			print_msg(_(" failed\n"));
+
+			write_stderr(_("%s: server does not shut down\n"), progname);
+			exit(1);
+		}
+
+		print_msg(_(" done\n"));
+		printf(_("server stopped\n"));
+	}
+
+	do_start();
+}
+
+
+/*
+ * utility routines
+ */
+
+/*
+ * Report whether "pid" looks like a running server process.
+ *
+ * Returns true only when kill(pid, 0) succeeds.  A failure with EPERM is
+ * deliberately treated as "not there": such a PID belongs to another userid
+ * and, given the permissions on $GTMDATA, cannot be the gtm we are after.
+ * Our own PID and (except on Windows, which has no getppid()) our parent's
+ * PID are never reported as the server.
+ */
+static bool
+gtm_is_alive(pid_t pid)
+{
+	bool		is_self = (pid == getpid());
+
+#ifndef WIN32
+	if (!is_self && pid == getppid())
+		is_self = true;
+#endif
+
+	if (is_self)
+		return false;
+
+	return (kill(pid, 0) == 0) ? true : false;
+}
+
+/*
+ * Print the one-line "try --help" hint on stderr; used after usage errors.
+ */
+static void
+do_advice(void)
+{
+	write_stderr(_("Try \"%s --help\" for more information.\n"),
+				 progname);
+}
+
+
+/*
+ * Print the usage summary and option descriptions on stdout.
+ *
+ * Every string goes through _() on its own, so each printf below is a
+ * separate message; keep them unmerged.
+ */
+static void
+do_help(void)
+{
+ printf(_("%s is a utility to start, stop or restart,\n"
+ "a GTM server or GTM proxy.\n\n"), progname);
+ /* one synopsis line per supported command */
+ printf(_("Usage:\n"));
+ printf(_(" %s start -S STARTUP_MODE [-w] [-t SECS] [-D DATADIR] [-s] [-l FILENAME] [-o \"OPTIONS\"]\n"), progname);
+ printf(_(" %s stop -S STARTUP_MODE [-W] [-t SECS] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n"), progname);
+ printf(_(" %s restart -S STARTUP_MODE [-w] [-t SECS] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n"
+ " [-o \"OPTIONS\"]\n"), progname);
+
+ printf(_("\nCommon options:\n"));
+ printf(_(" -D DATADIR location of the database storage area\n"));
+ printf(_(" -S set gtm or gtm_proxy to launch one of them\n"));
+ printf(_(" -s, only print errors, no informational messages\n"));
+ printf(_(" -t SECS seconds to wait when using -w option\n"));
+ printf(_(" -w wait until operation completes\n"));
+ printf(_(" -W do not wait until operation completes\n"));
+ printf(_(" --help show this help, then exit\n"));
+ printf(_("(The default is to wait for shutdown, but not for start or restart.)\n\n"));
+
+ printf(_("\nOptions for start or restart:\n"));
+ printf(_(" -S STARTUP-MODE can be \"gtm\" or \"gtm_proxy\"\n"));
+ printf(_(" -l FILENAME write (or append) server log to FILENAME\n"));
+ printf(_(" -o OPTIONS command line options to pass to gtm\n"
+ " (GTM server executable)\n"));
+ printf(_(" -p PATH-TO-GTM/PROXY path to gtm/gtm_proxy executables\n"));
+ printf(_("\nOptions for stop or restart:\n"));
+ printf(_(" -m SHUTDOWN-MODE can be \"smart\", \"fast\", or \"immediate\"\n"));
+
+ /* these names are the ones set_mode() accepts */
+ printf(_("\nShutdown modes are:\n"));
+ printf(_(" smart quit after all clients have disconnected\n"));
+ printf(_(" fast quit directly, with proper shutdown\n"));
+ printf(_(" immediate quit without complete shutdown; will lead to recovery on restart\n"));
+}
+
+
+/*
+ * Translate the -m argument into a shutdown mode and the signal to send.
+ *
+ * Accepts "s"/"smart", "f"/"fast" and "i"/"immediate".  Anything else is
+ * reported on stderr and terminates gtm_ctl with exit(1).
+ */
+static void
+set_mode(char *modeopt)
+{
+	static const struct
+	{
+		const char *letter;
+		const char *word;
+		ShutdownMode mode;
+		int			signo;
+	}			known_modes[] =
+	{
+		{"s", "smart", SMART_MODE, SIGTERM},
+		{"f", "fast", FAST_MODE, SIGINT},
+		{"i", "immediate", IMMEDIATE_MODE, SIGQUIT}
+	};
+	size_t		i;
+
+	for (i = 0; i < sizeof(known_modes) / sizeof(known_modes[0]); i++)
+	{
+		if (strcmp(modeopt, known_modes[i].letter) == 0 ||
+			strcmp(modeopt, known_modes[i].word) == 0)
+		{
+			shutdown_mode = known_modes[i].mode;
+			sig = known_modes[i].signo;
+			return;
+		}
+	}
+
+	write_stderr(_("%s: unrecognized shutdown mode \"%s\"\n"), progname, modeopt);
+	do_advice();
+	exit(1);
+}
+
+/*
+ * gtm_ctl entry point.
+ *
+ * Parses the options and the single action word ("start", "stop" or
+ * "restart"), which may appear before or after the options.  It then
+ * derives the PID and options file names from the data directory and the
+ * -S launch name, and runs the requested command.  Usage errors terminate
+ * with exit(1); a completed command ends with exit(0).
+ */
+int
+main(int argc, char **argv)
+{
+	int			c;
+
+	progname = "gtm_ctl";
+
+	/*
+	 * save argv[0] so do_start() can look for the gtm if necessary. we
+	 * don't look for gtm here because in many cases we won't need it.
+	 */
+	argv0 = argv[0];
+
+	umask(077);
+
+	/* support --help even if invoked as root */
+	if (argc > 1)
+	{
+		if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0 ||
+			strcmp(argv[1], "-?") == 0)
+		{
+			do_help();
+			exit(0);
+		}
+	}
+
+	/*
+	 * Disallow running as root, to forestall any possible security holes.
+	 */
+	if (geteuid() == 0)
+	{
+		write_stderr(_("%s: cannot be run as root\n"
+					   "Please log in (using, e.g., \"su\") as the "
+					   "(unprivileged) user that will\n"
+					   "own the server process.\n"),
+					 progname);
+		exit(1);
+	}
+
+	/*
+	 * 'Action' can be before or after args so loop over both.  Some getopt
+	 * implementations reorder argv[] to place all flags first, but we
+	 * don't rely on it.
+	 */
+	optind = 1;
+
+	/* process command-line options */
+	while (optind < argc)
+	{
+		/* "s" is listed so that the -s switch shown by do_help() works */
+		while ((c = getopt(argc, argv, "D:l:m:o:p:sS:t:wW")) != -1)
+		{
+			switch (c)
+			{
+				case 'D':
+					{
+						char	   *gtmdata_D;
+						char	   *env_var = pg_malloc(strlen(optarg) + 9);
+
+						gtmdata_D = xstrdup(optarg);
+						canonicalize_path(gtmdata_D);
+
+						/*
+						 * env_var is handed to putenv() and must stay
+						 * allocated.  NOTE(review): the sizes below assume
+						 * canonicalize_path() never lengthens its argument
+						 * -- confirm.
+						 */
+						snprintf(env_var, strlen(optarg) + 9, "GTMDATA=%s",
+								 gtmdata_D);
+						putenv(env_var);
+
+						/*
+						 * We could pass GTMDATA just in an environment
+						 * variable but we do -D too for clearer gtm 'ps'
+						 * display
+						 */
+						gtmdata_opt = pg_malloc(strlen(gtmdata_D) + 8);
+						snprintf(gtmdata_opt, strlen(gtmdata_D) + 8,
+								 "-D \"%s\" ",
+								 gtmdata_D);
+						break;
+					}
+				case 'l':
+					log_file = xstrdup(optarg);
+					break;
+				case 'm':
+					set_mode(optarg);
+					break;
+				case 'o':
+					gtm_opts = xstrdup(optarg);
+					break;
+				case 'p':
+					gtm_path = xstrdup(optarg);
+					canonicalize_path(gtm_path);
+					break;
+				case 's':
+					silent_mode = true;
+					break;
+				case 'S':
+					gtm_app = xstrdup(optarg);
+					if (strcmp(gtm_app, "gtm_proxy") != 0
+						&& strcmp(gtm_app, "gtm") != 0)
+					{
+						write_stderr(_("%s: %s launch name set not correct\n"), progname, gtm_app);
+						do_advice();
+						exit(1);
+					}
+					break;
+				case 't':
+					wait_seconds = atoi(optarg);
+					break;
+				case 'w':
+					do_wait = true;
+					wait_set = true;
+					break;
+				case 'W':
+					do_wait = false;
+					wait_set = true;
+					break;
+				default:
+					/* getopt already issued a suitable error message */
+					do_advice();
+					exit(1);
+			}
+		}
+
+		/* Process an action */
+		if (optind < argc)
+		{
+			if (ctl_command != NO_COMMAND)
+			{
+				write_stderr(_("%s: too many command-line arguments (first is \"%s\")\n"), progname, argv[optind]);
+				do_advice();
+				exit(1);
+			}
+
+			if (strcmp(argv[optind], "start") == 0)
+				ctl_command = START_COMMAND;
+			else if (strcmp(argv[optind], "stop") == 0)
+				ctl_command = STOP_COMMAND;
+			else if (strcmp(argv[optind], "restart") == 0)
+				ctl_command = RESTART_COMMAND;
+			else
+			{
+				write_stderr(_("%s: unrecognized operation mode \"%s\"\n"), progname, argv[optind]);
+				do_advice();
+				exit(1);
+			}
+			optind++;
+		}
+	}
+
+	if (ctl_command == NO_COMMAND)
+	{
+		write_stderr(_("%s: no operation specified\n"), progname);
+		do_advice();
+		exit(1);
+	}
+
+	/* -D exported GTMDATA above, so this covers both -D and the environment */
+	gtm_data = getenv("GTMDATA");
+
+	if (gtm_data)
+	{
+		gtm_data = xstrdup(gtm_data);
+		canonicalize_path(gtm_data);
+	}
+
+	if (!gtm_data)
+	{
+		write_stderr("%s: no database directory specified \n",
+					 progname);
+		do_advice();
+		exit(1);
+	}
+
+	/*
+	 * Without -D the data directory reaches the server only through the
+	 * environment; make sure the command line gets an empty string then,
+	 * not a NULL pointer.
+	 */
+	if (gtmdata_opt == NULL)
+		gtmdata_opt = "";
+
+	/*
+	 * pid files of gtm and gtm proxy are named differently, so -S has to be
+	 * given for every command, including stop; otherwise gtm_ctl cannot
+	 * find the correct pid_file
+	 */
+	if (!gtm_app)
+	{
+		write_stderr("%s: launcher name non specified, see option -S\n",
+					 progname);
+		do_advice();
+		exit(1);
+	}
+
+	/* neither -w nor -W given: wait for stop, not for start or restart */
+	if (!wait_set)
+	{
+		switch (ctl_command)
+		{
+			case RESTART_COMMAND:
+			case START_COMMAND:
+				do_wait = false;
+				break;
+			case STOP_COMMAND:
+				do_wait = true;
+				break;
+			default:
+				break;
+		}
+	}
+
+	if (gtm_data)
+	{
+		if (strcmp(gtm_app, "gtm_proxy") == 0)
+		{
+			snprintf(pid_file, MAXPGPATH, "%s/gtm_proxy.pid", gtm_data);
+			snprintf(gtmopts_file, MAXPGPATH, "%s/gtm_proxy.opts", gtm_data);
+		}
+		else if (strcmp(gtm_app, "gtm") == 0)
+		{
+			snprintf(pid_file, MAXPGPATH, "%s/gtm.pid", gtm_data);
+			snprintf(gtmopts_file, MAXPGPATH, "%s/gtm.opts", gtm_data);
+		}
+	}
+
+	switch (ctl_command)
+	{
+		case START_COMMAND:
+			do_start();
+			break;
+		case STOP_COMMAND:
+			do_stop();
+			break;
+		case RESTART_COMMAND:
+			do_restart();
+			break;
+		default:
+			break;
+	}
+
+	exit(0);
+}
diff --git a/src/gtm/libpq/Makefile b/src/gtm/libpq/Makefile
new file mode 100644
index 0000000000..9036ba8547
--- /dev/null
+++ b/src/gtm/libpq/Makefile
@@ -0,0 +1,22 @@
+# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+#
+# Builds the pqcomm communication library through the shared-library rules in
+# Makefile.shlib.
+
+top_build_dir=../..
+include $(top_build_dir)/gtm/Makefile.global
+
+NAME=pqcomm
+SO_MAJOR_VERSION= 1
+SO_MINOR_VERSION= 0
+
+OBJS=ip.o pqcomm.o pqformat.o strlcpy.o pqsignal.o
+
+# None of these targets names a file produced by its rule.
+.PHONY: all clean distclean maintainer-clean
+
+all:all-lib
+
+include $(top_build_dir)/Makefile.shlib
+
+# Names are derived from NAME and the version variables so that they keep
+# matching if the version is bumped (currently libpqcomm.so, .so.1, .so.1.0).
+clean:
+	rm -f $(OBJS)
+	rm -f lib$(NAME).so lib$(NAME).so.$(SO_MAJOR_VERSION) lib$(NAME).so.$(SO_MAJOR_VERSION).$(SO_MINOR_VERSION)
+
+distclean: clean
+
+maintainer-clean: distclean
diff --git a/src/gtm/libpq/ip.c b/src/gtm/libpq/ip.c
new file mode 100644
index 0000000000..561161410d
--- /dev/null
+++ b/src/gtm/libpq/ip.c
@@ -0,0 +1,324 @@
+/*-------------------------------------------------------------------------
+ *
+ * ip.c
+ * IPv6-aware network access.
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL: pgsql/src/backend/libpq/ip.c,v 1.43 2009/01/01 17:23:42 momjian Exp $
+ *
+ * This file and the IPV6 implementation were initially provided by
+ * Nigel Kukard <[email protected]>, Linux Based Systems Design
+ * http://www.lbsd.net.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/* This is intended to be used in both frontend and backend, so use c.h */
+#include "gtm/gtm_c.h"
+
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#ifdef HAVE_NETINET_TCP_H
+#include <netinet/tcp.h>
+#endif
+#include <arpa/inet.h>
+#include <sys/file.h>
+
+#include "gtm/ip.h"
+
+
+/*
+ * Per-address-family helpers behind pg_range_sockaddr(); each returns 1 when
+ * addr lies inside netaddr/netmask and 0 otherwise.
+ */
+static int range_sockaddr_AF_INET(const struct sockaddr_in * addr,
+ const struct sockaddr_in * netaddr,
+ const struct sockaddr_in * netmask);
+
+#ifdef HAVE_IPV6
+static int range_sockaddr_AF_INET6(const struct sockaddr_in6 * addr,
+ const struct sockaddr_in6 * netaddr,
+ const struct sockaddr_in6 * netmask);
+#endif
+
+
+/*
+ * pg_getaddrinfo_all - thin wrapper around getaddrinfo()
+ *
+ * *result is always reset to NULL before the lookup, and a NULL or empty
+ * hostname is passed to getaddrinfo() as NULL.  Returns getaddrinfo()'s
+ * result code unchanged.
+ */
+int
+pg_getaddrinfo_all(const char *hostname, const char *servname,
+				   const struct addrinfo * hintp, struct addrinfo ** result)
+{
+	const char *node = hostname;
+
+	/* not all versions of getaddrinfo() zero *result on failure */
+	*result = NULL;
+
+	/* NULL has special meaning to getaddrinfo(). */
+	if (node != NULL && node[0] == '\0')
+		node = NULL;
+
+	return getaddrinfo(node, servname, hintp, result);
+}
+
+
+/*
+ * pg_freeaddrinfo_all - release an addrinfo list obtained from
+ * pg_getaddrinfo_all()
+ *
+ * A NULL list is ignored.  hint_ai_family is accepted but not used by this
+ * implementation: every list is released with freeaddrinfo().
+ */
+void
+pg_freeaddrinfo_all(int hint_ai_family, struct addrinfo * ai)
+{
+	if (ai == NULL)
+		return;
+
+	/* struct was built by getaddrinfo() */
+	freeaddrinfo(ai);
+}
+
+
+/*
+ * pg_getnameinfo_all - getnameinfo() wrapper
+ *
+ * Two differences from plain getnameinfo(): addr is declared as
+ * sockaddr_storage instead of struct sockaddr, and when the lookup fails
+ * every non-NULL output buffer is filled with "???" so callers always have
+ * something to print.  Returns getnameinfo()'s result code.
+ */
+int
+pg_getnameinfo_all(const struct sockaddr_storage * addr, int salen,
+				   char *node, int nodelen,
+				   char *service, int servicelen,
+				   int flags)
+{
+	int			status;
+
+	status = getnameinfo((const struct sockaddr *) addr, salen,
+						 node, nodelen,
+						 service, servicelen,
+						 flags);
+	if (status == 0)
+		return status;
+
+	/* lookup failed: hand back placeholders */
+	if (node)
+		strlcpy(node, "???", nodelen);
+	if (service)
+		strlcpy(service, "???", servicelen);
+
+	return status;
+}
+
+/*
+ * pg_range_sockaddr - does addr fall inside the subnet netaddr/netmask?
+ *
+ * Dispatches on addr's address family and returns 1 for a match, 0 for no
+ * match or an unsupported family.
+ *
+ * Note: caller must already have verified that all three addresses are
+ * in the same address family; and AF_UNIX addresses are not supported.
+ */
+int
+pg_range_sockaddr(const struct sockaddr_storage * addr,
+				  const struct sockaddr_storage * netaddr,
+				  const struct sockaddr_storage * netmask)
+{
+	switch (addr->ss_family)
+	{
+		case AF_INET:
+			return range_sockaddr_AF_INET((struct sockaddr_in *) addr,
+										  (struct sockaddr_in *) netaddr,
+										  (struct sockaddr_in *) netmask);
+#ifdef HAVE_IPV6
+		case AF_INET6:
+			return range_sockaddr_AF_INET6((struct sockaddr_in6 *) addr,
+										   (struct sockaddr_in6 *) netaddr,
+										   (struct sockaddr_in6 *) netmask);
+#endif
+		default:
+			return 0;
+	}
+}
+
+/*
+ * IPv4 subnet test: returns 1 when addr and netaddr agree on every bit that
+ * is set in netmask, else 0.
+ */
+static int
+range_sockaddr_AF_INET(const struct sockaddr_in * addr,
+					   const struct sockaddr_in * netaddr,
+					   const struct sockaddr_in * netmask)
+{
+	/* XOR leaves a 1 wherever the two addresses differ */
+	return ((addr->sin_addr.s_addr ^ netaddr->sin_addr.s_addr) &
+			netmask->sin_addr.s_addr) == 0;
+}
+
+
+#ifdef HAVE_IPV6
+
+/*
+ * IPv6 subnet test: compares the 16 address bytes one by one and returns 0
+ * at the first byte where addr and netaddr differ in a bit set in netmask,
+ * else 1.
+ */
+static int
+range_sockaddr_AF_INET6(const struct sockaddr_in6 * addr,
+						const struct sockaddr_in6 * netaddr,
+						const struct sockaddr_in6 * netmask)
+{
+	int			idx = 0;
+
+	while (idx < 16)
+	{
+		unsigned char differing;
+
+		differing = addr->sin6_addr.s6_addr[idx] ^ netaddr->sin6_addr.s6_addr[idx];
+		if (differing & netmask->sin6_addr.s6_addr[idx])
+			return 0;
+		idx++;
+	}
+
+	return 1;
+}
+#endif /* HAVE_IPV6 */
+
+/*
+ * pg_sockaddr_cidr_mask - make a network mask of the appropriate family
+ * and required number of significant bits
+ *
+ * numbits is the prefix length as a decimal string; family is AF_INET or
+ * (when built with HAVE_IPV6) AF_INET6.
+ *
+ * The resulting mask is placed in *mask, which had better be big enough.
+ * Only the address family and the address bytes carry information; every
+ * other field of the generated sockaddr is zero.
+ *
+ * Return value is 0 if okay, -1 if numbits is empty, not a plain number or
+ * out of range for the family, or if the family is not supported.
+ */
+int
+pg_sockaddr_cidr_mask(struct sockaddr_storage * mask, char *numbits, int family)
+{
+	long		bits;
+	char	   *endptr;
+
+	bits = strtol(numbits, &endptr, 10);
+
+	if (*numbits == '\0' || *endptr != '\0')
+		return -1;
+
+	switch (family)
+	{
+		case AF_INET:
+			{
+				struct sockaddr_in mask4;
+				unsigned long maskl;
+
+				if (bits < 0 || bits > 32)
+					return -1;
+
+				/* start from all-zero so no stack garbage is copied out */
+				memset(&mask4, 0, sizeof(mask4));
+				/* avoid "x << 32", which is not portable */
+				if (bits > 0)
+					maskl = (0xffffffffUL << (32 - (int) bits))
+						& 0xffffffffUL;
+				else
+					maskl = 0;
+				mask4.sin_addr.s_addr = htonl(maskl);
+				memcpy(mask, &mask4, sizeof(mask4));
+				break;
+			}
+
+#ifdef HAVE_IPV6
+		case AF_INET6:
+			{
+				struct sockaddr_in6 mask6;
+				int			i;
+
+				if (bits < 0 || bits > 128)
+					return -1;
+
+				/* start from all-zero so no stack garbage is copied out */
+				memset(&mask6, 0, sizeof(mask6));
+				/* fill byte by byte; "bits" counts what is still to be set */
+				for (i = 0; i < 16; i++)
+				{
+					if (bits <= 0)
+						mask6.sin6_addr.s6_addr[i] = 0;
+					else if (bits >= 8)
+						mask6.sin6_addr.s6_addr[i] = 0xff;
+					else
+					{
+						mask6.sin6_addr.s6_addr[i] =
+							(0xff << (8 - (int) bits)) & 0xff;
+					}
+					bits -= 8;
+				}
+				memcpy(mask, &mask6, sizeof(mask6));
+				break;
+			}
+#endif
+		default:
+			return -1;
+	}
+
+	mask->ss_family = family;
+	return 0;
+}
+
+
+#ifdef HAVE_IPV6
+
+/*
+ * pg_promote_v4_to_v6_addr --- rewrite an AF_INET address as the
+ * corresponding IPv4-mapped AF_INET6 address
+ *
+ * The result has ten zero bytes, two 0xff bytes and then the four IPv4
+ * address bytes; all remaining sockaddr_in6 fields are zero.
+ *
+ * The passed addr is modified in place; be sure it is large enough to
+ * hold the result!  Note that we only worry about setting the fields
+ * that pg_range_sockaddr will look at.
+ */
+void
+pg_promote_v4_to_v6_addr(struct sockaddr_storage * addr)
+{
+	struct sockaddr_in v4;
+	struct sockaddr_in6 v6;
+	uint32		hostorder;
+	int			octet;
+
+	memcpy(&v4, addr, sizeof(v4));
+	hostorder = ntohl(v4.sin_addr.s_addr);
+
+	memset(&v6, 0, sizeof(v6));
+	v6.sin6_family = AF_INET6;
+
+	/* bytes 10 and 11 mark the address as IPv4-mapped */
+	v6.sin6_addr.s6_addr[10] = 0xff;
+	v6.sin6_addr.s6_addr[11] = 0xff;
+
+	/* bytes 12..15 hold the IPv4 address, most significant byte first */
+	for (octet = 0; octet < 4; octet++)
+		v6.sin6_addr.s6_addr[12 + octet] = (hostorder >> (24 - 8 * octet)) & 0xFF;
+
+	memcpy(addr, &v6, sizeof(v6));
+}
+
+/*
+ * pg_promote_v4_to_v6_mask --- rewrite an AF_INET netmask as the matching
+ * AF_INET6 netmask for IPv4-mapped addresses
+ *
+ * Unlike pg_promote_v4_to_v6_addr, the twelve leading bytes are all set to
+ * 0xff, because the high-order bits of a mask must be 1's, not 0's.  The
+ * last four bytes are the IPv4 mask bytes.
+ *
+ * The passed addr is modified in place; be sure it is large enough to
+ * hold the result!  Note that we only worry about setting the fields
+ * that pg_range_sockaddr will look at.
+ */
+void
+pg_promote_v4_to_v6_mask(struct sockaddr_storage * addr)
+{
+	struct sockaddr_in v4;
+	struct sockaddr_in6 v6;
+	uint32		hostorder;
+	int			i;
+
+	memcpy(&v4, addr, sizeof(v4));
+	hostorder = ntohl(v4.sin_addr.s_addr);
+
+	memset(&v6, 0, sizeof(v6));
+	v6.sin6_family = AF_INET6;
+
+	/* the whole IPv4-mapped prefix is significant */
+	memset(v6.sin6_addr.s6_addr, 0xff, 12);
+
+	/* bytes 12..15 hold the IPv4 mask, most significant byte first */
+	for (i = 0; i < 4; i++)
+		v6.sin6_addr.s6_addr[12 + i] = (hostorder >> (24 - 8 * i)) & 0xFF;
+
+	memcpy(addr, &v6, sizeof(v6));
+}
+
+#endif /* HAVE_IPV6 */
diff --git a/src/gtm/libpq/pqcomm.c b/src/gtm/libpq/pqcomm.c
new file mode 100644
index 0000000000..e697a7f4b1
--- /dev/null
+++ b/src/gtm/libpq/pqcomm.c
@@ -0,0 +1,1130 @@
+/*-------------------------------------------------------------------------
+ *
+ * pqcomm.c
+ * Communication functions between the Frontend and the Backend
+ *
+ * These routines handle the low-level details of communication between
+ * frontend and backend. They just shove data across the communication
+ * channel, and are ignorant of the semantics of the data --- or would be,
+ * except for major brain damage in the design of the old COPY OUT protocol.
+ * Unfortunately, COPY OUT was designed to commandeer the communication
+ * channel (it just transfers data without wrapping it into messages).
+ * No other messages can be sent while COPY OUT is in progress; and if the
+ * copy is aborted by an ereport(ERROR), we need to close out the copy so that
+ * the frontend gets back into sync. Therefore, these routines have to be
+ * aware of COPY OUT state. (New COPY-OUT is message-based and does *not*
+ * set the DoingCopyOut flag.)
+ *
+ * NOTE: generally, it's a bad idea to emit outgoing messages directly with
+ * pq_putbytes(), especially if the message would require multiple calls
+ * to send. Instead, use the routines in pqformat.c to construct the message
+ * in a buffer and then emit it in one call to pq_putmessage. This ensures
+ * that the channel will not be clogged by an incomplete message if execution
+ * is aborted by ereport(ERROR) partway through the message. The only
+ * non-libpq code that should call pq_putbytes directly is old-style COPY OUT.
+ *
+ * At one time, libpq was shared between frontend and backend, but now
+ * the backend's "backend/libpq" is quite separate from "interfaces/libpq".
+ * All that remains is similarities of names to trap the unwary...
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/backend/libpq/pqcomm.c,v 1.198 2008/01/01 19:45:49 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*------------------------
+ * INTERFACE ROUTINES
+ *
+ * All pq_* routines below operate on an explicit Port argument.
+ *
+ * setup/teardown:
+ * StreamServerPort - Open postmaster's server port
+ * StreamConnection - Create new connection with client
+ * StreamClose - Close a client/backend connection
+ * TouchSocketFile - Protect socket file against /tmp cleaners
+ * pq_init - initialize libpq at backend startup (not defined in this file)
+ * pq_comm_reset - reset libpq during error recovery (not defined in this file)
+ * pq_close - shutdown libpq at backend exit (not defined in this file)
+ *
+ * low-level I/O:
+ * pq_getbytes - get a known number of bytes from connection
+ * pq_getstring - get a null terminated string from connection
+ * pq_getmessage - get a message with length word from connection
+ * pq_getbyte - get next byte from connection
+ * pq_peekbyte - peek at next byte from connection
+ * pq_putbytes - send bytes to connection (not flushed until pq_flush)
+ * pq_flush - flush pending output
+ *
+ * message-level I/O (and old-style-COPY-OUT cruft):
+ * pq_putmessage - send a normal message (suppressed in COPY OUT mode)
+ * pq_startcopyout - inform libpq that a COPY OUT transfer is beginning
+ * (not defined in this file)
+ * pq_endcopyout - end a COPY OUT transfer (not defined in this file)
+ *
+ *------------------------
+ */
+
+#include <signal.h>
+#include <fcntl.h>
+#include <grp.h>
+#include <unistd.h>
+#include <sys/file.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#ifdef HAVE_NETINET_TCP_H
+#include <netinet/tcp.h>
+#endif
+#include <arpa/inet.h>
+#ifdef HAVE_UTIME_H
+#include <utime.h>
+#endif
+
+#include "gtm/gtm_c.h"
+#include "gtm/ip.h"
+#include "gtm/libpq.h"
+#include "gtm/libpq-be.h"
+#include "gtm/elog.h"
+
+#define MAXGTMPATH 256 /* size of the sock_path[] buffer below */
+
+/*
+ * Where the Unix socket file is. Nothing in this file writes to it, so it
+ * stays empty unless that changes; TouchSocketFile() does nothing while it
+ * is empty.
+ */
+static char sock_path[MAXGTMPATH];
+
+static int tcp_keepalives_idle; /* passed to pq_setkeepalivesidle() for each new connection */
+static int tcp_keepalives_interval; /* passed to pq_setkeepalivesinterval() likewise */
+static int tcp_keepalives_count; /* passed to pq_setkeepalivescount() likewise */
+
+
+/*
+ * Buffers for low-level I/O
+ *
+ * No buffers are declared here: the routines below use the per-connection
+ * PqRecvBuffer/PqSendBuffer members of the Port they are given.
+ */
+
+/* Internal functions (defined further down; both return 0 if OK, EOF if trouble) */
+static int internal_putbytes(Port *myport, const char *s, size_t len);
+static int internal_flush(Port *myport);
+
+/*
+ * Streams -- wrapper around Unix socket system calls
+ *
+ *
+ * Stream functions are used for vanilla TCP connection protocol.
+ */
+
+
+/*
+ * StreamServerPort -- open a "listening" port to accept connections.
+ *
+ * Successfully opened sockets are added to the ListenSocket[] array,
+ * at the first position that isn't -1.
+ *
+ * family, hostName and portNumber are handed to pg_getaddrinfo_all() (as
+ * the hint's ai_family, the host name, and the port printed as a decimal
+ * service string). For each address returned, a socket is created, bound
+ * and put into listen state; an address whose setup fails is logged and
+ * skipped.
+ *
+ * ListenSocket[] has MaxListen entries; an entry equal to -1 is free.
+ *
+ * RETURNS: STATUS_OK if at least one socket was added, STATUS_ERROR if the
+ * address lookup failed or no socket could be added.
+ */
+
+int
+StreamServerPort(int family, char *hostName, unsigned short portNumber,
+ int ListenSocket[], int MaxListen)
+{
+ int fd,
+ err;
+ int maxconn;
+ int ret;
+ char portNumberStr[32];
+ const char *familyDesc;
+ char familyDescBuf[64];
+ char *service;
+ struct addrinfo *addrs = NULL,
+ *addr;
+ struct addrinfo hint;
+ int listen_index = 0; /* never reset: each slot search resumes where the last stopped */
+ int added = 0; /* number of sockets stored into ListenSocket[] */
+
+#if !defined(WIN32) || defined(IPV6_V6ONLY)
+ int one = 1;
+#endif
+
+ /* Initialize hint structure */
+ MemSet(&hint, 0, sizeof(hint));
+ hint.ai_family = family;
+ hint.ai_flags = AI_PASSIVE;
+ hint.ai_socktype = SOCK_STREAM;
+
+ {
+ snprintf(portNumberStr, sizeof(portNumberStr), "%d", portNumber);
+ service = portNumberStr;
+ }
+
+ ret = pg_getaddrinfo_all(hostName, service, &hint, &addrs);
+ if (ret || !addrs)
+ {
+ if (hostName)
+ ereport(LOG,
+ (errmsg("could not translate host name \"%s\", service \"%s\" to address: %s",
+ hostName, service, gai_strerror(ret))));
+ else
+ ereport(LOG,
+ (errmsg("could not translate service \"%s\" to address: %s",
+ service, gai_strerror(ret))));
+ if (addrs)
+ pg_freeaddrinfo_all(hint.ai_family, addrs);
+ return STATUS_ERROR;
+ }
+
+ for (addr = addrs; addr; addr = addr->ai_next)
+ {
+ if (!IS_AF_UNIX(family) && IS_AF_UNIX(addr->ai_family))
+ {
+ /*
+ * Only set up a unix domain socket when they really asked for it.
+ * The service/port is different in that case.
+ */
+ continue;
+ }
+
+ /* See if there is still room to add 1 more socket. */
+ for (; listen_index < MaxListen; listen_index++)
+ {
+ if (ListenSocket[listen_index] == -1)
+ break;
+ }
+ if (listen_index >= MaxListen)
+ {
+ ereport(LOG,
+ (errmsg("could not bind to all requested addresses: MAXLISTEN (%d) exceeded",
+ MaxListen)));
+ break; /* no free slot: give up on the remaining addresses */
+ }
+
+ /* set up family name for possible error messages */
+ switch (addr->ai_family)
+ {
+ case AF_INET:
+ familyDesc = "IPv4";
+ break;
+#ifdef HAVE_IPV6
+ case AF_INET6:
+ familyDesc = "IPv6";
+ break;
+#endif
+ default:
+ snprintf(familyDescBuf, sizeof(familyDescBuf),
+ "unrecognized address family %d",
+ addr->ai_family);
+ familyDesc = familyDescBuf;
+ break;
+ }
+
+ if ((fd = socket(addr->ai_family, SOCK_STREAM, 0)) < 0)
+ {
+ ereport(LOG,
+ (EACCES,
+ /* translator: %s is IPv4, IPv6, or Unix */
+ errmsg("could not create %s socket: %m",
+ familyDesc)));
+ continue;
+ }
+
+#ifndef WIN32
+
+ /*
+ * Without the SO_REUSEADDR flag, a new postmaster can't be started
+ * right away after a stop or crash, giving "address already in use"
+ * error on TCP ports.
+ *
+ * On win32, however, this behavior only happens if the
+ * SO_EXCLUSIVEADDRUSE is set. With SO_REUSEADDR, win32 allows multiple
+ * servers to listen on the same address, resulting in unpredictable
+ * behavior. With no flags at all, win32 behaves as Unix with
+ * SO_REUSEADDR.
+ */
+ if (!IS_AF_UNIX(addr->ai_family))
+ {
+ if ((setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
+ (char *) &one, sizeof(one))) == -1)
+ {
+ ereport(LOG,
+ (EACCES,
+ errmsg("setsockopt(SO_REUSEADDR) failed: %m")));
+ close(fd);
+ continue;
+ }
+ }
+#endif
+
+#ifdef IPV6_V6ONLY
+ if (addr->ai_family == AF_INET6)
+ {
+ if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY,
+ (char *) &one, sizeof(one)) == -1)
+ {
+ ereport(LOG,
+ (EACCES,
+ errmsg("setsockopt(IPV6_V6ONLY) failed: %m")));
+ close(fd);
+ continue;
+ }
+ }
+#endif
+
+ /*
+ * Note: This might fail on some OS's, like Linux older than
+ * 2.4.21-pre3, that don't have the IPV6_V6ONLY socket option, and map
+ * ipv4 addresses to ipv6. It will show ::ffff:ipv4 for all ipv4
+ * connections.
+ */
+ err = bind(fd, addr->ai_addr, addr->ai_addrlen);
+ if (err < 0)
+ {
+ ereport(LOG,
+ (EACCES,
+ /* translator: %s is IPv4, IPv6, or Unix */
+ errmsg("could not bind %s socket: %m",
+ familyDesc),
+ (IS_AF_UNIX(addr->ai_family)) ?
+ errhint("Is another postmaster already running on port %d?"
+ " If not, remove socket file \"%s\" and retry.",
+ (int) portNumber, sock_path) :
+ errhint("Is another postmaster already running on port %d?"
+ " If not, wait a few seconds and retry.",
+ (int) portNumber)));
+ close(fd);
+ continue;
+ }
+
+#define GTM_MAX_CONNECTIONS 1024
+
+ /*
+ * Accept-queue length requested from listen(): twice
+ * GTM_MAX_CONNECTIONS. NOTE(review): no PG_SOMAXCONN-style clamp is
+ * applied to this value here.
+ */
+ maxconn = GTM_MAX_CONNECTIONS * 2;
+
+ err = listen(fd, maxconn);
+ if (err < 0)
+ {
+ ereport(LOG,
+ (EACCES,
+ /* translator: %s is IPv4, IPv6, or Unix */
+ errmsg("could not listen on %s socket: %m",
+ familyDesc)));
+ close(fd);
+ continue;
+ }
+ ListenSocket[listen_index] = fd; /* claim the free slot found above */
+ added++;
+ }
+
+ pg_freeaddrinfo_all(hint.ai_family, addrs);
+
+ if (!added)
+ return STATUS_ERROR;
+
+ return STATUS_OK;
+}
+
+
+/*
+ * StreamConnection -- create a new connection with client using
+ * server port. Set port->sock to the FD of the new connection.
+ *
+ * server_fd is the listening socket to accept() on. Besides port->sock,
+ * port->raddr and port->laddr are filled in with the remote and local
+ * addresses of the new connection.
+ *
+ * ASSUME: that this doesn't need to be non-blocking because
+ * the Postmaster uses select() to tell when the server master
+ * socket is ready for accept().
+ *
+ * NOTE(review): on the error returns that follow a successful accept(),
+ * port->sock is left open; presumably the caller closes it -- confirm.
+ *
+ * RETURNS: STATUS_OK or STATUS_ERROR
+ */
+int
+StreamConnection(int server_fd, Port *port)
+{
+ /* accept connection and fill in the client (remote) address */
+ port->raddr.salen = sizeof(port->raddr.addr);
+ if ((port->sock = accept(server_fd,
+ (struct sockaddr *) & port->raddr.addr,
+ &port->raddr.salen)) < 0)
+ {
+ ereport(LOG,
+ (EACCES,
+ errmsg("could not accept new connection: %m")));
+
+ /*
+ * If accept() fails then postmaster.c will still see the server
+ * socket as read-ready, and will immediately try again. To avoid
+ * uselessly sucking lots of CPU, delay a bit before trying again.
+ * (The most likely reason for failure is being out of kernel file
+ * table slots; we can do little except hope some will get freed up.)
+ *
+ * Note that the delay itself is commented out below, so no delay
+ * actually happens here.
+ */
+ /* pg_usleep(100000L); */ /* wait 0.1 sec */
+ return STATUS_ERROR;
+ }
+
+#ifdef SCO_ACCEPT_BUG
+
+ /*
+ * UnixWare 7+ and OpenServer 5.0.4 are known to have this bug, but it
+ * shouldn't hurt to catch it for all versions of those platforms.
+ */
+ if (port->raddr.addr.ss_family == 0)
+ port->raddr.addr.ss_family = AF_UNIX;
+#endif
+
+ /* fill in the server (local) address */
+ port->laddr.salen = sizeof(port->laddr.addr);
+ if (getsockname(port->sock,
+ (struct sockaddr *) & port->laddr.addr,
+ &port->laddr.salen) < 0)
+ {
+ elog(LOG, "getsockname() failed: %m");
+ return STATUS_ERROR;
+ }
+
+ /* select NODELAY and KEEPALIVE options if it's a TCP connection */
+ if (!IS_AF_UNIX(port->laddr.addr.ss_family))
+ {
+ int on;
+
+#ifdef TCP_NODELAY
+ on = 1;
+ if (setsockopt(port->sock, IPPROTO_TCP, TCP_NODELAY,
+ (char *) &on, sizeof(on)) < 0)
+ {
+ elog(LOG, "setsockopt(TCP_NODELAY) failed: %m");
+ return STATUS_ERROR;
+ }
+#endif
+ on = 1;
+ if (setsockopt(port->sock, SOL_SOCKET, SO_KEEPALIVE,
+ (char *) &on, sizeof(on)) < 0)
+ {
+ elog(LOG, "setsockopt(SO_KEEPALIVE) failed: %m");
+ return STATUS_ERROR;
+ }
+
+ /*
+ * Also apply the current keepalive parameters. If we fail to set a
+ * parameter, don't error out, because these aren't universally
+ * supported. (Note: you might think we need to reset the GUC
+ * variables to 0 in such a case, but it's not necessary because the
+ * show hooks for these variables report the truth anyway.)
+ */
+ (void) pq_setkeepalivesidle(tcp_keepalives_idle, port);
+ (void) pq_setkeepalivesinterval(tcp_keepalives_interval, port);
+ (void) pq_setkeepalivescount(tcp_keepalives_count, port);
+ }
+
+ return STATUS_OK;
+}
+
+/*
+ * StreamClose -- close a client/backend connection
+ *
+ * sock is the descriptor to close; the result of close() is not checked.
+ *
+ * NOTE: this is NOT used to terminate a session; it is just used to release
+ * the file descriptor in a process that should no longer have the socket
+ * open. (For example, the postmaster calls this after passing ownership
+ * of the connection to a child process.) It is expected that someone else
+ * still has the socket open. So, we only want to close the descriptor,
+ * we do NOT want to send anything to the far end.
+ */
+void
+StreamClose(int sock)
+{
+ close(sock);
+}
+
+/*
+ * TouchSocketFile -- refresh the timestamps of the socket file
+ *
+ * Ordinary socket traffic usually leaves the file's modification date
+ * alone, so /tmp-cleaner daemons may decide the file is stale and remove
+ * it.  Calling this routine periodically keeps the date recent.
+ *
+ * utime() is the POSIX interface and utimes() a common alternative; when
+ * the platform offers neither, nothing can be done and the routine is a
+ * no-op.  Errors from either call are deliberately ignored.
+ */
+void
+TouchSocketFile(void)
+{
+    /* No socket file was recorded, so there is nothing to touch */
+    if (sock_path[0] == '\0')
+        return;
+
+#if defined(HAVE_UTIME)
+    utime(sock_path, NULL);
+#elif defined(HAVE_UTIMES)
+    utimes(sock_path, NULL);
+#endif
+}
+
+
+/* --------------------------------
+ * Low-level I/O routines begin here.
+ *
+ * These routines communicate with a frontend client across a connection
+ * already established by the preceding routines.
+ * --------------------------------
+ */
+
+
+/* --------------------------------
+ * pq_recvbuf - load some bytes into the input buffer
+ *
+ * Any unread bytes are first moved to the start of myport->PqRecvBuffer;
+ * then one successful recv() on myport->sock appends new data after them.
+ * The recv() is retried only when it was interrupted (EINTR).
+ *
+ * returns 0 if OK, EOF if trouble
+ * --------------------------------
+ */
+static int
+pq_recvbuf(Port *myport)
+{
+ if (myport->PqRecvPointer > 0)
+ {
+ if (myport->PqRecvLength > myport->PqRecvPointer)
+ {
+ /* still some unread data, left-justify it in the buffer */
+ memmove(myport->PqRecvBuffer, myport->PqRecvBuffer + myport->PqRecvPointer,
+ myport->PqRecvLength - myport->PqRecvPointer);
+ myport->PqRecvLength -= myport->PqRecvPointer;
+ myport->PqRecvPointer = 0;
+ }
+ else
+ myport->PqRecvLength = myport->PqRecvPointer = 0; /* everything consumed: restart at the front */
+ }
+
+ /* Can fill buffer from myport->PqRecvLength and upwards */
+ for (;;)
+ {
+ int r;
+
+ r = recv(myport->sock, myport->PqRecvBuffer + myport->PqRecvLength,
+ PQ_BUFFER_SIZE - myport->PqRecvLength, 0);
+
+ if (r < 0)
+ {
+ if (errno == EINTR)
+ continue; /* Ok if interrupted */
+
+ /*
+ * Careful: an ereport() that tries to write to the client would
+ * cause recursion to here, leading to stack overflow and core
+ * dump! This message must go *only* to the postmaster log.
+ */
+ ereport(COMMERROR,
+ (EACCES,
+ errmsg("could not receive data from client: %m")));
+ return EOF;
+ }
+ if (r == 0)
+ {
+ /*
+ * EOF detected. We used to write a log message here, but it's
+ * better to expect the ultimate caller to do that.
+ */
+ return EOF;
+ }
+ /* r contains number of bytes read, so just incr length */
+ myport->PqRecvLength += r;
+ return 0;
+ }
+}
+
+/* --------------------------------
+ * pq_getbyte - consume and return the next byte from the connection
+ *
+ * The byte is returned as an unsigned char value; EOF is returned when
+ * pq_recvbuf() cannot supply more data.
+ * --------------------------------
+ */
+int
+pq_getbyte(Port *myport)
+{
+    unsigned char next;
+
+    /* Refill until at least one unread byte is buffered */
+    while (myport->PqRecvLength <= myport->PqRecvPointer)
+    {
+        if (pq_recvbuf(myport) != 0)
+            return EOF;
+    }
+
+    next = (unsigned char) myport->PqRecvBuffer[myport->PqRecvPointer];
+    myport->PqRecvPointer++;
+    return next;
+}
+
+/* --------------------------------
+ * pq_peekbyte - look at the next byte from the connection
+ *
+ * Identical to pq_getbyte() except that the read pointer is left where it
+ * is, so the same byte will be returned again by the next read.
+ * Returns EOF when pq_recvbuf() cannot supply more data.
+ * --------------------------------
+ */
+int
+pq_peekbyte(Port *myport)
+{
+    unsigned char next;
+
+    /* Refill until at least one unread byte is buffered */
+    while (myport->PqRecvLength <= myport->PqRecvPointer)
+    {
+        if (pq_recvbuf(myport) != 0)
+            return EOF;
+    }
+
+    next = (unsigned char) myport->PqRecvBuffer[myport->PqRecvPointer];
+    return next;
+}
+
+/* --------------------------------
+ * pq_getbytes - read exactly len bytes from the connection into s
+ *
+ * Data is copied out of the receive buffer in as many pieces as needed,
+ * refilling the buffer through pq_recvbuf() whenever it runs dry.
+ *
+ * returns 0 if OK, EOF if trouble
+ * --------------------------------
+ */
+int
+pq_getbytes(Port *myport, char *s, size_t len)
+{
+    while (len > 0)
+    {
+        size_t chunk;
+
+        /* Make sure there is something unread in the buffer */
+        while (myport->PqRecvLength <= myport->PqRecvPointer)
+        {
+            if (pq_recvbuf(myport) != 0)
+                return EOF;
+        }
+
+        /* Take what is buffered, but no more than is still wanted */
+        chunk = myport->PqRecvLength - myport->PqRecvPointer;
+        if (chunk > len)
+            chunk = len;
+
+        memcpy(s, myport->PqRecvBuffer + myport->PqRecvPointer, chunk);
+        myport->PqRecvPointer += chunk;
+        s += chunk;
+        len -= chunk;
+    }
+
+    return 0;
+}
+
+/* --------------------------------
+ * pq_discardbytes - skip over len bytes of incoming data
+ *
+ * Works like pq_getbytes() but the data is not copied anywhere; meant for
+ * resynchronizing after read errors.
+ *
+ * returns 0 if OK, EOF if trouble
+ * --------------------------------
+ */
+static int
+pq_discardbytes(Port *myport, size_t len)
+{
+    while (len > 0)
+    {
+        size_t chunk;
+
+        /* Make sure there is something unread in the buffer */
+        while (myport->PqRecvLength <= myport->PqRecvPointer)
+        {
+            if (pq_recvbuf(myport) != 0)
+                return EOF;
+        }
+
+        /* Drop what is buffered, but no more than is still to be skipped */
+        chunk = myport->PqRecvLength - myport->PqRecvPointer;
+        if (chunk > len)
+            chunk = len;
+
+        myport->PqRecvPointer += chunk;
+        len -= chunk;
+    }
+
+    return 0;
+}
+
+/* --------------------------------
+ * pq_getstring - get a null terminated string from connection
+ *
+ * The return value is placed in an expansible StringInfo, which has
+ * already been initialized by the caller.
+ *
+ * This is used only for dealing with old-protocol clients. The idea
+ * is to produce a StringInfo that looks the same as we would get from
+ * pq_getmessage() with a newer client; we will then process it with
+ * pq_getmsgstring. Therefore, no character set conversion is done here,
+ * even though this is presumably useful only for text.
+ *
+ * s is reset first; the bytes appended to it include the terminating '\0'.
+ * If EOF is hit before the terminator, s holds whatever was read so far.
+ *
+ * returns 0 if OK, EOF if trouble
+ * --------------------------------
+ */
+int
+pq_getstring(Port *myport, StringInfo s)
+{
+ int i;
+
+ resetStringInfo(s);
+
+ /* Read until we get the terminating '\0' */
+ for (;;)
+ {
+ while (myport->PqRecvPointer >= myport->PqRecvLength)
+ {
+ if (pq_recvbuf(myport)) /* If nothing in buffer, then recv some */
+ return EOF; /* Failed to recv data */
+ }
+
+ for (i = myport->PqRecvPointer; i < myport->PqRecvLength; i++) /* scan the unread bytes */
+ {
+ if (myport->PqRecvBuffer[i] == '\0')
+ {
+ /* include the '\0' in the copy */
+ appendBinaryStringInfo(s, myport->PqRecvBuffer + myport->PqRecvPointer,
+ i - myport->PqRecvPointer + 1);
+ myport->PqRecvPointer = i + 1; /* advance past \0 */
+ return 0;
+ }
+ }
+
+ /* If we're here we haven't got the \0 in the buffer yet. */
+ appendBinaryStringInfo(s, myport->PqRecvBuffer + myport->PqRecvPointer,
+ myport->PqRecvLength - myport->PqRecvPointer);
+ myport->PqRecvPointer = myport->PqRecvLength; /* buffer fully consumed; outer loop refills it */
+ }
+}
+
+
+/* --------------------------------
+ * pq_getmessage - get a message with length word from connection
+ *
+ * The return value is placed in an expansible StringInfo, which has
+ * already been initialized by the caller.
+ * Only the message body is placed in the StringInfo; the length word
+ * is removed. Also, s->cursor is initialized to zero for convenience
+ * in scanning the message contents.
+ *
+ * The length word is 4 bytes in network byte order and counts itself,
+ * so any value below 4 is rejected as invalid.
+ *
+ * If maxlen is not zero, it is an upper limit on the length of the
+ * message we are willing to accept. We abort the connection (by
+ * returning EOF) if client tries to send more than that.
+ *
+ * returns 0 if OK, EOF if trouble
+ * --------------------------------
+ */
+int
+pq_getmessage(Port *myport, StringInfo s, int maxlen)
+{
+ int32 len;
+
+ resetStringInfo(s);
+
+ /* Read message length word */
+ if (pq_getbytes(myport, (char *) &len, 4) == EOF)
+ {
+ ereport(COMMERROR,
+ (EPROTO,
+ errmsg("unexpected EOF within message length word")));
+ return EOF;
+ }
+
+ len = ntohl(len);
+
+ if (len < 4 ||
+ (maxlen > 0 && len > maxlen)) /* maxlen is compared against the length including the word itself */
+ {
+ ereport(COMMERROR,
+ (EPROTO,
+ errmsg("invalid message length")));
+ return EOF;
+ }
+
+ len -= 4; /* discount length itself */
+
+ if (len > 0)
+ {
+ /*
+ * Allocate space for message. If we run out of room (ridiculously
+ * large message), we will elog(ERROR), but we want to discard the
+ * message body so as not to lose communication sync.
+ *
+ * NOTE(review): nothing in this function discards the body when
+ * enlargeStringInfo() fails -- confirm how sync is kept in that case.
+ */
+ enlargeStringInfo(s, len);
+
+ /* And grab the message */
+ if (pq_getbytes(myport, s->data, len) == EOF)
+ {
+ ereport(COMMERROR,
+ (EPROTO,
+ errmsg("incomplete message from client")));
+ return EOF;
+ }
+ s->len = len;
+ /* Place a trailing null per StringInfo convention */
+ s->data[len] = '\0';
+ }
+
+ return 0;
+}
+
+
+/* --------------------------------
+ * pq_putbytes - queue len bytes from s for sending on the connection
+ *
+ * Public entry point for internal_putbytes().  The data is appended to
+ * the send buffer; it is transmitted when the buffer fills up or when
+ * pq_flush() is called.
+ *
+ * returns 0 if OK, EOF if trouble
+ * --------------------------------
+ */
+int
+pq_putbytes(Port *myport, const char *s, size_t len)
+{
+    return internal_putbytes(myport, s, len);
+}
+
+/*
+ * internal_putbytes - append len bytes from s to the send buffer
+ *
+ * Whenever the send buffer is full it is written out with
+ * internal_flush() before more data is copied in.
+ *
+ * returns 0 if OK, EOF if a flush failed
+ */
+static int
+internal_putbytes(Port *myport, const char *s, size_t len)
+{
+    while (len > 0)
+    {
+        size_t room;
+
+        /* Buffer full?  Then push it out first. */
+        if (myport->PqSendPointer >= PQ_BUFFER_SIZE &&
+            internal_flush(myport))
+            return EOF;
+
+        /* Copy as much as fits, but no more than is left to send */
+        room = PQ_BUFFER_SIZE - myport->PqSendPointer;
+        if (room > len)
+            room = len;
+
+        memcpy(myport->PqSendBuffer + myport->PqSendPointer, s, room);
+        myport->PqSendPointer += room;
+        s += room;
+        len -= room;
+    }
+
+    return 0;
+}
+
+/* --------------------------------
+ * pq_flush - write out everything pending in the send buffer
+ *
+ * Public entry point for internal_flush().  There is no reentrancy
+ * guard here: the call always goes straight through.
+ *
+ * returns 0 if OK, EOF if trouble
+ * --------------------------------
+ */
+int
+pq_flush(Port *myport)
+{
+    return internal_flush(myport);
+}
+
+static int
+internal_flush(Port *myport)
+{
+ static int last_reported_send_errno = 0; /* function-static, so one value is shared by every Port */
+
+ char *bufptr = myport->PqSendBuffer;
+ char *bufend = myport->PqSendBuffer + myport->PqSendPointer;
+
+ while (bufptr < bufend) /* loop because send() may accept only part of the data */
+ {
+ int r;
+
+ r = send(myport->sock, bufptr, bufend - bufptr, 0);
+
+ if (r <= 0) /* a zero return is treated like an error */
+ {
+ if (errno == EINTR)
+ continue; /* Ok if we were interrupted */
+
+ /*
+ * Careful: an ereport() that tries to write to the client would
+ * cause recursion to here, leading to stack overflow and core
+ * dump! This message must go *only* to the postmaster log.
+ *
+ * If a client disconnects while we're in the midst of output, we
+ * might write quite a bit of data before we get to a safe query
+ * abort point. So, suppress duplicate log messages.
+ */
+ if (errno != last_reported_send_errno)
+ {
+ last_reported_send_errno = errno;
+ ereport(COMMERROR,
+ (EACCES,
+ errmsg("could not send data to client: %m")));
+ }
+
+ /*
+ * We drop the buffered data anyway so that processing can
+ * continue, even though we'll probably quit soon.
+ */
+ myport->PqSendPointer = 0;
+ return EOF;
+ }
+
+ last_reported_send_errno = 0; /* reset after any successful send */
+ bufptr += r;
+ }
+
+ myport->PqSendPointer = 0; /* everything was sent: buffer is empty again */
+ return 0;
+}
+
+
+/* --------------------------------
+ * Message-level I/O routines begin here.
+ *
+ * These routines understand about the old-style COPY OUT protocol.
+ * --------------------------------
+ */
+
+
+/* --------------------------------
+ * pq_putmessage - queue one complete message in the send buffer
+ *
+ * If msgtype is not '\0', it is written first as a one-byte message
+ * type code; if it is '\0', no type byte is written.
+ *
+ * Next comes a four-byte length word in network byte order whose value
+ * is len + 4 (the word counts itself), followed by the len bytes of
+ * message body found at *s.
+ *
+ * Everything goes through internal_putbytes(), so the message is only
+ * buffered here; it is not checked against any COPY OUT or "busy"
+ * state, and it is not flushed.
+ *
+ * returns 0 if OK, EOF if trouble
+ * --------------------------------
+ */
+int
+pq_putmessage(Port *myport, char msgtype, const char *s, size_t len)
+{
+    uint32 netlen;
+
+    /* Optional type byte */
+    if (msgtype != '\0' && internal_putbytes(myport, &msgtype, 1) != 0)
+        return EOF;
+
+    /* Length word, counting itself */
+    netlen = htonl((uint32) (len + 4));
+    if (internal_putbytes(myport, (char *) &netlen, 4) != 0)
+        return EOF;
+
+    /* Message body */
+    if (internal_putbytes(myport, s, len) != 0)
+        return EOF;
+
+    return 0;
+}
+
+
+/*
+ * Support for TCP Keepalive parameters
+ */
+
+/*
+ * pq_getkeepalivesidle - report the keepalive idle time of a connection
+ *
+ * Returns 0 when TCP_KEEPIDLE is not available, when port is NULL or when
+ * the connection is a Unix-domain socket.  Otherwise returns the value
+ * last set through pq_setkeepalivesidle() if there is one, else the
+ * socket's default, which is fetched with getsockopt() on first use and
+ * cached in the Port (-1 if it could not be fetched).
+ */
+int
+pq_getkeepalivesidle(Port *port)
+{
+#ifndef TCP_KEEPIDLE
+    return 0;
+#else
+    ACCEPT_TYPE_ARG3 optlen;
+
+    if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family))
+        return 0;
+
+    /* An explicitly set value wins */
+    if (port->keepalives_idle != 0)
+        return port->keepalives_idle;
+
+    /* Default already looked up (or known to be unavailable) */
+    if (port->default_keepalives_idle != 0)
+        return port->default_keepalives_idle;
+
+    optlen = sizeof(port->default_keepalives_idle);
+    if (getsockopt(port->sock, IPPROTO_TCP, TCP_KEEPIDLE,
+                   (char *) &port->default_keepalives_idle,
+                   &optlen) < 0)
+    {
+        elog(LOG, "getsockopt(TCP_KEEPIDLE) failed: %m");
+        port->default_keepalives_idle = -1; /* don't know */
+    }
+
+    return port->default_keepalives_idle;
+#endif
+}
+
+/*
+ * pq_setkeepalivesidle - set the keepalive idle time of a connection
+ *
+ * idle == 0 means "use the socket's default".  Nothing is done (and
+ * STATUS_OK is returned) for a NULL port, for a Unix-domain socket, or
+ * when the requested value is already in effect.
+ *
+ * Returns STATUS_OK or STATUS_ERROR.
+ */
+int
+pq_setkeepalivesidle(int idle, Port *port)
+{
+    if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family))
+        return STATUS_OK;
+
+#ifndef TCP_KEEPIDLE
+    /* Without platform support only the "default" request can succeed */
+    if (idle == 0)
+        return STATUS_OK;
+
+    elog(LOG, "setsockopt(TCP_KEEPIDLE) not supported");
+    return STATUS_ERROR;
+#else
+    if (port->keepalives_idle == idle)
+        return STATUS_OK;
+
+    /* Need the default; if it cannot be found, only idle == 0 is acceptable */
+    if (port->default_keepalives_idle <= 0 &&
+        pq_getkeepalivesidle(port) < 0)
+        return (idle == 0) ? STATUS_OK : STATUS_ERROR; /* default is set but unknown */
+
+    if (idle == 0)
+        idle = port->default_keepalives_idle;
+
+    if (setsockopt(port->sock, IPPROTO_TCP, TCP_KEEPIDLE,
+                   (char *) &idle, sizeof(idle)) < 0)
+    {
+        elog(LOG, "setsockopt(TCP_KEEPIDLE) failed: %m");
+        return STATUS_ERROR;
+    }
+
+    port->keepalives_idle = idle;
+    return STATUS_OK;
+#endif
+}
+
+/*
+ * pq_getkeepalivesinterval - report the keepalive probe interval
+ *
+ * Returns 0 when TCP_KEEPINTVL is not available, when port is NULL or
+ * when the connection is a Unix-domain socket.  Otherwise returns the
+ * value last set through pq_setkeepalivesinterval() if there is one, else
+ * the socket's default, which is fetched with getsockopt() on first use
+ * and cached in the Port (-1 if it could not be fetched).
+ */
+int
+pq_getkeepalivesinterval(Port *port)
+{
+#ifndef TCP_KEEPINTVL
+    return 0;
+#else
+    ACCEPT_TYPE_ARG3 optlen;
+
+    if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family))
+        return 0;
+
+    /* An explicitly set value wins */
+    if (port->keepalives_interval != 0)
+        return port->keepalives_interval;
+
+    /* Default already looked up (or known to be unavailable) */
+    if (port->default_keepalives_interval != 0)
+        return port->default_keepalives_interval;
+
+    optlen = sizeof(port->default_keepalives_interval);
+    if (getsockopt(port->sock, IPPROTO_TCP, TCP_KEEPINTVL,
+                   (char *) &port->default_keepalives_interval,
+                   &optlen) < 0)
+    {
+        elog(LOG, "getsockopt(TCP_KEEPINTVL) failed: %m");
+        port->default_keepalives_interval = -1; /* don't know */
+    }
+
+    return port->default_keepalives_interval;
+#endif
+}
+
+/*
+ * pq_setkeepalivesinterval - set the keepalive probe interval
+ *
+ * interval == 0 means "use the socket's default".  Nothing is done (and
+ * STATUS_OK is returned) for a NULL port, for a Unix-domain socket, or
+ * when the requested value is already in effect.
+ *
+ * Returns STATUS_OK or STATUS_ERROR.
+ */
+int
+pq_setkeepalivesinterval(int interval, Port *port)
+{
+    if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family))
+        return STATUS_OK;
+
+#ifndef TCP_KEEPINTVL
+    /* Without platform support only the "default" request can succeed */
+    if (interval == 0)
+        return STATUS_OK;
+
+    elog(LOG, "setsockopt(TCP_KEEPINTVL) not supported");
+    return STATUS_ERROR;
+#else
+    if (port->keepalives_interval == interval)
+        return STATUS_OK;
+
+    /* Need the default; if it cannot be found, only interval == 0 is acceptable */
+    if (port->default_keepalives_interval <= 0 &&
+        pq_getkeepalivesinterval(port) < 0)
+        return (interval == 0) ? STATUS_OK : STATUS_ERROR; /* default is set but unknown */
+
+    if (interval == 0)
+        interval = port->default_keepalives_interval;
+
+    if (setsockopt(port->sock, IPPROTO_TCP, TCP_KEEPINTVL,
+                   (char *) &interval, sizeof(interval)) < 0)
+    {
+        elog(LOG, "setsockopt(TCP_KEEPINTVL) failed: %m");
+        return STATUS_ERROR;
+    }
+
+    port->keepalives_interval = interval;
+    return STATUS_OK;
+#endif
+}
+
+/*
+ * pq_getkeepalivescount - report the keepalive probe count
+ *
+ * Returns 0 when TCP_KEEPCNT is not available, when port is NULL or when
+ * the connection is a Unix-domain socket.  Otherwise returns the value
+ * last set through pq_setkeepalivescount() if there is one, else the
+ * socket's default, which is fetched with getsockopt() on first use and
+ * cached in the Port (-1 if it could not be fetched).
+ */
+int
+pq_getkeepalivescount(Port *port)
+{
+#ifndef TCP_KEEPCNT
+    return 0;
+#else
+    ACCEPT_TYPE_ARG3 optlen;
+
+    if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family))
+        return 0;
+
+    /* An explicitly set value wins */
+    if (port->keepalives_count != 0)
+        return port->keepalives_count;
+
+    /* Default already looked up (or known to be unavailable) */
+    if (port->default_keepalives_count != 0)
+        return port->default_keepalives_count;
+
+    optlen = sizeof(port->default_keepalives_count);
+    if (getsockopt(port->sock, IPPROTO_TCP, TCP_KEEPCNT,
+                   (char *) &port->default_keepalives_count,
+                   &optlen) < 0)
+    {
+        elog(LOG, "getsockopt(TCP_KEEPCNT) failed: %m");
+        port->default_keepalives_count = -1; /* don't know */
+    }
+
+    return port->default_keepalives_count;
+#endif
+}
+
+/*
+ * pq_setkeepalivescount - set the keepalive probe count
+ *
+ * count == 0 means "use the socket's default".  Nothing is done (and
+ * STATUS_OK is returned) for a NULL port, for a Unix-domain socket, or
+ * when the requested value is already in effect.
+ *
+ * Returns STATUS_OK or STATUS_ERROR.
+ */
+int
+pq_setkeepalivescount(int count, Port *port)
+{
+    if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family))
+        return STATUS_OK;
+
+#ifndef TCP_KEEPCNT
+    /* Without platform support only the "default" request can succeed */
+    if (count == 0)
+        return STATUS_OK;
+
+    elog(LOG, "setsockopt(TCP_KEEPCNT) not supported");
+    return STATUS_ERROR;
+#else
+    if (port->keepalives_count == count)
+        return STATUS_OK;
+
+    /* Need the default; if it cannot be found, only count == 0 is acceptable */
+    if (port->default_keepalives_count <= 0 &&
+        pq_getkeepalivescount(port) < 0)
+        return (count == 0) ? STATUS_OK : STATUS_ERROR; /* default is set but unknown */
+
+    if (count == 0)
+        count = port->default_keepalives_count;
+
+    if (setsockopt(port->sock, IPPROTO_TCP, TCP_KEEPCNT,
+                   (char *) &count, sizeof(count)) < 0)
+    {
+        elog(LOG, "setsockopt(TCP_KEEPCNT) failed: %m");
+        return STATUS_ERROR;
+    }
+
+    port->keepalives_count = count;
+    return STATUS_OK;
+#endif
+}
diff --git a/src/gtm/libpq/pqformat.c b/src/gtm/libpq/pqformat.c
new file mode 100644
index 0000000000..339f50a995
--- /dev/null
+++ b/src/gtm/libpq/pqformat.c
@@ -0,0 +1,658 @@
+/*-------------------------------------------------------------------------
+ *
+ * pqformat.c
+ * Routines for formatting and parsing frontend/backend messages
+ *
+ * Outgoing messages are built up in a StringInfo buffer (which is expansible)
+ * and then sent in a single call to pq_putmessage. This module provides data
+ * formatting/conversion routines that are needed to produce valid messages.
+ * Note in particular the distinction between "raw data" and "text"; raw data
+ * is message protocol characters and binary values that are not subject to
+ * character set conversion, while text is converted by character encoding
+ * rules.
+ *
+ * Incoming messages are similarly read into a StringInfo buffer, via
+ * pq_getmessage, and then parsed and converted from that using the routines
+ * in this module.
+ *
+ * These same routines support reading and writing of external binary formats
+ * (typsend/typreceive routines). The conversion routines for individual
+ * data types are exactly the same, only initialization and completion
+ * are different.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/backend/libpq/pqformat.c,v 1.48 2009/01/01 17:23:42 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ * Message assembly and output:
+ * pq_beginmessage - initialize StringInfo buffer
+ * pq_sendbyte - append a raw byte to a StringInfo buffer
+ * pq_sendint - append a binary integer to a StringInfo buffer
+ * pq_sendint64 - append a binary 8-byte int to a StringInfo buffer
+ * pq_sendfloat4 - append a float4 to a StringInfo buffer
+ * pq_sendfloat8 - append a float8 to a StringInfo buffer
+ * pq_sendbytes - append raw data to a StringInfo buffer
+ * pq_sendcountedtext - append a counted text string (with character set conversion)
+ * pq_sendtext - append a text string (with conversion)
+ * pq_sendstring - append a null-terminated text string (with conversion)
+ * pq_send_ascii_string - append a null-terminated text string (without conversion)
+ * pq_endmessage - send the completed message to the frontend
+ * Note: it is also possible to append data to the StringInfo buffer using
+ * the regular StringInfo routines, but this is discouraged since required
+ * character set conversion may not occur.
+ *
+ * typsend support (construct a bytea value containing external binary data):
+ * pq_begintypsend - initialize StringInfo buffer
+ * pq_endtypsend - return the completed string as a "bytea*"
+ *
+ * Special-case message output:
+ * pq_puttextmessage - generate a character set-converted message in one step
+ * pq_putemptymessage - convenience routine for message with empty body
+ *
+ * Message parsing after input:
+ * pq_getmsgbyte - get a raw byte from a message buffer
+ * pq_getmsgint - get a binary integer from a message buffer
+ * pq_getmsgint64 - get a binary 8-byte int from a message buffer
+ * pq_getmsgfloat4 - get a float4 from a message buffer
+ * pq_getmsgfloat8 - get a float8 from a message buffer
+ * pq_getmsgbytes - get raw data from a message buffer
+ * pq_copymsgbytes - copy raw data from a message buffer
+ * pq_getmsgtext - get a counted text string (with conversion)
+ * pq_getmsgstring - get a null-terminated text string (with conversion)
+ * pq_getmsgend - verify message fully consumed
+ * pq_getmsgunreadlen - get length of the unread data in the message buffer
+ */
+
+#include <sys/param.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#include "gtm/gtm_c.h"
+#include "gtm/libpq.h"
+#include "gtm/pqformat.h"
+#include "gtm/gtm.h"
+#include "gtm/memutils.h"
+#include "gtm/elog.h"
+
+
+/* --------------------------------
+ *		pq_beginmessage		- start assembling an outgoing message
+ *
+ * Initializes buf and records msgtype for pq_endmessage to pick up later.
+ * --------------------------------
+ */
+void
+pq_beginmessage(StringInfo buf, char msgtype)
+{
+	initStringInfo(buf);
+
+	/*
+	 * The type byte is parked in the cursor field rather than in the buffer
+	 * contents; this relies on the pq_sendXXX routines leaving cursor alone.
+	 */
+	buf->cursor = msgtype;
+}
+
+/* --------------------------------
+ *		pq_sendbyte		- add one raw byte to the end of a StringInfo buffer
+ *
+ * byt is handed to appendStringInfoCharMacro unchanged.
+ * --------------------------------
+ */
+void
+pq_sendbyte(StringInfo buf, int byt)
+{
+	appendStringInfoCharMacro(buf, byt);
+}
+
+/* --------------------------------
+ *		pq_sendbytes	- add datalen raw bytes to the end of a StringInfo buffer
+ *
+ * The bytes are appended as-is; no conversion of any kind is applied.
+ * --------------------------------
+ */
+void
+pq_sendbytes(StringInfo buf, const char *data, int datalen)
+{
+	appendBinaryStringInfo(buf, data, datalen);
+}
+
+/* --------------------------------
+ *		pq_sendcountedtext - append a counted text string
+ *
+ * Appends a 4-byte count field followed by the slen bytes of str.  The
+ * count includes its own 4 bytes when countincludesself is true, and is
+ * just slen otherwise.  The passed string need not be null-terminated, and
+ * no terminator is appended.
+ *
+ * No character set conversion is performed in this copy of the routine.
+ * The previous code still tested a "converted" pointer that was never
+ * assigned, which meant reading, strlen()ing and pfree()ing an uninitialized
+ * pointer; the string is now always sent exactly as passed.
+ * --------------------------------
+ */
+void
+pq_sendcountedtext(StringInfo buf, const char *str, int slen,
+				   bool countincludesself)
+{
+	int			extra = countincludesself ? 4 : 0;
+
+	pq_sendint(buf, slen + extra, 4);
+	appendBinaryStringInfo(buf, str, slen);
+}
+
+/* --------------------------------
+ *		pq_sendtext		- append a text string of known length
+ *
+ * Appends exactly slen bytes of str.  The passed string need not be
+ * null-terminated, and no terminator or length prefix is appended, so the
+ * receiver must learn the length by other means.
+ *
+ * No character set conversion is performed in this copy of the routine.
+ * The previous code still tested a "converted" pointer that was never
+ * assigned, which meant reading, strlen()ing and pfree()ing an uninitialized
+ * pointer; the string is now always sent exactly as passed.
+ * --------------------------------
+ */
+void
+pq_sendtext(StringInfo buf, const char *str, int slen)
+{
+	appendBinaryStringInfo(buf, str, slen);
+}
+
+/* --------------------------------
+ *		pq_sendstring	- append a null-terminated text string
+ *
+ * NB: str must be null-terminated; the terminator is appended along with
+ * the string bytes.  The bytes are copied without any conversion.
+ * --------------------------------
+ */
+void
+pq_sendstring(StringInfo buf, const char *str)
+{
+	/* +1 so the trailing null byte is part of what gets appended */
+	appendBinaryStringInfo(buf, str, (int) strlen(str) + 1);
+}
+
+/* --------------------------------
+ *		pq_send_ascii_string	- append a null-terminated string, forcing
+ *								  it to 7-bit ASCII
+ *
+ * Every byte with its high bit set is replaced by a question mark; all
+ * other bytes are appended unchanged, followed by a null terminator.
+ *
+ * NB: str must be null-terminated.
+ * --------------------------------
+ */
+void
+pq_send_ascii_string(StringInfo buf, const char *str)
+{
+	const char *p;
+
+	for (p = str; *p != '\0'; p++)
+	{
+		char		ch = *p;
+
+		if (IS_HIGHBIT_SET(ch))
+			ch = '?';
+		appendStringInfoCharMacro(buf, ch);
+	}
+
+	appendStringInfoChar(buf, '\0');
+}
+
+/* --------------------------------
+ *		pq_sendint		- append a binary integer to a StringInfo buffer
+ *
+ * b is the width in bytes and must be 1, 2 or 4.  Two- and four-byte values
+ * are converted to network byte order before being appended.  Any other
+ * width is reported through elog(ERROR).
+ * --------------------------------
+ */
+void
+pq_sendint(StringInfo buf, int i, int b)
+{
+	if (b == 1)
+	{
+		unsigned char octet = (unsigned char) i;
+
+		appendBinaryStringInfo(buf, (char *) &octet, 1);
+	}
+	else if (b == 2)
+	{
+		uint16		net16 = htons((uint16) i);
+
+		appendBinaryStringInfo(buf, (char *) &net16, 2);
+	}
+	else if (b == 4)
+	{
+		uint32		net32 = htonl((uint32) i);
+
+		appendBinaryStringInfo(buf, (char *) &net32, 4);
+	}
+	else
+		elog(ERROR, "unsupported integer size %d", b);
+}
+
+/* --------------------------------
+ *		pq_sendint64	- append a binary 8-byte int to a StringInfo buffer
+ *
+ * The value goes out as two 4-byte words, most significant word first, each
+ * in network byte order.  This is kept separate from pq_sendint so that the
+ * narrower widths do not have to pass an int64 argument.
+ * --------------------------------
+ */
+void
+pq_sendint64(StringInfo buf, int64 i)
+{
+	uint32		hi;
+	uint32		lo;
+
+#ifdef INT64_IS_BUSTED
+	/* int64 is really 32 bits here: synthesize the sign-extension word */
+	hi = (i < 0) ? -1 : 0;
+#else
+	hi = (uint32) (i >> 32);
+#endif
+	lo = (uint32) i;
+
+	/* MSB-first on the wire: high word, then low word */
+	hi = htonl(hi);
+	appendBinaryStringInfo(buf, (char *) &hi, 4);
+
+	lo = htonl(lo);
+	appendBinaryStringInfo(buf, (char *) &lo, 4);
+}
+
+/* --------------------------------
+ *		pq_sendfloat4	- append a float4 to a StringInfo buffer
+ *
+ * The float's bit pattern is reinterpreted as a 32-bit integer and
+ * byte-swapped with htonl, i.e. exactly the way an int4 would be sent.
+ * That rule is not perfect, but it is portable across most
+ * IEEE-float-using architectures.
+ * --------------------------------
+ */
+void
+pq_sendfloat4(StringInfo buf, float4 f)
+{
+	union
+	{
+		float4		value;
+		uint32		bits;
+	}			pun;
+	uint32		wire;
+
+	pun.value = f;
+	wire = htonl(pun.bits);
+
+	appendBinaryStringInfo(buf, (char *) &wire, 4);
+}
+
+/* --------------------------------
+ * pq_sendfloat8 - append a float8 to a StringInfo buffer
+ *
+ * The point of this routine is to localize knowledge of the external binary
+ * representation of float8, which is a component of several datatypes.
+ *
+ * We currently assume that float8 should be byte-swapped in the same way
+ * as int8. This rule is not perfect but it gives us portability across
+ * most IEEE-float-using architectures.
+ * --------------------------------
+ */
+void
+pq_sendfloat8(StringInfo buf, float8 f)
+{
+#ifdef INT64_IS_BUSTED
+ /*
+ * No usable 64-bit integer: view the double as two 32-bit halves, swap
+ * each half to network byte order, and emit them in the order selected
+ * by WORDS_BIGENDIAN below.
+ */
+ union
+ {
+ float8 f;
+ uint32 h[2];
+ } swap;
+
+ swap.f = f;
+ swap.h[0] = htonl(swap.h[0]);
+ swap.h[1] = htonl(swap.h[1]);
+
+#ifdef WORDS_BIGENDIAN
+ /* machine seems to be big-endian, send h[0] first */
+ appendBinaryStringInfo(buf, (char *) &swap.h[0], 4);
+ appendBinaryStringInfo(buf, (char *) &swap.h[1], 4);
+#else
+ /* machine seems to be little-endian, send h[1] first */
+ appendBinaryStringInfo(buf, (char *) &swap.h[1], 4);
+ appendBinaryStringInfo(buf, (char *) &swap.h[0], 4);
+#endif
+#else /* INT64 works */
+ /* Reinterpret the double's bits as an int64 and send it like an int8. */
+ union
+ {
+ float8 f;
+ int64 i;
+ } swap;
+
+ swap.f = f;
+ pq_sendint64(buf, swap.i);
+#endif
+}
+
+/* --------------------------------
+ *		pq_endmessage	- hand the assembled message to pq_putmessage
+ *
+ * The buffer's data area is pfree()d and its pointer cleared afterwards.
+ * The StringInfo struct itself is not freed here, so a caller that
+ * allocated it separately still has to release it.
+ * --------------------------------
+ */
+void
+pq_endmessage(Port *myport, StringInfo buf)
+{
+	/* pq_beginmessage parked the message type in the cursor field */
+	(void) pq_putmessage(myport, buf->cursor, buf->data, buf->len);
+
+	/*
+	 * The send result is deliberately ignored; pqcomm.c is expected to have
+	 * reported any failure already.
+	 */
+	pfree(buf->data);
+	buf->data = NULL;
+}
+
+
+/* --------------------------------
+ *		pq_puttextmessage - send a message whose body is one C string
+ *
+ * Calls pq_putmessage with str as the body; the string's terminating null
+ * byte is included in the length.  The send result is ignored.
+ * --------------------------------
+ */
+void
+pq_puttextmessage(Port *myport, char msgtype, const char *str)
+{
+	/* +1 so the null terminator travels with the string */
+	(void) pq_putmessage(myport, msgtype, str, (int) strlen(str) + 1);
+}
+
+
+/* --------------------------------
+ *		pq_putemptymessage - send a message that has a type but no body
+ *
+ * Calls pq_putmessage with a NULL body of length 0; the result is ignored.
+ * --------------------------------
+ */
+void
+pq_putemptymessage(Port *myport, char msgtype)
+{
+	(void) pq_putmessage(myport, msgtype, NULL, 0);
+}
+
+
+/* --------------------------------
+ *		pq_getmsgbyte	- read one raw byte from a message buffer
+ *
+ * Returns the byte at the cursor as a value in 0..255 and advances the
+ * cursor by one.  Reports an error if the cursor is already at or past the
+ * end of the message.
+ * --------------------------------
+ */
+int
+pq_getmsgbyte(StringInfo msg)
+{
+	int			value;
+
+	if (msg->cursor >= msg->len)
+		ereport(ERROR,
+				(EPROTO,
+				 errmsg("no data left in message")));
+
+	value = (unsigned char) msg->data[msg->cursor];
+	msg->cursor++;
+	return value;
+}
+
+/* --------------------------------
+ *		pq_getmsgint	- read a binary integer from a message buffer
+ *
+ * b is the width in bytes and must be 1, 2 or 4.  Two- and four-byte values
+ * are converted from network byte order.  The result is always treated as
+ * unsigned.  Any other width is reported through elog(ERROR).
+ * --------------------------------
+ */
+unsigned int
+pq_getmsgint(StringInfo msg, int b)
+{
+	unsigned int value = 0;		/* also the result for an unsupported width */
+
+	if (b == 1)
+	{
+		unsigned char octet;
+
+		pq_copymsgbytes(msg, (char *) &octet, 1);
+		value = octet;
+	}
+	else if (b == 2)
+	{
+		uint16		net16;
+
+		pq_copymsgbytes(msg, (char *) &net16, 2);
+		value = ntohs(net16);
+	}
+	else if (b == 4)
+	{
+		uint32		net32;
+
+		pq_copymsgbytes(msg, (char *) &net32, 4);
+		value = ntohl(net32);
+	}
+	else
+		elog(ERROR, "unsupported integer size %d", b);
+
+	return value;
+}
+
+/* --------------------------------
+ *		pq_getmsgint64	- read a binary 8-byte int from a message buffer
+ *
+ * Reads two 4-byte words, most significant first, each in network byte
+ * order, and combines them.  Kept separate from pq_getmsgint so that the
+ * narrower widths do not have to return an int64.
+ * --------------------------------
+ */
+int64
+pq_getmsgint64(StringInfo msg)
+{
+	uint32		hi;
+	uint32		lo;
+	int64		value;
+
+	pq_copymsgbytes(msg, (char *) &hi, 4);
+	pq_copymsgbytes(msg, (char *) &lo, 4);
+	hi = ntohl(hi);
+	lo = ntohl(lo);
+
+#ifdef INT64_IS_BUSTED
+	/* only values whose high word is a pure sign extension are accepted */
+	value = lo;
+	if ((value < 0) ? (hi != -1) : (hi != 0))
+		ereport(ERROR,
+				(ERANGE,
+				 errmsg("binary value is out of range for type bigint")));
+#else
+	value = ((int64) hi << 32) | lo;
+#endif
+
+	return value;
+}
+
+/* --------------------------------
+ *		pq_getmsgfloat4 - read a float4 from a message buffer
+ *
+ * Reads a 4-byte integer with pq_getmsgint and reinterprets its bits as a
+ * float4; this is the inverse of pq_sendfloat4.
+ * --------------------------------
+ */
+float4
+pq_getmsgfloat4(StringInfo msg)
+{
+	union
+	{
+		float4		value;
+		uint32		bits;
+	}			pun;
+
+	pun.bits = pq_getmsgint(msg, 4);
+	return pun.value;
+}
+
+/* --------------------------------
+ * pq_getmsgfloat8 - get a float8 from a message buffer
+ *
+ * See notes for pq_sendfloat8.
+ * --------------------------------
+ */
+float8
+pq_getmsgfloat8(StringInfo msg)
+{
+#ifdef INT64_IS_BUSTED
+ /*
+ * No usable 64-bit integer: read the value as two 32-bit halves (each
+ * converted from network byte order by pq_getmsgint) and store them in
+ * the order selected by WORDS_BIGENDIAN below.
+ */
+ union
+ {
+ float8 f;
+ uint32 h[2];
+ } swap;
+
+#ifdef WORDS_BIGENDIAN
+ /* machine seems to be big-endian, receive h[0] first */
+ swap.h[0] = pq_getmsgint(msg, 4);
+ swap.h[1] = pq_getmsgint(msg, 4);
+#else
+ /* machine seems to be little-endian, receive h[1] first */
+ swap.h[1] = pq_getmsgint(msg, 4);
+ swap.h[0] = pq_getmsgint(msg, 4);
+#endif
+ return swap.f;
+#else /* INT64 works */
+ /* Read an int8 and reinterpret its bits as a double. */
+ union
+ {
+ float8 f;
+ int64 i;
+ } swap;
+
+ swap.i = pq_getmsgint64(msg);
+ return swap.f;
+#endif
+}
+
+/* --------------------------------
+ *		pq_getmsgbytes	- fetch raw data from a message buffer without copying
+ *
+ * Returns a pointer straight into the message buffer and advances the
+ * cursor by datalen.  The pointer carries no alignment guarantee.  Reports
+ * an error if datalen is negative or exceeds the unread remainder.
+ * --------------------------------
+ */
+const char *
+pq_getmsgbytes(StringInfo msg, int datalen)
+{
+	const char *start;
+	int			avail = msg->len - msg->cursor;
+
+	if (datalen < 0 || datalen > avail)
+		ereport(ERROR,
+				(EPROTO,
+				 errmsg("insufficient data left in message")));
+
+	start = msg->data + msg->cursor;
+	msg->cursor += datalen;
+	return start;
+}
+
+/* --------------------------------
+ *		pq_copymsgbytes - copy raw data out of a message buffer
+ *
+ * Copies datalen bytes starting at the cursor into buf and advances the
+ * cursor.  Reports an error if datalen is negative or exceeds the unread
+ * remainder.
+ * --------------------------------
+ */
+void
+pq_copymsgbytes(StringInfo msg, char *buf, int datalen)
+{
+	int			avail = msg->len - msg->cursor;
+
+	if (datalen < 0 || datalen > avail)
+		ereport(ERROR,
+				(EPROTO,
+				 errmsg("insufficient data left in message")));
+
+	memcpy(buf, msg->data + msg->cursor, datalen);
+	msg->cursor += datalen;
+}
+
+/* --------------------------------
+ *		pq_getmsgtext	- fetch a counted text string as a fresh copy
+ *
+ * Consumes rawbytes bytes at the cursor and returns them in a newly
+ * palloc'd, null-terminated buffer; *nbytes receives the string length.
+ * The bytes are copied verbatim.  Reports an error if rawbytes is negative
+ * or exceeds the unread remainder.
+ * --------------------------------
+ */
+char *
+pq_getmsgtext(StringInfo msg, int rawbytes, int *nbytes)
+{
+	const char *src;
+	char	   *copy;
+
+	if (rawbytes < 0 || rawbytes > (msg->len - msg->cursor))
+		ereport(ERROR,
+				(EPROTO,
+				 errmsg("insufficient data left in message")));
+
+	src = msg->data + msg->cursor;
+	msg->cursor += rawbytes;
+
+	copy = (char *) palloc(rawbytes + 1);
+	memcpy(copy, src, rawbytes);
+	copy[rawbytes] = '\0';
+
+	*nbytes = rawbytes;
+	return copy;
+}
+
+/* --------------------------------
+ *		pq_getmsgstring - fetch a null-terminated text string
+ *
+ * Returns a pointer directly into the message buffer and advances the
+ * cursor past the string and its terminator.  Reports an error if no null
+ * byte is found inside the message.
+ * --------------------------------
+ */
+const char *
+pq_getmsgstring(StringInfo msg)
+{
+	char	   *start = &msg->data[msg->cursor];
+
+	/*
+	 * strlen() cannot run off the buffer: a StringInfo is guaranteed to
+	 * carry a trailing null byte.  What still needs checking is that the
+	 * null we stopped at lies inside the message.
+	 */
+	int			slen = strlen(start);
+
+	if (msg->cursor + slen >= msg->len)
+		ereport(ERROR,
+				(EPROTO,
+				 errmsg("invalid string in message")));
+
+	msg->cursor += slen + 1;
+	return start;
+}
+
+/* --------------------------------
+ *		pq_getmsgend	- check that the whole message has been consumed
+ *
+ * Reports an error unless the cursor sits exactly at the message length.
+ * --------------------------------
+ */
+void
+pq_getmsgend(StringInfo msg)
+{
+	if (msg->cursor == msg->len)
+		return;
+
+	ereport(ERROR,
+			(EPROTO,
+			 errmsg("invalid message format")));
+}
+
+/* --------------------------------
+ *		pq_getmsgunreadlen - number of bytes between the cursor and the end
+ *							 of the message buffer
+ * --------------------------------
+ */
+int
+pq_getmsgunreadlen(StringInfo msg)
+{
+	int			remaining = msg->len - msg->cursor;
+
+	return remaining;
+}
diff --git a/src/gtm/libpq/pqsignal.c b/src/gtm/libpq/pqsignal.c
new file mode 100644
index 0000000000..6bff3d4e14
--- /dev/null
+++ b/src/gtm/libpq/pqsignal.c
@@ -0,0 +1,181 @@
+/*-------------------------------------------------------------------------
+ *
+ * pqsignal.c
+ * reliable BSD-style signal(2) routine stolen from RWW who stole it
+ * from Stevens...
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL: pgsql/src/backend/libpq/pqsignal.c,v 1.44 2008/01/01 19:45:49 momjian Exp $
+ *
+ * NOTES
+ * This shouldn't be in libpq, but the monitor and some other
+ * things need it...
+ *
+ * A NOTE ABOUT SIGNAL HANDLING ACROSS THE VARIOUS PLATFORMS.
+ *
+ * pg_config.h defines the macro HAVE_POSIX_SIGNALS for some platforms and
+ * not for others. This file and pqsignal.h use that macro to decide
+ * how to handle signalling.
+ *
+ * signal(2) handling - this is here because it affects some of
+ * the frontend commands as well as the backend server.
+ *
+ * Ultrix and SunOS provide BSD signal(2) semantics by default.
+ *
+ * SVID2 and POSIX signal(2) semantics differ from BSD signal(2)
+ * semantics. We can use the POSIX sigaction(2) on systems that
+ * allow us to request restartable signals (SA_RESTART).
+ *
+ * Some systems don't allow restartable signals at all unless we
+ * link to a special BSD library.
+ *
+ * We devoutly hope that there aren't any systems that provide
+ * neither POSIX signals nor BSD signals. The alternative
+ * is to do signal-handler reinstallation, which doesn't work well
+ * at all.
+ * ------------------------------------------------------------------------*/
+
+#include "gtm/gtm.h"
+
+#include <signal.h>
+
+#include "gtm/pqsignal.h"
+
+
+/*
+ * Signal masks filled in by pqinitmask(). They are sigset_t values where
+ * sigprocmask() is available and plain int bit masks otherwise.
+ */
+#ifdef HAVE_SIGPROCMASK
+sigset_t UnBlockSig,
+ BlockSig,
+ AuthBlockSig;
+#else
+int UnBlockSig,
+ BlockSig,
+ AuthBlockSig;
+#endif
+
+
+/*
+ * Initialize BlockSig, UnBlockSig, and AuthBlockSig.
+ *
+ * BlockSig is the set of signals to block when we are trying to block
+ * signals. This includes all signals we normally expect to get, but NOT
+ * signals that should never be turned off.
+ *
+ * AuthBlockSig is the set of signals to block during authentication;
+ * it's essentially BlockSig minus SIGTERM, SIGQUIT, SIGALRM.
+ *
+ * UnBlockSig is the set of signals to block when we don't want to block
+ * signals (is this ever nonzero??)
+ */
+void
+pqinitmask(void)
+{
+#ifdef HAVE_SIGPROCMASK
+
+ /* UnBlockSig blocks nothing. */
+ sigemptyset(&UnBlockSig);
+
+ /* First set all signals, then clear some. */
+ sigfillset(&BlockSig);
+ sigfillset(&AuthBlockSig);
+
+ /*
+ * Unmark those signals that should never be blocked. Some of these signal
+ * names don't exist on all platforms. Most do, but might as well ifdef
+ * them all for consistency...
+ */
+#ifdef SIGTRAP
+ sigdelset(&BlockSig, SIGTRAP);
+ sigdelset(&AuthBlockSig, SIGTRAP);
+#endif
+#ifdef SIGABRT
+ sigdelset(&BlockSig, SIGABRT);
+ sigdelset(&AuthBlockSig, SIGABRT);
+#endif
+#ifdef SIGILL
+ sigdelset(&BlockSig, SIGILL);
+ sigdelset(&AuthBlockSig, SIGILL);
+#endif
+#ifdef SIGFPE
+ sigdelset(&BlockSig, SIGFPE);
+ sigdelset(&AuthBlockSig, SIGFPE);
+#endif
+#ifdef SIGSEGV
+ sigdelset(&BlockSig, SIGSEGV);
+ sigdelset(&AuthBlockSig, SIGSEGV);
+#endif
+#ifdef SIGBUS
+ sigdelset(&BlockSig, SIGBUS);
+ sigdelset(&AuthBlockSig, SIGBUS);
+#endif
+#ifdef SIGSYS
+ sigdelset(&BlockSig, SIGSYS);
+ sigdelset(&AuthBlockSig, SIGSYS);
+#endif
+#ifdef SIGCONT
+ sigdelset(&BlockSig, SIGCONT);
+ sigdelset(&AuthBlockSig, SIGCONT);
+#endif
+
+/* Signals unique to Auth */
+ /* These stay deliverable in AuthBlockSig only; BlockSig still blocks them. */
+#ifdef SIGQUIT
+ sigdelset(&AuthBlockSig, SIGQUIT);
+#endif
+#ifdef SIGTERM
+ sigdelset(&AuthBlockSig, SIGTERM);
+#endif
+#ifdef SIGALRM
+ sigdelset(&AuthBlockSig, SIGALRM);
+#endif
+#else
+ /*
+ * No sigprocmask(): build the masks as OR-ed sigmask() bits instead.
+ * Here the masks list the signals to block explicitly rather than
+ * starting from "all signals" as the branch above does.
+ */
+ /* Set the signals we want. */
+ UnBlockSig = 0;
+ BlockSig = sigmask(SIGQUIT) |
+ sigmask(SIGTERM) | sigmask(SIGALRM) |
+ /* common signals between two */
+ sigmask(SIGHUP) |
+ sigmask(SIGINT) | sigmask(SIGUSR1) |
+ sigmask(SIGUSR2) | sigmask(SIGCHLD) |
+ sigmask(SIGWINCH) | sigmask(SIGFPE);
+ /* Same as BlockSig minus SIGQUIT, SIGTERM and SIGALRM. */
+ AuthBlockSig = sigmask(SIGHUP) |
+ sigmask(SIGINT) | sigmask(SIGUSR1) |
+ sigmask(SIGUSR2) | sigmask(SIGCHLD) |
+ sigmask(SIGWINCH) | sigmask(SIGFPE);
+#endif
+}
+
+
+/* Win32 signal handling is in backend/port/win32/signal.c */
+#ifndef WIN32
+
+/*
+ * Install func as the handler for signal signo.
+ *
+ * With POSIX signals the handler is installed through sigaction() with an
+ * empty handler mask; SA_RESTART is requested for every signal except
+ * SIGALRM, and SA_NOCLDSTOP (where defined) for SIGCHLD.  Otherwise plain
+ * signal() is used.  Returns the previous handler, or SIG_ERR if
+ * sigaction() fails.
+ */
+pqsigfunc
+pqsignal(int signo, pqsigfunc func)
+{
+#if defined(HAVE_POSIX_SIGNALS)
+	struct sigaction newact;
+	struct sigaction oldact;
+
+	newact.sa_handler = func;
+	sigemptyset(&newact.sa_mask);
+	newact.sa_flags = (signo != SIGALRM) ? SA_RESTART : 0;
+#ifdef SA_NOCLDSTOP
+	if (signo == SIGCHLD)
+		newact.sa_flags |= SA_NOCLDSTOP;
+#endif
+
+	if (sigaction(signo, &newact, &oldact) < 0)
+		return SIG_ERR;
+	return oldact.sa_handler;
+#else
+	return signal(signo, func);
+#endif   /* HAVE_POSIX_SIGNALS */
+}
+
+#endif /* WIN32 */
diff --git a/src/gtm/libpq/strlcpy.c b/src/gtm/libpq/strlcpy.c
new file mode 100644
index 0000000000..ae031e244c
--- /dev/null
+++ b/src/gtm/libpq/strlcpy.c
@@ -0,0 +1,72 @@
+/*-------------------------------------------------------------------------
+ *
+ * strlcpy.c
+ * strncpy done right
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL: pgsql/src/port/strlcpy.c,v 1.5 2008/01/01 19:46:00 momjian Exp $
+ *
+ * This file was taken from OpenBSD and is used on platforms that don't
+ * provide strlcpy(). The OpenBSD copyright terms follow.
+ *-------------------------------------------------------------------------
+ */
+
+/* $OpenBSD: strlcpy.c,v 1.11 2006/05/05 15:27:38 millert Exp $ */
+
+/*
+ * Copyright (c) 1998 Todd C. Miller <Todd.Miller@courtesan.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "gtm/gtm_c.h"
+
+
+/*
+ * Copy src to string dst of size siz. At most siz-1 characters
+ * will be copied. Always NUL terminates (unless siz == 0).
+ * Returns strlen(src); if retval >= siz, truncation occurred.
+ * Function creation history: http://www.gratisoft.us/todd/papers/strlcpy.html
+ */
+size_t
+strlcpy(char *dst, const char *src, size_t siz)
+{
+	size_t		copied = 0;
+	size_t		total;
+
+	if (siz != 0)
+	{
+		/* Copy while a byte is still reserved for the terminator. */
+		while (copied < siz - 1 && src[copied] != '\0')
+		{
+			dst[copied] = src[copied];
+			copied++;
+		}
+
+		/* Always terminate, whether or not src had to be truncated. */
+		dst[copied] = '\0';
+	}
+
+	/* Scan whatever did not fit so the full source length is returned. */
+	total = copied;
+	while (src[total] != '\0')
+		total++;
+
+	return total;				/* count does not include NUL */
+}
diff --git a/src/gtm/main/Makefile b/src/gtm/main/Makefile
new file mode 100644
index 0000000000..7fcdf82a83
--- /dev/null
+++ b/src/gtm/main/Makefile
@@ -0,0 +1,22 @@
+# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+#
+# Builds the gtm executable from the objects in this directory plus the
+# static archives built in the sibling common, libpq and path directories.
+
+top_build_dir=../..
+include $(top_build_dir)/gtm/Makefile.global
+
+# Note: OBJS also names the sibling archives, so "clean" removes them too.
+OBJS=main.o gtm_thread.o gtm_txn.o gtm_seq.o gtm_snap.o ../common/libgtm.a ../libpq/libpqcomm.a ../path/libgtmpath.a
+LDFLAGS=-L$(top_build_dir)/common -L$(top_build_dir)/libpq
+
+LIBS=-lpthread
+
+# These targets do not produce files of the same name.
+.PHONY: all clean distclean maintainer-clean
+
+# $(LIBS) comes after the objects and archives: the linker resolves symbols
+# left to right, so a library named before the code that references it may
+# be dropped (for example with --as-needed or when linking statically).
+gtm:$(OBJS)
+	$(CC) $(CFLAGS) $(LDFLAGS) $^ $(LIBS) -o gtm
+
+all:gtm
+
+clean:
+	rm -f $(OBJS)
+	rm -f gtm
+
+distclean: clean
+
+maintainer-clean: distclean
diff --git a/src/gtm/main/gtm_seq.c b/src/gtm/main/gtm_seq.c
new file mode 100644
index 0000000000..73af34efd6
--- /dev/null
+++ b/src/gtm/main/gtm_seq.c
@@ -0,0 +1,867 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_seq.c
+ * Sequence handling on GTM
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "gtm/gtm_c.h"
+#include "gtm/gtm.h"
+#include "gtm/gtm_seq.h"
+#include "gtm/assert.h"
+#include "gtm/gtm_list.h"
+#include "gtm/libpq.h"
+#include "gtm/pqformat.h"
+#include "gtm/gtm_msg.h"
+#include <unistd.h>
+
+/*
+ * One bucket of the sequence hash table: a list of GTM_SeqInfo pointers
+ * whose keys hash to this bucket, plus the lock taken around every
+ * traversal or modification of that list.
+ */
+typedef struct GTM_SeqInfoHashBucket
+{
+ List *shb_list;
+ GTM_RWLock shb_lock;
+} GTM_SeqInfoHashBucket;
+
+/*
+ * Markers written before and after each sequence record by GTM_SaveSeqInfo()
+ * and checked again by GTM_RestoreSeqInfo().
+ */
+static int SeqStartMagic = 0xfafafafa;
+static int SeqEndMagic = 0xfefefefe;
+
+/* Number of buckets; seq_gethash() reduces a key to an index below this */
+#define SEQ_HASH_TABLE_SIZE 1024
+static GTM_SeqInfoHashBucket GTMSequences[SEQ_HASH_TABLE_SIZE];
+
+/* Helpers private to this file */
+static uint32 seq_gethash(GTM_SequenceKey key);
+static bool seq_keys_equal(GTM_SequenceKey key1, GTM_SequenceKey key2);
+static GTM_SeqInfo *seq_find_seqinfo(GTM_SequenceKey seqkey);
+static int seq_release_seqinfo(GTM_SeqInfo *seqinfo);
+static int seq_add_seqinfo(GTM_SeqInfo *seqinfo);
+static int seq_remove_seqinfo(GTM_SeqInfo *seqinfo);
+static GTM_SequenceKey seq_copy_key(GTM_SequenceKey key);
+
+/*
+ * Map a sequence key to its bucket index in GTMSequences.
+ *
+ * The hash is simply the sum of the key bytes reduced modulo the table size.
+ *
+ * XXX This should probably be replaced by a better hash function.
+ */
+static uint32
+seq_gethash(GTM_SequenceKey key)
+{
+	uint32		sum = 0;
+	int			pos = 0;
+
+	while (pos < key->gsk_keylen)
+	{
+		sum += key->gsk_key[pos];
+		pos++;
+	}
+
+	return (sum % SEQ_HASH_TABLE_SIZE);
+}
+
+/*
+ * Compare two sequence keys. They are equal when they have the same length
+ * and identical key bytes.
+ */
+static bool
+seq_keys_equal(GTM_SequenceKey key1, GTM_SequenceKey key2)
+{
+	Assert(key1);
+	Assert(key2);
+
+	/* Once the lengths match, either one bounds the byte comparison */
+	return (key1->gsk_keylen == key2->gsk_keylen) &&
+		(memcmp(key1->gsk_key, key2->gsk_key, key1->gsk_keylen) == 0);
+}
+
+/*
+ * Find the seqinfo structure for the given key. The reference count is
+ * incremented before the structure is returned, so the caller must drop the
+ * reference with seq_release_seqinfo() when done with it.
+ *
+ * Returns NULL if no sequence with this key exists, or if the matching
+ * sequence is not in SEQ_STATE_ACTIVE. No lock is held on return in either
+ * case.
+ */
+static GTM_SeqInfo *
+seq_find_seqinfo(GTM_SequenceKey seqkey)
+{
+	uint32		hash = seq_gethash(seqkey);
+	GTM_SeqInfoHashBucket *bucket;
+	ListCell   *elem;
+	GTM_SeqInfo *curr_seqinfo = NULL;
+
+	bucket = &GTMSequences[hash];
+
+	GTM_RWLockAcquire(&bucket->shb_lock, GTM_LOCKMODE_READ);
+
+	foreach(elem, bucket->shb_list)
+	{
+		curr_seqinfo = (GTM_SeqInfo *) lfirst(elem);
+		if (seq_keys_equal(curr_seqinfo->gs_key, seqkey))
+			break;
+		curr_seqinfo = NULL;
+	}
+
+	if (curr_seqinfo != NULL)
+	{
+		GTM_RWLockAcquire(&curr_seqinfo->gs_lock, GTM_LOCKMODE_WRITE);
+		if (curr_seqinfo->gs_state != SEQ_STATE_ACTIVE)
+		{
+			elog(LOG, "Sequence not active");
+			GTM_RWLockRelease(&curr_seqinfo->gs_lock);
+			/* Do not leave the bucket locked on this early exit */
+			GTM_RWLockRelease(&bucket->shb_lock);
+			return NULL;
+		}
+		Assert(curr_seqinfo->gs_ref_count != SEQ_MAX_REFCOUNT);
+		curr_seqinfo->gs_ref_count++;
+		GTM_RWLockRelease(&curr_seqinfo->gs_lock);
+	}
+	GTM_RWLockRelease(&bucket->shb_lock);
+
+	return curr_seqinfo;
+}
+
+/*
+ * Release a previously grabbed reference to the structure. If the structure
+ * is marked for deletion and this was the last reference, it is removed from
+ * the global hash table and its memory is freed; the caller must not touch
+ * seqinfo after this function returns.
+ *
+ * Always returns 0.
+ */
+static int
+seq_release_seqinfo(GTM_SeqInfo *seqinfo)
+{
+	bool		remove = false;
+
+	GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE);
+	Assert(seqinfo->gs_ref_count > 0);
+	seqinfo->gs_ref_count--;
+
+	if ((seqinfo->gs_state == SEQ_STATE_DELETED) &&
+		(seqinfo->gs_ref_count == 0))
+		remove = true;
+
+	GTM_RWLockRelease(&seqinfo->gs_lock);
+
+	/*
+	 * Unlink the structure from the global hash table. Once it is unlinked
+	 * nobody else can find it, so this last holder frees it; before this the
+	 * structure was unlinked but never freed.
+	 */
+	if (remove && seq_remove_seqinfo(seqinfo) == 0)
+	{
+		GTM_RWLockDestroy(&seqinfo->gs_lock);
+		pfree(seqinfo->gs_key);
+		pfree(seqinfo);
+	}
+	return 0;
+}
+
+/*
+ * Insert a seqinfo structure into the global hash table.
+ *
+ * Returns 0 on success, or EEXIST (after logging) when the bucket already
+ * holds a sequence with an equal key; in that case nothing is inserted.
+ */
+static int
+seq_add_seqinfo(GTM_SeqInfo *seqinfo)
+{
+	GTM_SeqInfoHashBucket *bucket = &GTMSequences[seq_gethash(seqinfo->gs_key)];
+	ListCell   *cell;
+	bool		duplicate = false;
+
+	GTM_RWLockAcquire(&bucket->shb_lock, GTM_LOCKMODE_WRITE);
+
+	foreach(cell, bucket->shb_list)
+	{
+		GTM_SeqInfo *existing = (GTM_SeqInfo *) lfirst(cell);
+
+		if (seq_keys_equal(existing->gs_key, seqinfo->gs_key))
+		{
+			duplicate = true;
+			break;
+		}
+	}
+
+	/* No clash found, so it is safe to append the new entry */
+	if (!duplicate)
+		bucket->shb_list = lappend(bucket->shb_list, seqinfo);
+
+	GTM_RWLockRelease(&bucket->shb_lock);
+
+	if (duplicate)
+	{
+		ereport(LOG,
+				(EEXIST,
+				 errmsg("Sequence with the given key already exists")));
+		return EEXIST;
+	}
+
+	return 0;
+}
+
+/*
+ * Remove the seqinfo structure from the global hash table. If more than one
+ * reference to the structure is outstanding, it is only marked
+ * SEQ_STATE_DELETED and EBUSY is returned; otherwise it is unlinked from its
+ * bucket and 0 is returned. The structure itself is not freed here.
+ */
+static int
+seq_remove_seqinfo(GTM_SeqInfo *seqinfo)
+{
+	GTM_SeqInfoHashBucket *bucket = &GTMSequences[seq_gethash(seqinfo->gs_key)];
+	int			result = 0;
+
+	/* Bucket lock first, then the per-sequence lock */
+	GTM_RWLockAcquire(&bucket->shb_lock, GTM_LOCKMODE_WRITE);
+	GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE);
+
+	if (seqinfo->gs_ref_count > 1)
+	{
+		seqinfo->gs_state = SEQ_STATE_DELETED;
+		result = EBUSY;
+	}
+	else
+		bucket->shb_list = list_delete(bucket->shb_list, seqinfo);
+
+	GTM_RWLockRelease(&seqinfo->gs_lock);
+	GTM_RWLockRelease(&bucket->shb_lock);
+
+	return result;
+}
+
+/*
+ * Return a copy of the given key. Header and key bytes are carved out of a
+ * single allocation, so one pfree() of the returned pointer releases both.
+ */
+static GTM_SequenceKey
+seq_copy_key(GTM_SequenceKey key)
+{
+	GTM_SequenceKey copy;
+
+	/*
+	 * The copy is allocated in TopMostMemoryContext because the sequence
+	 * information is not bound to a thread and can outlive any of the thread
+	 * specific contexts.
+	 */
+	copy = (GTM_SequenceKey) MemoryContextAlloc(TopMostMemoryContext,
+												 sizeof(GTM_SequenceKeyData) +
+												 key->gsk_keylen);
+	if (copy == NULL)
+		ereport(ERROR, (ENOMEM, errmsg("Out of memory")));
+
+	/* The key bytes sit directly behind the header */
+	copy->gsk_key = (char *) copy + sizeof(GTM_SequenceKeyData);
+	copy->gsk_keylen = key->gsk_keylen;
+	memcpy(copy->gsk_key, key->gsk_key, key->gsk_keylen);
+
+	return copy;
+}
+
+/*
+ * Free a seqinfo structure that is not linked into the hash table.
+ */
+static void
+seq_discard_seqinfo(GTM_SeqInfo *seqinfo)
+{
+	GTM_RWLockDestroy(&seqinfo->gs_lock);
+	pfree(seqinfo->gs_key);
+	pfree(seqinfo);
+}
+
+/*
+ * Initialize a new sequence. Optionally set the initial value of the sequence.
+ *
+ * Any of increment_by, minval, maxval and startval may be passed as a value
+ * for which SEQVAL_IS_VALID() is false to request the default:
+ *	increment_by	1
+ *	minval/maxval	the SEQ_DEF_* limits for the sequence direction
+ *	startval		the minimum for an ascending sequence, the maximum for a
+ *					descending one
+ *
+ * Returns 0 on success or the error code from seq_add_seqinfo() (EEXIST when
+ * the key is already in use). Inconsistent limits are reported with
+ * ereport(ERROR); the partially built structure is freed first so that a
+ * rejected request does not leak memory.
+ */
+int
+GTM_SeqOpen(GTM_SequenceKey seqkey,
+			GTM_Sequence increment_by,
+			GTM_Sequence minval,
+			GTM_Sequence maxval,
+			GTM_Sequence startval,
+			bool cycle)
+{
+	GTM_SeqInfo *seqinfo = NULL;
+	int			errcode = 0;
+
+	seqinfo = (GTM_SeqInfo *) palloc(sizeof (GTM_SeqInfo));
+
+	if (seqinfo == NULL)
+		ereport(ERROR, (ENOMEM, errmsg("Out of memory")));
+
+	GTM_RWLockInit(&seqinfo->gs_lock);
+
+	seqinfo->gs_ref_count = 0;
+	seqinfo->gs_key = seq_copy_key(seqkey);
+	seqinfo->gs_state = SEQ_STATE_ACTIVE;
+	seqinfo->gs_called = false;
+
+	/*
+	 * Set the increment. Default is 1
+	 */
+	if (SEQVAL_IS_VALID(increment_by))
+		seqinfo->gs_increment_by = increment_by;
+	else
+		seqinfo->gs_increment_by = 1;
+
+	/*
+	 * If minval is specified, set the minvalue to the given minval,
+	 * otherwise set to the defaults
+	 */
+	if (SEQVAL_IS_VALID(minval))
+		seqinfo->gs_min_value = minval;
+	else if (SEQ_IS_ASCENDING(seqinfo))
+		seqinfo->gs_min_value = SEQ_DEF_MIN_SEQVAL_ASCEND;
+	else
+		seqinfo->gs_min_value = SEQ_DEF_MIN_SEQVAL_DESCEND;
+
+	/*
+	 * If maxval is specified, set the maxvalue to the given maxval, otherwise
+	 * set to the defaults depending on whether the sequence is ascending or
+	 * descending. Also do some basic constraint checks
+	 */
+	if (SEQVAL_IS_VALID(maxval))
+	{
+		if (maxval < seqinfo->gs_min_value)
+		{
+			seq_discard_seqinfo(seqinfo);
+			ereport(ERROR,
+					(ERANGE,
+					 errmsg("Max value must be greater than min value")));
+		}
+		seqinfo->gs_max_value = maxval;
+	}
+	else if (SEQ_IS_ASCENDING(seqinfo))
+		seqinfo->gs_max_value = SEQ_DEF_MAX_SEQVAL_ASCEND;
+	else
+		seqinfo->gs_max_value = SEQ_DEF_MAX_SEQVAL_DESCEND;
+
+	/*
+	 * Set the startval if specified. Do some basic checks like startval must
+	 * be in-between min and max values
+	 */
+	if (SEQVAL_IS_VALID(startval))
+	{
+		if (startval < seqinfo->gs_min_value)
+		{
+			seq_discard_seqinfo(seqinfo);
+			ereport(ERROR,
+					(ERANGE,
+					 errmsg("Start value must be greater than or equal to the min value")));
+		}
+
+		if (startval > seqinfo->gs_max_value)
+		{
+			seq_discard_seqinfo(seqinfo);
+			ereport(ERROR,
+					(ERANGE,
+					 errmsg("Start value must be less than or equal to the max value")));
+		}
+
+		seqinfo->gs_init_value = seqinfo->gs_value = startval;
+	}
+	else if (SEQ_IS_ASCENDING(seqinfo))
+	{
+		/* An ascending sequence starts at its configured minimum ... */
+		seqinfo->gs_init_value = seqinfo->gs_value = seqinfo->gs_min_value;
+	}
+	else
+	{
+		/*
+		 * ... and a descending one at its configured maximum. Starting it at
+		 * the minimum would leave it exhausted after the first value.
+		 */
+		seqinfo->gs_init_value = seqinfo->gs_value = seqinfo->gs_max_value;
+	}
+
+	/*
+	 * Should we wrap around ?
+	 */
+	seqinfo->gs_cycle = cycle;
+
+	if ((errcode = seq_add_seqinfo(seqinfo)))
+		seq_discard_seqinfo(seqinfo);
+
+	return errcode;
+}
+
+/*
+ * Recreate a sequence from previously saved state.
+ *
+ * Every attribute is taken verbatim from the arguments; no defaults are
+ * applied and no range checks are made. Returns 0 on success or the error
+ * code from seq_add_seqinfo(), in which case the new structure is freed.
+ */
+static int
+GTM_SeqRestore(GTM_SequenceKey seqkey,
+			   GTM_Sequence increment_by,
+			   GTM_Sequence minval,
+			   GTM_Sequence maxval,
+			   GTM_Sequence startval,
+			   GTM_Sequence curval,
+			   int32 state,
+			   bool cycle,
+			   bool called)
+{
+	int			rc;
+	GTM_SeqInfo *restored = (GTM_SeqInfo *) palloc(sizeof (GTM_SeqInfo));
+
+	if (restored == NULL)
+		ereport(ERROR, (ENOMEM, errmsg("Out of memory")));
+
+	GTM_RWLockInit(&restored->gs_lock);
+
+	/* Bookkeeping */
+	restored->gs_ref_count = 0;
+	restored->gs_key = seq_copy_key(seqkey);
+	restored->gs_state = state;
+	restored->gs_called = called;
+
+	/* Sequence definition */
+	restored->gs_increment_by = increment_by;
+	restored->gs_min_value = minval;
+	restored->gs_max_value = maxval;
+	restored->gs_cycle = cycle;		/* should we wrap around ? */
+
+	/* Start and current position */
+	restored->gs_init_value = startval;
+	restored->gs_value = curval;
+
+	rc = seq_add_seqinfo(restored);
+	if (rc != 0)
+	{
+		GTM_RWLockDestroy(&restored->gs_lock);
+		pfree(restored->gs_key);
+		pfree(restored);
+	}
+	return rc;
+}
+/*
+ * Destroy the given sequence.
+ *
+ * Returns EINVAL if no active sequence with this key exists, 0 otherwise.
+ *
+ * If other references to the sequence are still outstanding,
+ * seq_remove_seqinfo() only marks it SEQ_STATE_DELETED and reports EBUSY.
+ * The structure must not be freed in that case because the other holders
+ * still use it; only our own reference is dropped and the unlinking is left
+ * to whoever calls seq_release_seqinfo() last.
+ */
+int
+GTM_SeqClose(GTM_SequenceKey seqkey)
+{
+	GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey);
+
+	if (seqinfo == NULL)
+		return EINVAL;
+
+	if (seq_remove_seqinfo(seqinfo) != 0)
+	{
+		/* Still in use elsewhere: give up our reference, but do not free */
+		seq_release_seqinfo(seqinfo);
+		return 0;
+	}
+
+	/* We held the only reference and the structure is unlinked now */
+	GTM_RWLockDestroy(&seqinfo->gs_lock);
+	pfree(seqinfo->gs_key);
+	pfree(seqinfo);
+	return 0;
+}
+
+/*
+ * Get current value for the sequence without incrementing it.
+ *
+ * Returns InvalidSequenceValue (after logging) if no active sequence with
+ * the given key exists. An errno code must not be returned here: the result
+ * is a sequence value, and the caller tells failure from success with
+ * SEQVAL_IS_VALID().
+ */
+GTM_Sequence
+GTM_SeqGetCurrent(GTM_SequenceKey seqkey)
+{
+	GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey);
+	GTM_Sequence value;
+
+	if (seqinfo == NULL)
+	{
+		ereport(LOG,
+				(EINVAL,
+				 errmsg("The sequence with the given key does not exist")));
+		return InvalidSequenceValue;
+	}
+
+	GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE);
+
+	/*
+	 * If this is the first call to the sequence, set the value to the start
+	 * value and mark the sequence as 'called'
+	 */
+	if (!SEQ_IS_CALLED(seqinfo))
+	{
+		seqinfo->gs_value = seqinfo->gs_init_value;
+		seqinfo->gs_called = true;
+	}
+	value = seqinfo->gs_value;
+	GTM_RWLockRelease(&seqinfo->gs_lock);
+	seq_release_seqinfo(seqinfo);
+	return value;
+}
+
+/*
+ * Get next value for the sequence.
+ *
+ * The first call on a sequence returns its start value; later calls advance
+ * it by the increment. When the next step would pass the limit, a cycling
+ * sequence wraps to the opposite limit and a non-cycling one fails.
+ *
+ * Returns InvalidSequenceValue (after logging) if the sequence does not
+ * exist or has run out of values. An errno code must not be returned here:
+ * the caller tells failure from success with SEQVAL_IS_VALID().
+ */
+GTM_Sequence
+GTM_SeqGetNext(GTM_SequenceKey seqkey)
+{
+	GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey);
+	GTM_Sequence value;
+
+	if (seqinfo == NULL)
+	{
+		ereport(LOG,
+				(EINVAL,
+				 errmsg("The sequence with the given key does not exist")));
+		return InvalidSequenceValue;
+	}
+
+	GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE);
+
+	/*
+	 * If the sequence is called for the first time, initialize the value and
+	 * return the start value
+	 */
+	if (!SEQ_IS_CALLED(seqinfo))
+	{
+		value = seqinfo->gs_value = seqinfo->gs_init_value;
+		seqinfo->gs_called = true;
+		GTM_RWLockRelease(&seqinfo->gs_lock);
+		seq_release_seqinfo(seqinfo);
+		return value;
+	}
+
+	if (SEQ_IS_ASCENDING(seqinfo))
+	{
+		/*
+		 * Check if the sequence is about to wrap-around. If the sequence does
+		 * not support wrap-around, log an error and return
+		 * InvalidSequenceValue, otherwise wrap around the sequence and reset
+		 * it to the min value.
+		 */
+		if (seqinfo->gs_max_value - seqinfo->gs_increment_by >= seqinfo->gs_value)
+			value = seqinfo->gs_value = seqinfo->gs_value + seqinfo->gs_increment_by;
+		else if (SEQ_IS_CYCLE(seqinfo))
+			value = seqinfo->gs_value = seqinfo->gs_min_value;
+		else
+		{
+			GTM_RWLockRelease(&seqinfo->gs_lock);
+			seq_release_seqinfo(seqinfo);
+			ereport(LOG,
+					(ERANGE,
+					 errmsg("Sequence reached maximum value")));
+			return InvalidSequenceValue;
+		}
+	}
+	else
+	{
+		/*
+		 * Check if the sequence is about to wrap-around. If the sequence does
+		 * not support wrap-around, log an error and return
+		 * InvalidSequenceValue, otherwise wrap around the sequence and reset
+		 * it to the max value.
+		 *
+		 * Note: The gs_increment_by is a signed integer and is negative for
+		 * descending sequences. So we don't need special handling below
+		 */
+		if (seqinfo->gs_min_value - seqinfo->gs_increment_by <= seqinfo->gs_value)
+			value = seqinfo->gs_value = seqinfo->gs_value + seqinfo->gs_increment_by;
+		else if (SEQ_IS_CYCLE(seqinfo))
+			value = seqinfo->gs_value = seqinfo->gs_max_value;
+		else
+		{
+			GTM_RWLockRelease(&seqinfo->gs_lock);
+			seq_release_seqinfo(seqinfo);
+			ereport(LOG,
+					(ERANGE,
+					 errmsg("Sequence reached minimum value")));
+			return InvalidSequenceValue;
+		}
+
+	}
+	GTM_RWLockRelease(&seqinfo->gs_lock);
+	seq_release_seqinfo(seqinfo);
+	return value;
+}
+
+/*
+ * Reset the sequence: its current value is set back to its start value.
+ *
+ * Returns 0 on success, or EINVAL (after logging) if no active sequence with
+ * the given key exists.
+ */
+int
+GTM_SeqReset(GTM_SequenceKey seqkey)
+{
+	GTM_SeqInfo *target = seq_find_seqinfo(seqkey);
+
+	if (target != NULL)
+	{
+		GTM_RWLockAcquire(&target->gs_lock, GTM_LOCKMODE_WRITE);
+		target->gs_value = target->gs_init_value;
+		GTM_RWLockRelease(&target->gs_lock);
+
+		/* Drop the reference taken by seq_find_seqinfo() */
+		seq_release_seqinfo(target);
+		return 0;
+	}
+
+	ereport(LOG,
+			(EINVAL,
+			 errmsg("The sequence with the given key does not exist")));
+	return EINVAL;
+}
+
+/*
+ * Prepare the sequence hash table for use: every bucket starts with an
+ * empty list and a freshly initialized lock.
+ */
+void
+GTM_InitSeqManager(void)
+{
+	GTM_SeqInfoHashBucket *bucket = GTMSequences;
+	GTM_SeqInfoHashBucket *end = GTMSequences + SEQ_HASH_TABLE_SIZE;
+
+	while (bucket < end)
+	{
+		bucket->shb_list = NIL;
+		GTM_RWLockInit(&bucket->shb_lock);
+		bucket++;
+	}
+}
+
+/*
+ * Process MSG_SEQUENCE_INIT message
+ *
+ * Reads the sequence key and its parameters (increment, min, max, start,
+ * cycle flag) from the message, creates the sequence and answers with a
+ * SEQUENCE_INIT_RESULT message that echoes the key. Raises an ERROR carrying
+ * the code returned by GTM_SeqOpen() if the sequence cannot be created.
+ */
+void
+ProcessSequenceInitCommand(Port *myport, StringInfo message)
+{
+	GTM_SequenceKeyData seqkey;
+	GTM_Sequence increment, minval, maxval, startval;
+	bool		cycle;
+	StringInfoData buf;
+	int			errcode;
+	MemoryContext oldContext;
+
+	/*
+	 * Get the sequence key
+	 */
+	seqkey.gsk_keylen = pq_getmsgint(message, sizeof (seqkey.gsk_keylen));
+	seqkey.gsk_key = (char *)pq_getmsgbytes(message, seqkey.gsk_keylen);
+
+	/*
+	 * Read various sequence parameters
+	 */
+	memcpy(&increment, pq_getmsgbytes(message, sizeof (GTM_Sequence)),
+		   sizeof (GTM_Sequence));
+	memcpy(&minval, pq_getmsgbytes(message, sizeof (GTM_Sequence)),
+		   sizeof (GTM_Sequence));
+	memcpy(&maxval, pq_getmsgbytes(message, sizeof (GTM_Sequence)),
+		   sizeof (GTM_Sequence));
+	memcpy(&startval, pq_getmsgbytes(message, sizeof (GTM_Sequence)),
+		   sizeof (GTM_Sequence));
+
+	cycle = pq_getmsgbyte(message);
+
+
+	/*
+	 * We must use the TopMostMemoryContext because the sequence information is
+	 * not bound to a thread and can outlive any of the thread specific
+	 * contexts.
+	 */
+	oldContext = MemoryContextSwitchTo(TopMostMemoryContext);
+
+	/* Keep the result: it is the code reported below on failure */
+	errcode = GTM_SeqOpen(&seqkey, increment, minval, maxval, startval, cycle);
+
+	/* Switch back before a possible error so the context change does not stick */
+	MemoryContextSwitchTo(oldContext);
+
+	if (errcode)
+		ereport(ERROR,
+				(errcode,
+				 errmsg("Failed to open a new sequence")));
+
+	pq_getmsgend(message);
+
+	/*
+	 * Send a SUCCESS message back to the client
+	 */
+	pq_beginmessage(&buf, 'S');
+	pq_sendint(&buf, SEQUENCE_INIT_RESULT, 4);
+	if (myport->is_proxy)
+	{
+		GTM_ProxyMsgHeader proxyhdr;
+		proxyhdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+	}
+	pq_sendint(&buf, seqkey.gsk_keylen, 4);
+	pq_sendbytes(&buf, seqkey.gsk_key, seqkey.gsk_keylen);
+	pq_endmessage(myport, &buf);
+
+	if (!myport->is_proxy)
+		pq_flush(myport);
+}
+
+/*
+ * Process MSG_SEQUENCE_GET_CURRENT message
+ *
+ * Looks up the current value of the sequence named in the message and
+ * answers with a SEQUENCE_GET_CURRENT_RESULT message carrying the key
+ * followed by the value. Raises an ERROR if no valid value is available.
+ */
+void
+ProcessSequenceGetCurrentCommand(Port *myport, StringInfo message)
+{
+	GTM_SequenceKeyData key;
+	GTM_Sequence current;
+	StringInfoData reply;
+
+	/* The key arrives as a length word followed by that many bytes */
+	key.gsk_keylen = pq_getmsgint(message, sizeof (key.gsk_keylen));
+	key.gsk_key = (char *) pq_getmsgbytes(message, key.gsk_keylen);
+
+	current = GTM_SeqGetCurrent(&key);
+	if (!SEQVAL_IS_VALID(current))
+		ereport(ERROR,
+				(ERANGE,
+				 errmsg("Can not get current value of the sequence")));
+
+	pq_beginmessage(&reply, 'S');
+	pq_sendint(&reply, SEQUENCE_GET_CURRENT_RESULT, 4);
+
+	/* Replies that travel through a proxy carry the connection id */
+	if (myport->is_proxy)
+	{
+		GTM_ProxyMsgHeader hdr;
+
+		hdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&reply, (char *) &hdr, sizeof (GTM_ProxyMsgHeader));
+	}
+
+	pq_sendint(&reply, key.gsk_keylen, 4);
+	pq_sendbytes(&reply, key.gsk_key, key.gsk_keylen);
+	pq_sendbytes(&reply, (char *) &current, sizeof (GTM_Sequence));
+	pq_endmessage(myport, &reply);
+
+	/* Only direct connections are flushed here */
+	if (myport->is_proxy)
+		return;
+	pq_flush(myport);
+}
+
+/*
+ * Process MSG_SEQUENCE_GET_NEXT message
+ *
+ * Advances the sequence named in the message and answers with a
+ * SEQUENCE_GET_NEXT_RESULT message carrying the key followed by the new
+ * value. Raises an ERROR if no valid value could be obtained.
+ */
+void
+ProcessSequenceGetNextCommand(Port *myport, StringInfo message)
+{
+	GTM_SequenceKeyData seqkey;
+	StringInfoData buf;
+	GTM_Sequence seqval;
+
+	seqkey.gsk_keylen = pq_getmsgint(message, sizeof (seqkey.gsk_keylen));
+	seqkey.gsk_key = (char *)pq_getmsgbytes(message, seqkey.gsk_keylen);
+
+	seqval = GTM_SeqGetNext(&seqkey);
+	if (!SEQVAL_IS_VALID(seqval))
+		ereport(ERROR,
+				(ERANGE,
+				 errmsg("Can not get next value of the sequence")));
+
+	pq_beginmessage(&buf, 'S');
+	pq_sendint(&buf, SEQUENCE_GET_NEXT_RESULT, 4);
+	if (myport->is_proxy)
+	{
+		GTM_ProxyMsgHeader proxyhdr;
+		proxyhdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+	}
+	pq_sendint(&buf, seqkey.gsk_keylen, 4);
+	pq_sendbytes(&buf, seqkey.gsk_key, seqkey.gsk_keylen);
+	pq_sendbytes(&buf, (char *)&seqval, sizeof (GTM_Sequence));
+	pq_endmessage(myport, &buf);
+
+	if (!myport->is_proxy)
+		pq_flush(myport);
+}
+
+/*
+ * Process MSG_SEQUENCE_RESET message
+ *
+ * Resets the sequence named in the message and answers with a
+ * SEQUENCE_RESET_RESULT message that echoes the key. Raises an ERROR
+ * carrying the code returned by GTM_SeqReset() on failure.
+ */
+void
+ProcessSequenceResetCommand(Port *myport, StringInfo message)
+{
+	GTM_SequenceKeyData key;
+	StringInfoData reply;
+	int			errcode;
+
+	/* The key arrives as a length word followed by that many bytes */
+	key.gsk_keylen = pq_getmsgint(message, sizeof (key.gsk_keylen));
+	key.gsk_key = (char *) pq_getmsgbytes(message, key.gsk_keylen);
+
+	errcode = GTM_SeqReset(&key);
+	if (errcode != 0)
+		ereport(ERROR,
+				(errcode,
+				 errmsg("Can not reset the sequence")));
+
+	pq_beginmessage(&reply, 'S');
+	pq_sendint(&reply, SEQUENCE_RESET_RESULT, 4);
+
+	/* Replies that travel through a proxy carry the connection id */
+	if (myport->is_proxy)
+	{
+		GTM_ProxyMsgHeader hdr;
+
+		hdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&reply, (char *) &hdr, sizeof (GTM_ProxyMsgHeader));
+	}
+
+	pq_sendint(&reply, key.gsk_keylen, 4);
+	pq_sendbytes(&reply, key.gsk_key, key.gsk_keylen);
+	pq_endmessage(myport, &reply);
+
+	/* Only direct connections are flushed here */
+	if (myport->is_proxy)
+		return;
+	pq_flush(myport);
+}
+
+/*
+ * Process MSG_SEQUENCE_CLOSE message
+ *
+ * Destroys the sequence named in the message and answers with a
+ * SEQUENCE_CLOSE_RESULT message that echoes the key. Raises an ERROR
+ * carrying the code returned by GTM_SeqClose() on failure.
+ */
+void
+ProcessSequenceCloseCommand(Port *myport, StringInfo message)
+{
+	GTM_SequenceKeyData key;
+	StringInfoData reply;
+	int			errcode;
+
+	/* The key arrives as a length word followed by that many bytes */
+	key.gsk_keylen = pq_getmsgint(message, sizeof (key.gsk_keylen));
+	key.gsk_key = (char *) pq_getmsgbytes(message, key.gsk_keylen);
+
+	errcode = GTM_SeqClose(&key);
+	if (errcode != 0)
+		ereport(ERROR,
+				(errcode,
+				 errmsg("Can not close the sequence")));
+
+	pq_beginmessage(&reply, 'S');
+	pq_sendint(&reply, SEQUENCE_CLOSE_RESULT, 4);
+
+	/* Replies that travel through a proxy carry the connection id */
+	if (myport->is_proxy)
+	{
+		GTM_ProxyMsgHeader hdr;
+
+		hdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&reply, (char *) &hdr, sizeof (GTM_ProxyMsgHeader));
+	}
+
+	pq_sendint(&reply, key.gsk_keylen, 4);
+	pq_sendbytes(&reply, key.gsk_key, key.gsk_keylen);
+	pq_endmessage(myport, &reply);
+
+	/* Only direct connections are flushed here */
+	if (myport->is_proxy)
+		return;
+	pq_flush(myport);
+}
+
+/*
+ * Write len bytes to fd; true only if all of them were written.
+ */
+static bool
+seq_write_chunk(int fd, const void *data, size_t len)
+{
+	return write(fd, data, len) == (ssize_t) len;
+}
+
+/*
+ * Save every sequence that is not marked deleted to the control file.
+ *
+ * Each record is bracketed by SeqStartMagic and SeqEndMagic and holds, in
+ * order: key length, key bytes, current value, start value, increment,
+ * min value, max value, cycle flag, called flag and state.
+ * GTM_RestoreSeqInfo() reads the fields back in the same order.
+ *
+ * Saving stops at the first failed write, which is logged; records following
+ * a damaged one could not be read back anyway.
+ */
+void
+GTM_SaveSeqInfo(int ctlfd)
+{
+	GTM_SeqInfoHashBucket *bucket;
+	ListCell   *elem;
+	GTM_SeqInfo *seqinfo = NULL;
+	int			hash;
+	bool		ok = true;
+
+	for (hash = 0; ok && hash < SEQ_HASH_TABLE_SIZE; hash++)
+	{
+		bucket = &GTMSequences[hash];
+
+		GTM_RWLockAcquire(&bucket->shb_lock, GTM_LOCKMODE_READ);
+
+		foreach(elem, bucket->shb_list)
+		{
+			seqinfo = (GTM_SeqInfo *) lfirst(elem);
+			if (seqinfo == NULL)
+				break;
+
+			GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_READ);
+
+			/* Look at the state only while holding the sequence lock */
+			if (seqinfo->gs_state == SEQ_STATE_DELETED)
+			{
+				GTM_RWLockRelease(&seqinfo->gs_lock);
+				continue;
+			}
+
+			ok = seq_write_chunk(ctlfd, &SeqStartMagic, sizeof (SeqStartMagic)) &&
+				seq_write_chunk(ctlfd, &seqinfo->gs_key->gsk_keylen, sizeof (uint32)) &&
+				seq_write_chunk(ctlfd, seqinfo->gs_key->gsk_key, seqinfo->gs_key->gsk_keylen) &&
+				seq_write_chunk(ctlfd, &seqinfo->gs_value, sizeof (GTM_Sequence)) &&
+				seq_write_chunk(ctlfd, &seqinfo->gs_init_value, sizeof (GTM_Sequence)) &&
+				seq_write_chunk(ctlfd, &seqinfo->gs_increment_by, sizeof (GTM_Sequence)) &&
+				seq_write_chunk(ctlfd, &seqinfo->gs_min_value, sizeof (GTM_Sequence)) &&
+				seq_write_chunk(ctlfd, &seqinfo->gs_max_value, sizeof (GTM_Sequence)) &&
+				seq_write_chunk(ctlfd, &seqinfo->gs_cycle, sizeof (bool)) &&
+				seq_write_chunk(ctlfd, &seqinfo->gs_called, sizeof (bool)) &&
+				seq_write_chunk(ctlfd, &seqinfo->gs_state, sizeof (int32)) &&
+				seq_write_chunk(ctlfd, &SeqEndMagic, sizeof (SeqEndMagic));
+
+			GTM_RWLockRelease(&seqinfo->gs_lock);
+
+			if (!ok)
+				break;
+		}
+
+		GTM_RWLockRelease(&bucket->shb_lock);
+	}
+
+	if (!ok)
+		elog(LOG, "Failed to write sequence information to the control file");
+}
+
+/*
+ * Read len bytes from fd; true only if all of them were read.
+ */
+static bool
+seq_read_chunk(int fd, void *data, size_t len)
+{
+	return read(fd, data, len) == (ssize_t) len;
+}
+
+/*
+ * Recreate the sequences saved by GTM_SaveSeqInfo() from the control file.
+ *
+ * Records are read until the file ends. Reading stops at the first record
+ * that does not begin with SeqStartMagic, is truncated, or does not end with
+ * SeqEndMagic; a damaged record is never passed on to GTM_SeqRestore().
+ * Does nothing when ctlfd is -1.
+ */
+void
+GTM_RestoreSeqInfo(int ctlfd)
+{
+	int			magic;
+
+	if (ctlfd == -1)
+		return;
+
+	while (seq_read_chunk(ctlfd, &magic, sizeof (SeqStartMagic)))
+	{
+		GTM_SequenceKeyData seqkey;
+		GTM_Sequence increment_by;
+		GTM_Sequence minval;
+		GTM_Sequence maxval;
+		GTM_Sequence startval;
+		GTM_Sequence curval;
+		int32		state;
+		bool		cycle;
+		bool		called;
+		bool		complete;
+
+		if (magic != SeqStartMagic)
+		{
+			elog(LOG, "Start magic mismatch %x - %x", magic, SeqStartMagic);
+			break;
+		}
+
+		if (!seq_read_chunk(ctlfd, &seqkey.gsk_keylen, sizeof (uint32)))
+		{
+			elog(LOG, "Failed to read keylen");
+			break;
+		}
+
+		seqkey.gsk_key = palloc(seqkey.gsk_keylen);
+
+		/* Same field order as written by GTM_SaveSeqInfo() */
+		complete = seq_read_chunk(ctlfd, seqkey.gsk_key, seqkey.gsk_keylen) &&
+			seq_read_chunk(ctlfd, &curval, sizeof (GTM_Sequence)) &&
+			seq_read_chunk(ctlfd, &startval, sizeof (GTM_Sequence)) &&
+			seq_read_chunk(ctlfd, &increment_by, sizeof (GTM_Sequence)) &&
+			seq_read_chunk(ctlfd, &minval, sizeof (GTM_Sequence)) &&
+			seq_read_chunk(ctlfd, &maxval, sizeof (GTM_Sequence)) &&
+			seq_read_chunk(ctlfd, &cycle, sizeof (bool)) &&
+			seq_read_chunk(ctlfd, &called, sizeof (bool)) &&
+			seq_read_chunk(ctlfd, &state, sizeof (int32)) &&
+			seq_read_chunk(ctlfd, &magic, sizeof (SeqEndMagic));
+
+		if (!complete || magic != SeqEndMagic)
+		{
+			elog(WARNING, "Corrupted control file");
+			pfree(seqkey.gsk_key);
+			return;
+		}
+
+		GTM_SeqRestore(&seqkey, increment_by, minval, maxval, startval, curval,
+					   state, cycle, called);
+
+		/* GTM_SeqRestore() keeps its own copy of the key */
+		pfree(seqkey.gsk_key);
+	}
+}
diff --git a/src/gtm/main/gtm_snap.c b/src/gtm/main/gtm_snap.c
new file mode 100644
index 0000000000..5c9b4b2ae5
--- /dev/null
+++ b/src/gtm/main/gtm_snap.c
@@ -0,0 +1,466 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_snap.c
+ * Snapshot handling on GTM
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "gtm/gtm_c.h"
+#include "gtm/elog.h"
+#include "gtm/palloc.h"
+#include "gtm/gtm.h"
+#include "gtm/gtm_txn.h"
+#include "gtm/assert.h"
+#include "gtm/stringinfo.h"
+#include "gtm/libpq.h"
+#include "gtm/pqformat.h"
+#include "gtm/gtm_msg.h"
+
+
+/*
+ * Copy the contents of snapshot src into dst, allocating dst's xip array on
+ * first use. Must be called with gt_TransArrayLock held; the lock is given
+ * up before an out-of-memory error is raised.
+ */
+static void
+gtm_copy_snapshot(GTM_Snapshot dst, GTM_Snapshot src)
+{
+	if (dst->sn_xip == NULL)
+	{
+		/*
+		 * First call for this snapshot
+		 */
+		dst->sn_xip = (GlobalTransactionId *)
+			palloc(GTM_MAX_GLOBAL_TRANSACTIONS * sizeof(GlobalTransactionId));
+		if (dst->sn_xip == NULL)
+		{
+			GTM_RWLockRelease(&GTMTransactions.gt_TransArrayLock);
+			ereport(ERROR, (ENOMEM, errmsg("out of memory")));
+		}
+	}
+	dst->sn_xmin = src->sn_xmin;
+	dst->sn_xmax = src->sn_xmax;
+	dst->sn_xcnt = src->sn_xcnt;
+	dst->sn_recent_global_xmin = src->sn_recent_global_xmin;
+	memcpy(dst->sn_xip, src->sn_xip,
+		   sizeof (GlobalTransactionId) * src->sn_xcnt);
+}
+
+/*
+ * Get snapshot for the given transactions. The snapshot is built in the
+ * snapshot area of the first valid transaction in the array and then copied
+ * to the other transactions. A serializable transaction whose snapshot is
+ * already set may only be passed on its own (txn_count == 1).
+ *
+ * The returned snapshot includes xmin (lowest still-running xact ID),
+ * xmax (highest completed xact ID + 1), and a list of running xact IDs
+ * in the range xmin <= xid < xmax. It is used as follows:
+ *		All xact IDs < xmin are considered finished.
+ *		All xact IDs >= xmax are considered still running.
+ *		For an xact ID xmin <= xid < xmax, consult list to see whether
+ *		it is considered running or not.
+ * This ensures that the set of transactions seen as "running" by the
+ * current xact will not change after it takes the snapshot.
+ *
+ * All running top-level XIDs are included in the snapshot.
+ *
+ * We also update GTMTransactions.gt_recent_global_xmin with the global xmin
+ * (the oldest xmin across all running transactions).
+ *
+ * status must point to txn_count ints; an entry is set to STATUS_ERROR for
+ * every handle that does not map to a transaction and to 0 otherwise.
+ * Returns NULL if none of the handles is valid.
+ *
+ * NOTE(review): the snapshot area of the first valid transaction is always
+ * overwritten, even if that transaction is serializable and already has its
+ * snapshot set - confirm that this is intended.
+ */
+GTM_Snapshot
+GTM_GetTransactionSnapshot(GTM_TransactionHandle handle[], int txn_count, int *status)
+{
+	GlobalTransactionId xmin;
+	GlobalTransactionId xmax;
+	GlobalTransactionId globalxmin;
+	int			count = 0;
+	ListCell   *elem = NULL;
+	int			ii;
+
+	/*
+	 * Instead of allocating memory for a snapshot, we use the snapshot of the
+	 * first transaction in the given array. The same snapshot will later be
+	 * copied to other transaction info structures.
+	 */
+	GTM_TransactionInfo *mygtm_txninfo = NULL;
+	GTM_Snapshot snapshot = NULL;
+
+	memset(status, 0, sizeof (int) * txn_count);
+
+	for (ii = 0; ii < txn_count; ii++)
+	{
+		mygtm_txninfo = GTM_HandleToTransactionInfo(handle[ii]);
+
+		/*
+		 * If the transaction does not exist, just mark the status field with
+		 * a STATUS_ERROR code
+		 */
+		if (mygtm_txninfo == NULL)
+			status[ii] = STATUS_ERROR;
+		else if (snapshot == NULL)
+			snapshot = &mygtm_txninfo->gti_current_snapshot;
+	}
+
+	/*
+	 * If no valid transaction exists in the array, send an error message back.
+	 * Otherwise, we should still get the snapshot and send it back. The
+	 * invalid transaction ids are marked separately in the status array.
+	 */
+	if (snapshot == NULL)
+		return NULL;
+
+	if (snapshot->sn_xip == NULL)
+	{
+		/*
+		 * First call for this snapshot
+		 */
+		snapshot->sn_xip = (GlobalTransactionId *)
+			palloc(GTM_MAX_GLOBAL_TRANSACTIONS * sizeof(GlobalTransactionId));
+		if (snapshot->sn_xip == NULL)
+			ereport(ERROR,
+					(ENOMEM,
+					 errmsg("out of memory")));
+	}
+
+	/*
+	 * The transaction array lock is taken in read mode here, even though
+	 * gti_xmin of the given transactions is set further down.
+	 */
+	GTM_RWLockAcquire(&GTMTransactions.gt_TransArrayLock, GTM_LOCKMODE_READ);
+
+	/* xmax is always latestCompletedXid + 1 */
+	xmax = GTMTransactions.gt_latestCompletedXid;
+	Assert(GlobalTransactionIdIsNormal(xmax));
+	GlobalTransactionIdAdvance(xmax);
+
+	/* initialize xmin calculation with xmax */
+	globalxmin = xmin = xmax;
+
+	/*
+	 * Spin over transaction list checking xid and xmin. The goal is to
+	 * gather all active xids and find the lowest xmin
+	 */
+	foreach(elem, GTMTransactions.gt_open_transactions)
+	{
+		volatile GTM_TransactionInfo *gtm_txninfo = (GTM_TransactionInfo *)lfirst(elem);
+		GlobalTransactionId xid;
+
+		/* Don't take into account LAZY VACUUMs */
+		if (gtm_txninfo->gti_vacuum)
+			continue;
+
+		/* Update globalxmin to be the smallest valid xmin */
+		xid = gtm_txninfo->gti_xmin;	/* fetch just once */
+		if (GlobalTransactionIdIsNormal(xid) &&
+			GlobalTransactionIdPrecedes(xid, globalxmin))
+			globalxmin = xid;
+
+		/* Fetch xid just once */
+		xid = gtm_txninfo->gti_gxid;
+
+		/*
+		 * If the transaction has been assigned an xid < xmax we add it to the
+		 * snapshot, and update xmin if necessary. There's no need to store
+		 * XIDs >= xmax, since we'll treat them as running anyway.
+		 */
+		if (GlobalTransactionIdIsNormal(xid))
+		{
+			/*
+			 * Unlike Postgres, we include the GXID of the current transaction
+			 * as well in the snapshot. This is necessary because the same
+			 * snapshot is shared by multiple backends through GTM proxy and
+			 * the GXID will vary for each backend.
+			 *
+			 * XXX We should confirm that this does not have any adverse effect
+			 * on the MVCC visibility and check if any changes are related to
+			 * the MVCC checks because of the change
+			 */
+			if (GlobalTransactionIdFollowsOrEquals(xid, xmax))
+				continue;
+			if (GlobalTransactionIdPrecedes(xid, xmin))
+				xmin = xid;
+			snapshot->sn_xip[count++] = xid;
+		}
+	}
+
+	/*
+	 * Update globalxmin to include the xids collected above as well.
+	 */
+	if (GlobalTransactionIdPrecedes(xmin, globalxmin))
+		globalxmin = xmin;
+
+	GTMTransactions.gt_recent_global_xmin = globalxmin;
+
+	snapshot->sn_xmin = xmin;
+	snapshot->sn_xmax = xmax;
+	snapshot->sn_xcnt = count;
+	snapshot->sn_recent_global_xmin = globalxmin;
+
+	/*
+	 * Now, before the transaction array lock is released, set the xmin in the
+	 * txninfo structures of all the transactions.
+	 */
+	for (ii = 0; ii < txn_count; ii++)
+	{
+		GTM_Snapshot mysnap = NULL;
+
+		/*
+		 * We have already gone through all the transaction handles above and
+		 * marked the invalid handles with STATUS_ERROR
+		 */
+		if (status[ii] == STATUS_ERROR)
+			continue;
+
+		mygtm_txninfo = GTM_HandleToTransactionInfo(handle[ii]);
+		mysnap = &mygtm_txninfo->gti_current_snapshot;
+
+		if (GTM_IsTransSerializable(mygtm_txninfo))
+		{
+			if ((mygtm_txninfo->gti_snapshot_set) && (txn_count > 1))
+			{
+				/* Do not leave the array locked when bailing out */
+				GTM_RWLockRelease(&GTMTransactions.gt_TransArrayLock);
+				elog(ERROR, "Grouped snapshot can only include first snapshot in Serializable transaction");
+			}
+
+			if (!mygtm_txninfo->gti_snapshot_set)
+			{
+				/*
+				 * For the first transaction in the array, the snapshot is
+				 * already set.
+				 */
+				if (snapshot != mysnap)
+					gtm_copy_snapshot(mysnap, snapshot);
+				mygtm_txninfo->gti_snapshot_set = true;
+			}
+		}
+		else if (snapshot != mysnap)
+			gtm_copy_snapshot(mysnap, snapshot);
+
+		/* The first snapshot taken by a transaction fixes its xmin */
+		if (!GlobalTransactionIdIsValid(mygtm_txninfo->gti_xmin))
+			mygtm_txninfo->gti_xmin = xmin;
+	}
+
+	GTM_RWLockRelease(&GTMTransactions.gt_TransArrayLock);
+
+	elog(DEBUG1, "GTM_GetTransactionSnapshot: (%u:%u:%u:%u)",
+		 snapshot->sn_xmin, snapshot->sn_xmax,
+		 snapshot->sn_xcnt, snapshot->sn_recent_global_xmin);
+	return snapshot;
+}
+
+/*
+ * Process MSG_SNAPSHOT_GET command
+ *
+ * The message carries a "can be grouped" byte, an "is GXID" byte and then
+ * either a GXID or a transaction handle. If get_gxid is true, a GXID is
+ * also obtained for the transaction before the snapshot is taken.
+ *
+ * The reply carries, in order: the GXID, the number of transactions
+ * (always 1 here), their status, and the snapshot's xmin, xmax, recent
+ * global xmin, xid count and xid array. When the request carried a handle
+ * and get_gxid is false, the GXID field is InvalidGlobalTransactionId.
+ */
+void
+ProcessGetSnapshotCommand(Port *myport, StringInfo message, bool get_gxid)
+{
+	StringInfoData buf;
+	GTM_TransactionHandle txn;
+	/* Always sent in the reply, so it must have a defined value on every path */
+	GlobalTransactionId gxid = InvalidGlobalTransactionId;
+	int			isgxid = 0;
+	GTM_Snapshot snapshot;
+	MemoryContext oldContext;
+	int			status;
+	int			txn_count = 1;
+
+	/*
+	 * The first byte is used by the GTM proxy to decide whether to group this
+	 * snapshot request with some other snapshot request from some other
+	 * backend. The GTM server has no use for it, but the byte still has to
+	 * be consumed.
+	 */
+	(void) pq_getmsgbyte(message);
+
+	isgxid = pq_getmsgbyte(message);
+
+	if (isgxid)
+	{
+		const char *data = NULL;
+		Assert(!get_gxid);
+		data = pq_getmsgbytes(message, sizeof (gxid));
+		if (data == NULL)
+			ereport(ERROR,
+					(EPROTO,
+					 errmsg("Message does not contain valid GXID")));
+		memcpy(&gxid, data, sizeof (gxid));
+		txn = GTM_GXIDToHandle(gxid);
+	}
+	else
+	{
+		const char *data = pq_getmsgbytes(message, sizeof (txn));
+		if (data == NULL)
+			ereport(ERROR,
+					(EPROTO,
+					 errmsg("Message does not contain valid Transaction Handle")));
+		memcpy(&txn, data, sizeof (txn));
+	}
+	pq_getmsgend(message);
+
+	if (get_gxid)
+	{
+		Assert(!isgxid);
+		gxid = GTM_GetGlobalTransactionId(txn);
+		if (gxid == InvalidGlobalTransactionId)
+			ereport(ERROR,
+					(EINVAL,
+					 errmsg("Failed to get a new transaction id")));
+	}
+
+	oldContext = MemoryContextSwitchTo(TopMostMemoryContext);
+
+	/*
+	 * Get a fresh snapshot
+	 */
+	if ((snapshot = GTM_GetTransactionSnapshot(&txn, 1, &status)) == NULL)
+		ereport(ERROR,
+				(EINVAL,
+				 errmsg("Failed to get a snapshot")));
+
+	MemoryContextSwitchTo(oldContext);
+
+	pq_beginmessage(&buf, 'S');
+	pq_sendint(&buf, get_gxid ? SNAPSHOT_GXID_GET_RESULT : SNAPSHOT_GET_RESULT, 4);
+	if (myport->is_proxy)
+	{
+		GTM_ProxyMsgHeader proxyhdr;
+		proxyhdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+	}
+	pq_sendbytes(&buf, (char *)&gxid, sizeof (GlobalTransactionId));
+	pq_sendbytes(&buf, (char *)&txn_count, sizeof(txn_count));
+	pq_sendbytes(&buf, (char *)&status, sizeof(int) * txn_count);
+	pq_sendbytes(&buf, (char *)&snapshot->sn_xmin, sizeof (GlobalTransactionId));
+	pq_sendbytes(&buf, (char *)&snapshot->sn_xmax, sizeof (GlobalTransactionId));
+	pq_sendbytes(&buf, (char *)&snapshot->sn_recent_global_xmin, sizeof (GlobalTransactionId));
+	pq_sendint(&buf, snapshot->sn_xcnt, sizeof (int));
+	pq_sendbytes(&buf, (char *)snapshot->sn_xip,
+				 sizeof(GlobalTransactionId) * snapshot->sn_xcnt);
+	pq_endmessage(myport, &buf);
+
+	if (!myport->is_proxy)
+		pq_flush(myport);
+
+	return;
+}
+
+/*
+ * Process MSG_SNAPSHOT_GET_MULTI command
+ *
+ * The request starts with a transaction count, followed by that many
+ * entries. Each entry is an "is GXID" byte and then either a GXID or a
+ * transaction handle. One snapshot is computed for the whole batch and sent
+ * back together with a status value for every transaction.
+ */
+void
+ProcessGetSnapshotCommandMulti(Port *myport, StringInfo message)
+{
+	StringInfoData buf;
+	GTM_TransactionHandle txn[GTM_MAX_GLOBAL_TRANSACTIONS];
+	GTM_Snapshot snapshot;
+	MemoryContext oldContext;
+	int txn_count;
+	int ii;
+	int status[GTM_MAX_GLOBAL_TRANSACTIONS];
+
+	txn_count = pq_getmsgint(message, sizeof (int));
+
+	/*
+	 * The count comes straight off the wire and is used to index the
+	 * fixed-size arrays above, so it must be validated before use.
+	 */
+	if (txn_count < 0 || txn_count > GTM_MAX_GLOBAL_TRANSACTIONS)
+		ereport(ERROR,
+				(EPROTO,
+				 errmsg("Message contains an invalid transaction count: %d",
+						txn_count)));
+
+	for (ii = 0; ii < txn_count; ii++)
+	{
+		if (pq_getmsgbyte(message))
+		{
+			/* Entry identifies the transaction by GXID */
+			GlobalTransactionId gxid;
+			const char *data = pq_getmsgbytes(message, sizeof (gxid));
+
+			if (data == NULL)
+				ereport(ERROR,
+						(EPROTO,
+						 errmsg("Message does not contain valid GXID")));
+			memcpy(&gxid, data, sizeof (gxid));
+			txn[ii] = GTM_GXIDToHandle(gxid);
+		}
+		else
+		{
+			/* Entry carries the transaction handle directly */
+			const char *data = pq_getmsgbytes(message, sizeof (txn[ii]));
+
+			if (data == NULL)
+				ereport(ERROR,
+						(EPROTO,
+						 errmsg("Message does not contain valid Transaction Handle")));
+			memcpy(&txn[ii], data, sizeof (txn[ii]));
+		}
+	}
+
+	pq_getmsgend(message);
+
+	oldContext = MemoryContextSwitchTo(TopMostMemoryContext);
+
+	/*
+	 * Get a fresh snapshot
+	 */
+	if ((snapshot = GTM_GetTransactionSnapshot(txn, txn_count, status)) == NULL)
+		ereport(ERROR,
+				(EINVAL,
+				 errmsg("Failed to get a snapshot")));
+
+	MemoryContextSwitchTo(oldContext);
+
+	pq_beginmessage(&buf, 'S');
+	pq_sendint(&buf, SNAPSHOT_GET_MULTI_RESULT, 4);
+	if (myport->is_proxy)
+	{
+		GTM_ProxyMsgHeader proxyhdr;
+
+		proxyhdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+	}
+	pq_sendbytes(&buf, (char *)&txn_count, sizeof(txn_count));
+	pq_sendbytes(&buf, (char *)status, sizeof(int) * txn_count);
+	pq_sendbytes(&buf, (char *)&snapshot->sn_xmin, sizeof (GlobalTransactionId));
+	pq_sendbytes(&buf, (char *)&snapshot->sn_xmax, sizeof (GlobalTransactionId));
+	pq_sendbytes(&buf, (char *)&snapshot->sn_recent_global_xmin, sizeof (GlobalTransactionId));
+	pq_sendint(&buf, snapshot->sn_xcnt, sizeof (int));
+	pq_sendbytes(&buf, (char *)snapshot->sn_xip,
+				 sizeof(GlobalTransactionId) * snapshot->sn_xcnt);
+	pq_endmessage(myport, &buf);
+
+	/* Proxy connections are not flushed here */
+	if (!myport->is_proxy)
+		pq_flush(myport);
+
+	return;
+}
+
+/*
+ * Release the XID array owned by a snapshot and clear the pointer to it.
+ * The snapshot structure itself stays allocated. A NULL snapshot, or one
+ * without an XID array, is left alone.
+ */
+void
+GTM_FreeSnapshotData(GTM_Snapshot snapshot)
+{
+	if (snapshot != NULL && snapshot->sn_xip != NULL)
+	{
+		Assert(snapshot->sn_xcnt);
+		pfree(snapshot->sn_xip);
+		snapshot->sn_xip = NULL;
+	}
+}
diff --git a/src/gtm/main/gtm_stat.c b/src/gtm/main/gtm_stat.c
new file mode 100644
index 0000000000..fac6b64c24
--- /dev/null
+++ b/src/gtm/main/gtm_stat.c
@@ -0,0 +1,37 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_stat.c
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "gtm/gtm_c.h"
+#include "gtm/gtm.h"
+
+/*
+ * Per-type counters, indexed by message type and result type respectively.
+ * They are bumped by gtm_msgstat_increment() and gtm_resultstat_increment().
+ */
+uint32 GTM_Message_Stats[MSG_MAX_MESSAGE_TYPE];
+uint32 GTM_Result_Stats[GTM_MAX_RESULT_TYPE];
+
+/*
+ * Count one more message of the given type.
+ *
+ * A type outside the bounds of GTM_Message_Stats is ignored instead of
+ * being used as an array index.
+ */
+void
+gtm_msgstat_increment(int type)
+{
+	if (type < 0 || type >= MSG_MAX_MESSAGE_TYPE)
+		return;
+
+	GTM_Message_Stats[type]++;
+}
+
+/*
+ * Count one more result of the given type.
+ *
+ * A type outside the bounds of GTM_Result_Stats is ignored instead of being
+ * used as an array index.
+ */
+void
+gtm_resultstat_increment(int type)
+{
+	if (type < 0 || type >= GTM_MAX_RESULT_TYPE)
+		return;
+
+	GTM_Result_Stats[type]++;
+}
+
+/*
+ * Print the collected statistics.
+ *
+ * Placeholder: the body is empty, so nothing is printed yet.
+ */
+void
+gtm_print_stats(void)
+{
+
+}
diff --git a/src/gtm/main/gtm_stats.c b/src/gtm/main/gtm_stats.c
new file mode 100644
index 0000000000..aba1a219fb
--- /dev/null
+++ b/src/gtm/main/gtm_stats.c
@@ -0,0 +1,23 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_stats.c
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * Aggregate traffic statistics: per-message-type receive/send counters plus
+ * total byte counts in each direction.
+ *
+ * NOTE(review): this file includes no header, so GTM_MAX_MESSAGE_TYPE is not
+ * declared here; gtm_stat.c sizes its table with MSG_MAX_MESSAGE_TYPE --
+ * confirm which constant is intended.
+ * NOTE(review): the byte totals are floats, which presumably lose precision
+ * for large counts -- confirm whether an integer type was meant.
+ */
+typedef struct GTM_Stats
+{
+ int GTM_RecvMessages[GTM_MAX_MESSAGE_TYPE];
+ int GTM_SentMessages[GTM_MAX_MESSAGE_TYPE];
+ float GTM_RecvBytes;
+ float GTM_SentBytes;
+} GTM_Stats;
+
+
diff --git a/src/gtm/main/gtm_thread.c b/src/gtm/main/gtm_thread.c
new file mode 100644
index 0000000000..61ea640ab5
--- /dev/null
+++ b/src/gtm/main/gtm_thread.c
@@ -0,0 +1,336 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_thread.c
+ * Thread handling
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <pthread.h>
+#include "gtm/gtm.h"
+#include "gtm/memutils.h"
+#include "gtm/gtm_txn.h"
+#include "gtm/libpq.h"
+
+static void *GTM_ThreadMainWrapper(void *argp);
+static void GTM_ThreadCleanup(void *argp);
+
+/* Global registry of threads; GTMThreads points at the single instance */
+GTM_Threads GTMThreadsData;
+GTM_Threads *GTMThreads = &GTMThreadsData;
+
+/*
+ * GTM_MIN_THREADS is the size the thread array gets on first allocation and
+ * GTM_MAX_THREADS caps its growth (see GTM_ThreadAdd). GTMThreadsFull is
+ * true when every slot of the current array is occupied.
+ */
+#define GTM_MIN_THREADS 32 /* Provision for minimum threads */
+#define GTM_MAX_THREADS 1024 /* Max threads allowed in the GTM */
+#define GTMThreadsFull (GTMThreads->gt_thread_count == GTMThreads->gt_array_size)
+
+/*
+ * Add the given thrinfo structure to the global array, expanding it if
+ * necessary.
+ *
+ * The array starts at GTM_MIN_THREADS entries and doubles whenever it is
+ * full, up to GTM_MAX_THREADS. The chosen slot index is stored in
+ * thrinfo->thr_localid and returned. -1 is returned if no slot could be
+ * assigned.
+ */
+int
+GTM_ThreadAdd(GTM_ThreadInfo *thrinfo)
+{
+	int ii;
+
+	GTM_RWLockAcquire(&GTMThreads->gt_lock, GTM_LOCKMODE_WRITE);
+
+	if (GTMThreadsFull)
+	{
+		uint32 newsize;
+
+		/*
+		 * TODO Optimize lock management by not holding any locks during memory
+		 * allocation
+		 */
+		if (GTMThreads->gt_array_size == GTM_MAX_THREADS)
+		{
+			/* Never report an error while still holding the array lock */
+			GTM_RWLockRelease(&GTMThreads->gt_lock);
+			elog(ERROR, "Too many threads active");
+			return -1;
+		}
+
+		if (GTMThreads->gt_array_size == 0)
+			newsize = GTM_MIN_THREADS;
+		else
+		{
+			/*
+			 * We ran out of the array size. Just double the size, bound by the
+			 * upper limit
+			 */
+			newsize = GTMThreads->gt_array_size * 2;
+		}
+
+		/* Can't have more than GTM_MAX_THREADS */
+		if (newsize > GTM_MAX_THREADS)
+			newsize = GTM_MAX_THREADS;
+
+		if (GTMThreads->gt_threads == NULL)
+			GTMThreads->gt_threads = (GTM_ThreadInfo **)palloc0(sizeof (GTM_ThreadInfo *) * newsize);
+		else
+		{
+			/* Grow by allocating a zeroed array and copying the old slots */
+			void *old_ptr = GTMThreads->gt_threads;
+
+			GTMThreads->gt_threads = (GTM_ThreadInfo **)palloc0(sizeof (GTM_ThreadInfo *) * newsize);
+			memcpy(GTMThreads->gt_threads, old_ptr,
+				   GTMThreads->gt_array_size * sizeof (GTM_ThreadInfo *));
+			pfree(old_ptr);
+		}
+
+		GTMThreads->gt_array_size = newsize;
+	}
+
+	/*
+	 * Now that we have free entries in the array, find a free slot and add the
+	 * thrinfo pointer to it.
+	 *
+	 * TODO Optimize this later by tracking few free slots and reusing them.
+	 * The free slots can be updated when a thread exits and reused when a new
+	 * thread is added to the pool.
+	 */
+	for (ii = 0; ii < GTMThreads->gt_array_size; ii++)
+	{
+		if (GTMThreads->gt_threads[ii] == NULL)
+		{
+			GTMThreads->gt_threads[ii] = thrinfo;
+			GTMThreads->gt_thread_count++;
+			break;
+		}
+	}
+
+	/*
+	 * Running off the end means the thread count and the array contents
+	 * disagree. Report failure with the -1 that callers test for rather than
+	 * handing back an index that is not a valid slot.
+	 */
+	if (ii == GTMThreads->gt_array_size)
+	{
+		GTM_RWLockRelease(&GTMThreads->gt_lock);
+		return -1;
+	}
+
+	/*
+	 * Track the slot information in the thrinfo. This is useful to quickly
+	 * find the slot given the thrinfo structure.
+	 */
+	thrinfo->thr_localid = ii;
+
+	GTM_RWLockRelease(&GTMThreads->gt_lock);
+
+	return ii;
+}
+
+/*
+ * Remove the given thrinfo structure from the global array and free it.
+ *
+ * Returns 0 once the entry has been removed. If the structure is not in the
+ * array an error is reported; -1 is returned should that report come back.
+ * The caller must not use thrinfo after a successful call.
+ */
+int
+GTM_ThreadRemove(GTM_ThreadInfo *thrinfo)
+{
+	int ii;
+
+	GTM_RWLockAcquire(&GTMThreads->gt_lock, GTM_LOCKMODE_WRITE);
+
+	for (ii = 0; ii < GTMThreads->gt_array_size; ii++)
+	{
+		if (GTMThreads->gt_threads[ii] == thrinfo)
+			break;
+	}
+
+	if (ii == GTMThreads->gt_array_size)
+	{
+		/* Never report an error while still holding the array lock */
+		GTM_RWLockRelease(&GTMThreads->gt_lock);
+		elog(ERROR, "Thread (%p) not found ", thrinfo);
+		return -1;
+	}
+
+	GTMThreads->gt_threads[ii] = NULL;
+	GTMThreads->gt_thread_count--;
+	GTM_RWLockRelease(&GTMThreads->gt_lock);
+
+	pfree(thrinfo);
+
+	return 0;
+}
+
+/*
+ * Create a new thread and assign the given connection to it.
+ *
+ * This function is responsible for setting up the various memory contexts for
+ * the thread as well as registering this thread with the Thread Manager.
+ *
+ * Upon successful creation, the thread will start running the given
+ * "startroutine". The thread information is returned to the calling process.
+ * If the thread cannot be started, everything set up here is torn down again
+ * before the error is reported.
+ */
+GTM_ThreadInfo *
+GTM_ThreadCreate(GTM_ConnectionInfo *conninfo,
+				 void *(* startroutine)(void *))
+{
+	GTM_ThreadInfo *thrinfo;
+	int err;
+
+	/*
+	 * We are still running in the context of the main thread. So the
+	 * allocation below would last as long as the main thread exists or the
+	 * memory is explicitly freed.
+	 */
+	thrinfo = (GTM_ThreadInfo *)palloc0(sizeof (GTM_ThreadInfo));
+
+	thrinfo->thr_conn = conninfo;
+	GTM_RWLockInit(&thrinfo->thr_lock);
+
+	/*
+	 * The thread status is set to GTM_THREAD_STARTING and will be changed by
+	 * the thread itself when it actually starts executing
+	 */
+	thrinfo->thr_status = GTM_THREAD_STARTING;
+
+	/*
+	 * Install the ThreadInfo structure in the global array. We do this before
+	 * starting the thread
+	 */
+	if (GTM_ThreadAdd(thrinfo) == -1)
+	{
+		/* Not registered anywhere, so nothing else references it */
+		pfree(thrinfo);
+		elog(ERROR, "Error starting a new thread");
+		return NULL;
+	}
+
+	/*
+	 * Set up memory contexts before actually starting the threads
+	 *
+	 * The TopThreadContext is a child of TopMemoryContext and it will last as
+	 * long as the main process or this thread lives
+	 *
+	 * Thread context is not shared between other threads
+	 */
+	thrinfo->thr_thread_context = AllocSetContextCreate(TopMemoryContext,
+														"TopMemoryContext",
+														ALLOCSET_DEFAULT_MINSIZE,
+														ALLOCSET_DEFAULT_INITSIZE,
+														ALLOCSET_DEFAULT_MAXSIZE,
+														false);
+
+	/*
+	 * Since the thread is not yet started, TopMemoryContext still points to
+	 * the context of the calling thread
+	 */
+	thrinfo->thr_parent_context = TopMemoryContext;
+
+	/*
+	 * Each thread gets its own ErrorContext and it is a child of ErrorContext
+	 * of the main process
+	 *
+	 * This is a thread-specific context and is not shared between other
+	 * threads
+	 */
+	thrinfo->thr_error_context = AllocSetContextCreate(ErrorContext,
+													   "ErrorContext",
+													   8 * 1024,
+													   8 * 1024,
+													   8 * 1024,
+													   false);
+
+	thrinfo->thr_startroutine = startroutine;
+
+	/*
+	 * Now start the thread. The thread will start executing the given
+	 * "startroutine". The thrinfo structure is also passed to the thread. Any
+	 * additional parameters should be passed via the thrinfo structure.
+	 */
+	err = pthread_create(&thrinfo->thr_id, NULL, GTM_ThreadMainWrapper,
+						 thrinfo);
+	if (err != 0)
+	{
+		/*
+		 * No thread exists to run the cleanup handler, so undo the setup
+		 * here: drop both contexts and unregister the structure.
+		 * GTM_ThreadRemove() also frees thrinfo.
+		 */
+		MemoryContextDelete(thrinfo->thr_error_context);
+		MemoryContextDelete(thrinfo->thr_thread_context);
+		GTM_ThreadRemove(thrinfo);
+
+		ereport(ERROR,
+				(err,
+				 errmsg("Failed to create a new thread: error %d", err)));
+		return NULL;
+	}
+
+	/* Return the thrinfo structure to the caller */
+	return thrinfo;
+}
+
+/*
+ * Exit the current thread.
+ *
+ * Placeholder: the body contains no statements, so calling this currently
+ * does nothing.
+ */
+void
+GTM_ThreadExit(void)
+{
+ /* XXX To be implemented */
+}
+
+/*
+ * Wait for the given thread to terminate.
+ *
+ * Returns whatever pthread_join() returns. The thread's exit value is
+ * collected but not passed on to the caller.
+ */
+int
+GTM_ThreadJoin(GTM_ThreadInfo *thrinfo)
+{
+	void *exit_value;
+
+	return pthread_join(thrinfo->thr_id, &exit_value);
+}
+
+/*
+ * Get thread information for the given thread, identified by the
+ * thread_id
+ *
+ * Placeholder: no lookup is performed and NULL is always returned.
+ */
+GTM_ThreadInfo *
+GTM_GetThreadInfo(GTM_ThreadID thrid)
+{
+
+ return NULL;
+}
+
+/*
+ * Cleanup routine for the thread
+ *
+ * Installed by GTM_ThreadMainWrapper() through pthread_cleanup_push(); argp
+ * is the thread's GTM_ThreadInfo. Closes and frees the connection, deletes
+ * the thread's memory contexts, removes the thread from the global array and
+ * finally clears the thread-specific info pointer.
+ */
+static void
+GTM_ThreadCleanup(void *argp)
+{
+ GTM_ThreadInfo *thrinfo = (GTM_ThreadInfo *)argp;
+
+ elog(LOG, "Cleaning up thread state");
+
+ /*
+ * Close the socket of the open connection.
+ */
+ StreamClose(thrinfo->thr_conn->con_port->sock);
+
+ /* Free the port */
+ ConnFree(thrinfo->thr_conn->con_port);
+ thrinfo->thr_conn->con_port = NULL;
+
+ /* Free the connection info structure */
+ pfree(thrinfo->thr_conn);
+ thrinfo->thr_conn = NULL;
+
+ /*
+ * Switch to the memory context of the main process so that we can free up
+ * our memory contexts easily.
+ *
+ * XXX We don't setup cleanup handlers for the main process. So this
+ * routine would never be called for the main process/thread
+ */
+ MemoryContextSwitchTo(thrinfo->thr_parent_context);
+
+ /*
+ * NOTE(review): thr_message_context is not set up by GTM_ThreadCreate();
+ * presumably the start routine creates it -- confirm it cannot still be
+ * NULL here.
+ */
+ MemoryContextDelete(thrinfo->thr_message_context);
+ thrinfo->thr_message_context = NULL;
+
+ MemoryContextDelete(thrinfo->thr_error_context);
+ thrinfo->thr_error_context = NULL;
+
+ MemoryContextDelete(thrinfo->thr_thread_context);
+ thrinfo->thr_thread_context = NULL;
+
+ /*
+ * Remove the thrinfo structure from the global array. GTM_ThreadRemove()
+ * also pfree()s the structure, so thrinfo must not be dereferenced after
+ * this call.
+ */
+ GTM_ThreadRemove(thrinfo);
+
+ /*
+ * Reset the thread-specific information. This should be done only after we
+ * are sure that memory contexts are not required
+ *
+ * Note: elog calls need memory contexts, so no elog calls beyond this
+ * point.
+ */
+ SetMyThreadInfo(NULL);
+
+ return;
+}
+
+/*
+ * A wrapper around the start routine of the thread. This helps us do any
+ * initialization and set up cleanup handlers before the main routine is
+ * started.
+ *
+ * argp is the GTM_ThreadInfo built by GTM_ThreadCreate(). The thread
+ * detaches itself, so its return value is never collected; NULL is returned.
+ */
+static void *
+GTM_ThreadMainWrapper(void *argp)
+{
+	GTM_ThreadInfo *thrinfo = (GTM_ThreadInfo *)argp;
+
+	/*
+	 * Detach using our own id rather than thrinfo->thr_id: the creating
+	 * thread may not have stored the id into thrinfo yet when we start
+	 * running.
+	 */
+	pthread_detach(pthread_self());
+
+	SetMyThreadInfo(thrinfo);
+	MemoryContextSwitchTo(TopMemoryContext);
+
+	pthread_cleanup_push(GTM_ThreadCleanup, thrinfo);
+	thrinfo->thr_startroutine(thrinfo);
+	pthread_cleanup_pop(1);
+
+	/*
+	 * The cleanup handler has run and freed thrinfo through
+	 * GTM_ThreadRemove(), so the pointer must not be handed back.
+	 */
+	return NULL;
+}
diff --git a/src/gtm/main/gtm_txn.c b/src/gtm/main/gtm_txn.c
new file mode 100644
index 0000000000..6090ae10fb
--- /dev/null
+++ b/src/gtm/main/gtm_txn.c
@@ -0,0 +1,1521 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_txn.c
+ * Transaction handling
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "gtm/gtm_c.h"
+#include "gtm/elog.h"
+#include "gtm/palloc.h"
+#include "gtm/gtm.h"
+#include "gtm/gtm_txn.h"
+#include "gtm/assert.h"
+#include "gtm/stringinfo.h"
+#include "gtm/libpq.h"
+#include "gtm/pqformat.h"
+#include "gtm/gtm_msg.h"
+#include <unistd.h>
+
+/* Local functions */
+static XidStatus GlobalTransactionIdGetStatus(GlobalTransactionId transactionId);
+static bool GTM_SetDoVacuum(GTM_TransactionHandle handle);
+
+/*
+ * Global transaction manager state: the transaction slot array, the list of
+ * open transactions, the XID counters and the locks guarding them. It is set
+ * up by GTM_InitTxnManager().
+ */
+GTM_Transactions GTMTransactions;
+
+/*
+ * Bring the global transaction manager state to its initial condition:
+ * every slot free, XID counters at the first normal GXID, wraparound limits
+ * unset, no open transactions and the GTM marked as starting.
+ */
+void
+GTM_InitTxnManager(void)
+{
+	int slotno;
+
+	memset(&GTMTransactions, 0, sizeof (GTM_Transactions));
+
+	/* Mark every transaction slot as free and give it its own lock */
+	for (slotno = 0; slotno < GTM_MAX_GLOBAL_TRANSACTIONS; slotno++)
+	{
+		GTM_TransactionInfo *slot = &GTMTransactions.gt_transactions_array[slotno];
+
+		slot->gti_in_use = false;
+		GTM_RWLockInit(&slot->gti_lock);
+	}
+
+	/*
+	 * Locks protecting the XID fields and the list of open transactions
+	 */
+	GTM_RWLockInit(&GTMTransactions.gt_XidGenLock);
+	GTM_RWLockInit(&GTMTransactions.gt_TransArrayLock);
+
+	/*
+	 * XXX After a stop and restart the GTM must hand out GXIDs greater than
+	 * any assigned before. After a clean shutdown the last assigned value
+	 * could be kept at a known location on permanent storage and read back;
+	 * GTM failures are trickier.
+	 *
+	 * TODO This part is skipped for the prototype.
+	 */
+	GTMTransactions.gt_nextXid = FirstNormalGlobalTransactionId;
+
+	/*
+	 * XXX Cluster-wide oldest XID
+	 */
+	GTMTransactions.gt_oldestXid = FirstNormalGlobalTransactionId;
+
+	/*
+	 * XXX Newest XID that is committed or aborted
+	 */
+	GTMTransactions.gt_latestCompletedXid = FirstNormalGlobalTransactionId;
+
+	/*
+	 * XXX The limits that guard against wrap-around related corruption are
+	 * not computed; this is not implemented for the prototype.
+	 */
+	GTMTransactions.gt_xidVacLimit = InvalidGlobalTransactionId;
+	GTMTransactions.gt_xidWarnLimit = InvalidGlobalTransactionId;
+	GTMTransactions.gt_xidStopLimit = InvalidGlobalTransactionId;
+	GTMTransactions.gt_xidWrapLimit = InvalidGlobalTransactionId;
+
+	/*
+	 * Start with an empty list of open transactions and no slot used yet
+	 */
+	GTMTransactions.gt_open_transactions = NIL;
+	GTMTransactions.gt_lastslot = -1;
+
+	GTMTransactions.gt_gtm_state = GTM_STARTING;
+}
+
+/*
+ * Get the status of current or past transaction.
+ *
+ * The bootstrap and frozen GXIDs are reported as committed and any other
+ * non-normal GXID as aborted. The lookup for normal GXIDs is not implemented
+ * yet; they are reported as in progress, so neither the "did commit" nor the
+ * "did abort" test succeeds for them.
+ */
+static XidStatus
+GlobalTransactionIdGetStatus(GlobalTransactionId transactionId)
+{
+	/*
+	 * Check to see if the transaction ID is a permanent one.
+	 */
+	if (!GlobalTransactionIdIsNormal(transactionId))
+	{
+		if (GlobalTransactionIdEquals(transactionId, BootstrapGlobalTransactionId))
+			return TRANSACTION_STATUS_COMMITTED;
+		if (GlobalTransactionIdEquals(transactionId, FrozenGlobalTransactionId))
+			return TRANSACTION_STATUS_COMMITTED;
+		return TRANSACTION_STATUS_ABORTED;
+	}
+
+	/*
+	 * TODO To be implemented. Until then return a defined value instead of
+	 * whatever an uninitialized local happens to hold.
+	 */
+	return TRANSACTION_STATUS_IN_PROGRESS;
+}
+
+/*
+ * Look up the transaction handle that belongs to a GXID.
+ *
+ * The list of open transactions is scanned under a read lock. The handle of
+ * the first entry whose GXID matches is returned, or InvalidTransactionHandle
+ * if there is none.
+ */
+GTM_TransactionHandle
+GTM_GXIDToHandle(GlobalTransactionId gxid)
+{
+	ListCell *lc = NULL;
+	GTM_TransactionInfo *match = NULL;
+
+	GTM_RWLockAcquire(&GTMTransactions.gt_TransArrayLock, GTM_LOCKMODE_READ);
+
+	foreach(lc, GTMTransactions.gt_open_transactions)
+	{
+		GTM_TransactionInfo *candidate = (GTM_TransactionInfo *)lfirst(lc);
+
+		if (GlobalTransactionIdEquals(candidate->gti_gxid, gxid))
+		{
+			match = candidate;
+			break;
+		}
+	}
+
+	GTM_RWLockRelease(&GTMTransactions.gt_TransArrayLock);
+
+	if (match == NULL)
+		return InvalidTransactionHandle;
+
+	return match->gti_handle;
+}
+
+/*
+ * Given the transaction handle, find the corresponding transaction info
+ * structure
+ *
+ * Returns NULL, after logging a warning, when the handle is outside the
+ * transaction array or refers to a slot that is not in use.
+ *
+ * Note: Since a transaction handle is just an index into the global array,
+ * this function should be very quick. We should turn it into an inline
+ * function for the fast path.
+ */
+GTM_TransactionInfo *
+GTM_HandleToTransactionInfo(GTM_TransactionHandle handle)
+{
+	GTM_TransactionInfo *gtm_txninfo = NULL;
+
+	/*
+	 * Valid handles are 0 .. GTM_MAX_GLOBAL_TRANSACTIONS - 1; a handle equal
+	 * to the maximum would index one past the end of the array.
+	 */
+	if ((handle < 0) || (handle >= GTM_MAX_GLOBAL_TRANSACTIONS))
+	{
+		ereport(WARNING,
+				(ERANGE, errmsg("Invalid transaction handle: %d", handle)));
+		return NULL;
+	}
+
+	gtm_txninfo = &GTMTransactions.gt_transactions_array[handle];
+
+	if (!gtm_txninfo->gti_in_use)
+	{
+		ereport(WARNING,
+				(ERANGE, errmsg("Invalid transaction handle, txn_info not in use")));
+		return NULL;
+	}
+
+	return gtm_txninfo;
+}
+
+/*
+ * Retire a batch of transaction info structures.
+ *
+ * Under the transaction array write lock, each non-NULL entry is unlinked
+ * from the list of open transactions, marked aborted and flagged as no longer
+ * in use, which makes its slot available again. latestCompletedXid is moved
+ * forward whenever a retired transaction has a normal GXID at or beyond it.
+ */
+static void
+GTM_RemoveTransInfoMulti(GTM_TransactionInfo *gtm_txninfo[], int txn_count)
+{
+	int idx;
+
+	GTM_RWLockAcquire(&GTMTransactions.gt_TransArrayLock, GTM_LOCKMODE_WRITE);
+
+	for (idx = 0; idx < txn_count; idx++)
+	{
+		GTM_TransactionInfo *info = gtm_txninfo[idx];
+
+		if (info == NULL)
+			continue;
+
+		/* Unlink from the global list of open transactions */
+		GTMTransactions.gt_open_transactions = list_delete(GTMTransactions.gt_open_transactions, info);
+
+		/* Advance latestCompletedXid if this transaction is newer */
+		if (GlobalTransactionIdIsNormal(info->gti_gxid) &&
+			GlobalTransactionIdFollowsOrEquals(info->gti_gxid,
+											   GTMTransactions.gt_latestCompletedXid))
+			GTMTransactions.gt_latestCompletedXid = info->gti_gxid;
+
+		elog(DEBUG1, "GTM_RemoveTransInfoMulti: removing transaction id %u, %lu",
+			 info->gti_gxid, info->gti_thread_id);
+
+		/* Mark the transaction aborted and release the slot */
+		info->gti_state = GTM_TXN_ABORTED;
+		info->gti_nodecount = 0;
+		info->gti_in_use = false;
+		info->gti_snapshot_set = false;
+	}
+
+	GTM_RWLockRelease(&GTMTransactions.gt_TransArrayLock);
+}
+
+/*
+ * Remove all transaction infos associated with the caller thread and the given
+ * backend
+ *
+ * A backend_id of -1 matches every backend of the calling thread. Matching
+ * entries are unlinked from the list of open transactions, marked aborted and
+ * flagged as not in use.
+ *
+ * Also compute the latestCompletedXid.
+ */
+void
+GTM_RemoveAllTransInfos(int backend_id)
+{
+ ListCell *cell, *prev;
+ GTM_ThreadID thread_id;
+
+ /* Only transactions started by the calling thread are considered */
+ thread_id = pthread_self();
+
+ /*
+ * Scan the global list of open transactions
+ */
+ GTM_RWLockAcquire(&GTMTransactions.gt_TransArrayLock, GTM_LOCKMODE_WRITE);
+ /* prev trails cell so that cells can be unlinked while walking the list */
+ prev = NULL;
+ cell = list_head(GTMTransactions.gt_open_transactions);
+ while (cell != NULL)
+ {
+ GTM_TransactionInfo *gtm_txninfo = lfirst(cell);
+ /* check if current entry is associated with the thread */
+ if ((gtm_txninfo->gti_in_use) &&
+ (gtm_txninfo->gti_thread_id == thread_id) &&
+ ((gtm_txninfo->gti_backend_id == backend_id) || (backend_id == -1)))
+ {
+ /* remove the entry */
+ GTMTransactions.gt_open_transactions = list_delete_cell(GTMTransactions.gt_open_transactions, cell, prev);
+
+ /* update the latestCompletedXid */
+ if (GlobalTransactionIdIsNormal(gtm_txninfo->gti_gxid) &&
+ GlobalTransactionIdFollowsOrEquals(gtm_txninfo->gti_gxid,
+ GTMTransactions.gt_latestCompletedXid))
+ GTMTransactions.gt_latestCompletedXid = gtm_txninfo->gti_gxid;
+
+ elog(DEBUG1, "GTM_RemoveAllTransInfos: removing transaction id %u, %lu:%lu",
+ gtm_txninfo->gti_gxid, gtm_txninfo->gti_thread_id, thread_id);
+ /*
+ * Now mark the transaction as aborted and mark the structure as not-in-use
+ */
+ gtm_txninfo->gti_state = GTM_TXN_ABORTED;
+ gtm_txninfo->gti_nodecount = 0;
+ gtm_txninfo->gti_in_use = false;
+ gtm_txninfo->gti_snapshot_set = false;
+
+ /*
+ * move to next cell in the list: the deleted cell can no longer be
+ * followed, so continue from prev, or from the new head if the
+ * first cell was the one removed
+ */
+ if (prev)
+ cell = lnext(prev);
+ else
+ cell = list_head(GTMTransactions.gt_open_transactions);
+ }
+ else
+ {
+ prev = cell;
+ cell = lnext(cell);
+ }
+ }
+
+ GTM_RWLockRelease(&GTMTransactions.gt_TransArrayLock);
+ return;
+}
+/*
+ * GlobalTransactionIdDidCommit
+ * True iff the status recorded for the identifier is "committed".
+ *
+ * Note:
+ * Assumes transaction identifier is valid.
+ */
+bool /* true if given transaction committed */
+GlobalTransactionIdDidCommit(GlobalTransactionId transactionId)
+{
+	return GlobalTransactionIdGetStatus(transactionId) == TRANSACTION_STATUS_COMMITTED;
+}
+
+/*
+ * GlobalTransactionIdDidAbort
+ * True iff the status recorded for the identifier is "aborted".
+ *
+ * Note:
+ * Assumes transaction identifier is valid.
+ */
+bool /* true if given transaction aborted */
+GlobalTransactionIdDidAbort(GlobalTransactionId transactionId)
+{
+	return GlobalTransactionIdGetStatus(transactionId) == TRANSACTION_STATUS_ABORTED;
+}
+
+/*
+ * GlobalTransactionIdPrecedes --- is id1 logically < id2?
+ *
+ * Two normal IDs are compared through the sign of their 32-bit difference.
+ * If either ID is not normal, the raw values are compared directly.
+ */
+bool
+GlobalTransactionIdPrecedes(GlobalTransactionId id1, GlobalTransactionId id2)
+{
+	if (GlobalTransactionIdIsNormal(id1) && GlobalTransactionIdIsNormal(id2))
+	{
+		int32 delta = (int32) (id1 - id2);
+
+		return (delta < 0);
+	}
+
+	return (id1 < id2);
+}
+
+/*
+ * GlobalTransactionIdPrecedesOrEquals --- is id1 logically <= id2?
+ *
+ * Two normal IDs are compared through the sign of their 32-bit difference.
+ * If either ID is not normal, the raw values are compared directly.
+ */
+bool
+GlobalTransactionIdPrecedesOrEquals(GlobalTransactionId id1, GlobalTransactionId id2)
+{
+	if (GlobalTransactionIdIsNormal(id1) && GlobalTransactionIdIsNormal(id2))
+	{
+		int32 delta = (int32) (id1 - id2);
+
+		return (delta <= 0);
+	}
+
+	return (id1 <= id2);
+}
+
+/*
+ * GlobalTransactionIdFollows --- is id1 logically > id2?
+ *
+ * Two normal IDs are compared through the sign of their 32-bit difference.
+ * If either ID is not normal, the raw values are compared directly.
+ */
+bool
+GlobalTransactionIdFollows(GlobalTransactionId id1, GlobalTransactionId id2)
+{
+	if (GlobalTransactionIdIsNormal(id1) && GlobalTransactionIdIsNormal(id2))
+	{
+		int32 delta = (int32) (id1 - id2);
+
+		return (delta > 0);
+	}
+
+	return (id1 > id2);
+}
+
+/*
+ * GlobalTransactionIdFollowsOrEquals --- is id1 logically >= id2?
+ *
+ * Two normal IDs are compared through the sign of their 32-bit difference.
+ * If either ID is not normal, the raw values are compared directly.
+ */
+bool
+GlobalTransactionIdFollowsOrEquals(GlobalTransactionId id1, GlobalTransactionId id2)
+{
+	if (GlobalTransactionIdIsNormal(id1) && GlobalTransactionIdIsNormal(id2))
+	{
+		int32 delta = (int32) (id1 - id2);
+
+		return (delta >= 0);
+	}
+
+	return (id1 >= id2);
+}
+
+
+/*
+ * Flag the transaction behind the given handle as running vacuum.
+ *
+ * An error is reported when the handle does not resolve to a transaction.
+ * Returns true once the flag is set.
+ */
+static bool
+GTM_SetDoVacuum(GTM_TransactionHandle handle)
+{
+	GTM_TransactionInfo *info;
+
+	info = GTM_HandleToTransactionInfo(handle);
+	if (!info)
+		ereport(ERROR, (EINVAL, errmsg("Invalid transaction handle")));
+
+	info->gti_vacuum = true;
+
+	return true;
+}
+
+/*
+ * Allocate the next XIDs for a batch of new transactions
+ *
+ * One GXID is taken from the global counter for each handle, in order, and
+ * stored in the corresponding transaction info structure. The first GXID
+ * assigned is returned; InvalidGlobalTransactionId is returned when
+ * txn_count is zero.
+ *
+ * An error is reported when the GTM is shutting down, when a handle does not
+ * resolve to a transaction in use, or when the XID stop limit is reached.
+ */
+GlobalTransactionId
+GTM_GetGlobalTransactionIdMulti(GTM_TransactionHandle handle[], int txn_count)
+{
+	GlobalTransactionId xid, start_xid = InvalidGlobalTransactionId;
+	GTM_TransactionInfo *gtm_txninfo = NULL;
+	int ii;
+
+	GTM_RWLockAcquire(&GTMTransactions.gt_XidGenLock, GTM_LOCKMODE_WRITE);
+
+	if (GTMTransactions.gt_gtm_state == GTM_SHUTTING_DOWN)
+	{
+		GTM_RWLockRelease(&GTMTransactions.gt_XidGenLock);
+		ereport(ERROR, (EINVAL, errmsg("GTM shutting down -- can not issue new transaction ids")));
+		return InvalidGlobalTransactionId;
+	}
+
+	/*
+	 * Note: unlike the backend code this was derived from, no commit log
+	 * page is extended here before XIDs are handed out.
+	 */
+	for (ii = 0; ii < txn_count; ii++)
+	{
+		/*
+		 * Resolve the handle before touching the counter, so that a bad
+		 * handle neither consumes an XID nor gets dereferenced. An Assert
+		 * alone would vanish in builds without assertions.
+		 */
+		gtm_txninfo = GTM_HandleToTransactionInfo(handle[ii]);
+		if (gtm_txninfo == NULL)
+		{
+			GTM_RWLockRelease(&GTMTransactions.gt_XidGenLock);
+			ereport(ERROR, (EINVAL, errmsg("Invalid transaction handle")));
+			return InvalidGlobalTransactionId;
+		}
+
+		xid = GTMTransactions.gt_nextXid;
+
+		if (!GlobalTransactionIdIsValid(start_xid))
+			start_xid = xid;
+
+		/*----------
+		 * Check to see if it's safe to assign another XID. This protects against
+		 * catastrophic data loss due to XID wraparound. The basic rules are:
+		 *
+		 * If we're past xidVacLimit, start trying to force autovacuum cycles.
+		 * If we're past xidWarnLimit, start issuing warnings.
+		 * If we're past xidStopLimit, refuse to execute transactions.
+		 *
+		 * Test is coded to fall out as fast as possible during normal operation,
+		 * ie, when the vac limit is set and we haven't violated it.
+		 *----------
+		 */
+		if (GlobalTransactionIdFollowsOrEquals(xid, GTMTransactions.gt_xidVacLimit) &&
+			GlobalTransactionIdIsValid(GTMTransactions.gt_xidVacLimit))
+		{
+			if (GlobalTransactionIdFollowsOrEquals(xid, GTMTransactions.gt_xidStopLimit))
+			{
+				GTM_RWLockRelease(&GTMTransactions.gt_XidGenLock);
+				ereport(ERROR,
+						(ERANGE,
+						 errmsg("database is not accepting commands to avoid wraparound data loss in database ")));
+				return InvalidGlobalTransactionId;
+			}
+			else if (GlobalTransactionIdFollowsOrEquals(xid, GTMTransactions.gt_xidWarnLimit))
+				ereport(WARNING,
+						(errmsg("database must be vacuumed within %u transactions",
+								GTMTransactions.gt_xidWrapLimit - xid)));
+		}
+
+		/* Consume the XID and record it in the transaction */
+		GlobalTransactionIdAdvance(GTMTransactions.gt_nextXid);
+		gtm_txninfo->gti_gxid = xid;
+	}
+
+	GTM_RWLockRelease(&GTMTransactions.gt_XidGenLock);
+
+	return start_xid;
+}
+
+/*
+ * Allocate the next XID for a single transaction.
+ *
+ * Runs the multi-transaction allocator on a batch of one and returns the
+ * GXID it reports.
+ */
+GlobalTransactionId
+GTM_GetGlobalTransactionId(GTM_TransactionHandle handle)
+{
+	GTM_TransactionHandle batch[1];
+
+	batch[0] = handle;
+
+	return GTM_GetGlobalTransactionIdMulti(batch, 1);
+}
+
+/*
+ * Return the GXID that would be handed out next, without consuming it.
+ * The counter is read under the XID generation lock in read mode.
+ */
+GlobalTransactionId
+ReadNewGlobalTransactionId(void)
+{
+	GlobalTransactionId next_xid;
+
+	GTM_RWLockAcquire(&GTMTransactions.gt_XidGenLock, GTM_LOCKMODE_READ);
+	next_xid = GTMTransactions.gt_nextXid;
+	GTM_RWLockRelease(&GTMTransactions.gt_XidGenLock);
+
+	return next_xid;
+}
+
+/*
+ * Set the nextXid and switch the GTM state to GTM_RUNNING.
+ *
+ * The GXID is usually read from a control file and set when the GTM is
+ * started. When the GTM is finally shut down, the next to-be-assigned GXID is
+ * stored in the control file.
+ *
+ * XXX Crash recovery is not handled yet.
+ */
+void
+SetNextGlobalTransactionId(GlobalTransactionId gxid)
+{
+	GTM_RWLockAcquire(&GTMTransactions.gt_XidGenLock, GTM_LOCKMODE_WRITE);
+
+	GTMTransactions.gt_nextXid = gxid;
+	GTMTransactions.gt_gtm_state = GTM_RUNNING;
+
+	GTM_RWLockRelease(&GTMTransactions.gt_XidGenLock);
+}
+
+
+/*
+ * Transaction Control
+ *
+ * Begin txn_count transactions in one go. For each one a free slot of the
+ * global transaction array is claimed, initialized from isolevel[kk],
+ * readonly[kk] and connid[kk], appended to the list of open transactions and
+ * its handle stored in txns[kk]. The slot search starts just after the slot
+ * used last and wraps around.
+ *
+ * Returns txn_count. An error is reported if no free slot is left.
+ */
+int
+GTM_BeginTransactionMulti(GTM_CoordinatorId coord_id,
+						  GTM_IsolationLevel isolevel[],
+						  bool readonly[],
+						  GTMProxy_ConnID connid[],
+						  int txn_count,
+						  GTM_TransactionHandle txns[])
+{
+	MemoryContext oldContext;
+	int kk;
+
+	/*
+	 * XXX The list cells created below must live in the top-most memory
+	 * context instead of a thread context. This is necessary because the
+	 * transaction may outlive the thread which started the transaction, and
+	 * the list of open transactions is global.
+	 */
+	oldContext = MemoryContextSwitchTo(TopMostMemoryContext);
+
+	/*
+	 * Take the array lock once for the whole batch. It is released exactly
+	 * once below, so acquiring it inside the loop would try to re-acquire a
+	 * write lock this thread already holds on the second transaction.
+	 */
+	GTM_RWLockAcquire(&GTMTransactions.gt_TransArrayLock, GTM_LOCKMODE_WRITE);
+
+	for (kk = 0; kk < txn_count; kk++)
+	{
+		GTM_TransactionInfo *slot = NULL;
+		int ii, jj;
+
+		/*
+		 * Find a free slot in the transaction array, looking at every slot
+		 * at most once
+		 */
+		ii = GTMTransactions.gt_lastslot + 1;
+		if (ii >= GTM_MAX_GLOBAL_TRANSACTIONS)
+			ii = 0;
+
+		for (jj = 0; jj < GTM_MAX_GLOBAL_TRANSACTIONS; jj++)
+		{
+			if (GTMTransactions.gt_transactions_array[ii].gti_in_use == false)
+			{
+				slot = &GTMTransactions.gt_transactions_array[ii];
+				break;
+			}
+			ii = (ii + 1) % GTM_MAX_GLOBAL_TRANSACTIONS;
+		}
+
+		if (slot == NULL)
+		{
+			/* Every slot is taken */
+			GTM_RWLockRelease(&GTMTransactions.gt_TransArrayLock);
+			MemoryContextSwitchTo(oldContext);
+			ereport(ERROR,
+					(ERANGE, errmsg("Max transaction limit reached")));
+			return -1;
+		}
+
+		slot->gti_gxid = InvalidGlobalTransactionId;
+		slot->gti_xmin = InvalidGlobalTransactionId;
+		slot->gti_state = GTM_TXN_STARTING;
+		slot->gti_coordid = coord_id;
+
+		slot->gti_isolevel = isolevel[kk];
+		slot->gti_readonly = readonly[kk];
+		slot->gti_backend_id = connid[kk];
+		slot->gti_in_use = true;
+
+		/* The handle is simply the index of the slot */
+		slot->gti_handle = ii;
+		slot->gti_vacuum = false;
+		slot->gti_thread_id = pthread_self();
+		GTMTransactions.gt_lastslot = ii;
+
+		txns[kk] = ii;
+
+		/*
+		 * Add the structure to the global list of open transactions. This
+		 * happens in the context of TopMostMemoryContext because the list is
+		 * global and any memory allocation must outlive the thread context
+		 */
+		GTMTransactions.gt_open_transactions = lappend(GTMTransactions.gt_open_transactions, slot);
+	}
+
+	GTM_RWLockRelease(&GTMTransactions.gt_TransArrayLock);
+
+	MemoryContextSwitchTo(oldContext);
+
+	return txn_count;
+}
+
+/*
+ * Begin a single transaction.
+ *
+ * Convenience wrapper around GTM_BeginTransactionMulti() for a batch of
+ * one; the proxy connection id passed down is -1.  Returns the handle
+ * written by GTM_BeginTransactionMulti().
+ */
+GTM_TransactionHandle
+GTM_BeginTransaction(GTM_CoordinatorId coord_id,
+                     GTM_IsolationLevel isolevel,
+                     bool readonly)
+{
+    GTMProxy_ConnID no_conn = -1;
+    GTM_TransactionHandle handle;
+
+    GTM_BeginTransactionMulti(coord_id, &isolevel, &readonly, &no_conn,
+                              1, &handle);
+
+    return handle;
+}
+
+/*
+ * Roll back the transaction identified by a GXID: the GXID is translated
+ * to a handle and handed to GTM_RollbackTransaction().
+ */
+int
+GTM_RollbackTransactionGXID(GlobalTransactionId gxid)
+{
+    return GTM_RollbackTransaction(GTM_GXIDToHandle(gxid));
+}
+
+/*
+ * Roll back a batch of transactions.
+ *
+ * For every handle in txn[] the matching transaction info is looked up.
+ * If found, its state is set to GTM_TXN_ABORT_IN_PROGRESS under the
+ * per-transaction lock and status[] gets STATUS_OK; otherwise status[]
+ * gets STATUS_ERROR.  The collected pointers (including NULL entries for
+ * failed lookups) are then passed to GTM_RemoveTransInfoMulti().
+ *
+ * Returns txn_count.
+ */
+int
+GTM_RollbackTransactionMulti(GTM_TransactionHandle txn[], int txn_count, int status[])
+{
+    GTM_TransactionInfo *gtm_txninfo[txn_count];
+    int idx;
+
+    for (idx = 0; idx < txn_count; idx++)
+    {
+        GTM_TransactionInfo *info = GTM_HandleToTransactionInfo(txn[idx]);
+
+        gtm_txninfo[idx] = info;
+
+        if (info != NULL)
+        {
+            /* Flag the transaction as aborting */
+            GTM_RWLockAcquire(&info->gti_lock, GTM_LOCKMODE_WRITE);
+            info->gti_state = GTM_TXN_ABORT_IN_PROGRESS;
+            GTM_RWLockRelease(&info->gti_lock);
+            status[idx] = STATUS_OK;
+        }
+        else
+            status[idx] = STATUS_ERROR;
+    }
+
+    GTM_RemoveTransInfoMulti(gtm_txninfo, txn_count);
+
+    return txn_count;
+}
+
+/*
+ * Roll back one transaction by delegating to GTM_RollbackTransactionMulti()
+ * with a batch of one.  Returns the status reported for that transaction.
+ */
+int
+GTM_RollbackTransaction(GTM_TransactionHandle txn)
+{
+    int result;
+
+    GTM_RollbackTransactionMulti(&txn, 1, &result);
+
+    return result;
+}
+
+
+/*
+ * Commit the transaction identified by a GXID: the GXID is translated to a
+ * handle and handed to GTM_CommitTransaction().
+ */
+int
+GTM_CommitTransactionGXID(GlobalTransactionId gxid)
+{
+    return GTM_CommitTransaction(GTM_GXIDToHandle(gxid));
+}
+
+/*
+ * Commit a batch of transactions.
+ *
+ * For every handle in txn[] the matching transaction info is looked up.
+ * If found, its state is set to GTM_TXN_COMMIT_IN_PROGRESS under the
+ * per-transaction lock and status[] gets STATUS_OK; otherwise status[]
+ * gets STATUS_ERROR.  The collected pointers (including NULL entries for
+ * failed lookups) are then passed to GTM_RemoveTransInfoMulti().
+ *
+ * Returns txn_count.
+ */
+int
+GTM_CommitTransactionMulti(GTM_TransactionHandle txn[], int txn_count, int status[])
+{
+    GTM_TransactionInfo *gtm_txninfo[txn_count];
+    int idx;
+
+    for (idx = 0; idx < txn_count; idx++)
+    {
+        GTM_TransactionInfo *info = GTM_HandleToTransactionInfo(txn[idx]);
+
+        gtm_txninfo[idx] = info;
+
+        if (info != NULL)
+        {
+            /* Flag the transaction as committing */
+            GTM_RWLockAcquire(&info->gti_lock, GTM_LOCKMODE_WRITE);
+            info->gti_state = GTM_TXN_COMMIT_IN_PROGRESS;
+            GTM_RWLockRelease(&info->gti_lock);
+            status[idx] = STATUS_OK;
+        }
+        else
+            status[idx] = STATUS_ERROR;
+    }
+
+    GTM_RemoveTransInfoMulti(gtm_txninfo, txn_count);
+
+    return txn_count;
+}
+
+/*
+ * Commit one transaction by delegating to GTM_CommitTransactionMulti()
+ * with a batch of one.  Returns the status reported for that transaction.
+ */
+int
+GTM_CommitTransaction(GTM_TransactionHandle txn)
+{
+    int result;
+
+    GTM_CommitTransactionMulti(&txn, 1, &result);
+
+    return result;
+}
+
+/*
+ * Prepare a transaction
+ *
+ * txn     - handle of the transaction to prepare
+ * nodecnt - number of entries in nodes[]
+ * nodes[] - ids of the nodes taking part in the two-phase commit
+ *
+ * Sets the transaction state to GTM_TXN_PREPARE_IN_PROGRESS and records the
+ * node list in the transaction info.  Returns STATUS_OK on success, or
+ * STATUS_ERROR when the handle does not resolve to a transaction or when
+ * nodecnt exceeds GTM_MAX_2PC_NODES.
+ */
+int
+GTM_PrepareTransaction(GTM_TransactionHandle txn,
+                       uint32 nodecnt,
+                       PGXC_NodeId nodes[])
+{
+    GTM_TransactionInfo *gtm_txninfo = GTM_HandleToTransactionInfo(txn);
+
+    if (gtm_txninfo == NULL)
+        return STATUS_ERROR;
+
+    /*
+     * The node buffer allocated below holds GTM_MAX_2PC_NODES entries, so a
+     * longer list would make the memcpy() write past its end.
+     */
+    if (nodecnt > GTM_MAX_2PC_NODES)
+        return STATUS_ERROR;
+
+    /*
+     * Mark the transaction as being prepared
+     */
+    GTM_RWLockAcquire(&gtm_txninfo->gti_lock, GTM_LOCKMODE_WRITE);
+
+    gtm_txninfo->gti_state = GTM_TXN_PREPARE_IN_PROGRESS;
+    gtm_txninfo->gti_nodecount = nodecnt;
+
+    /* The buffer is allocated lazily and kept in the top-most context */
+    if (gtm_txninfo->gti_nodes == NULL)
+        gtm_txninfo->gti_nodes = (PGXC_NodeId *)MemoryContextAlloc(TopMostMemoryContext, sizeof (PGXC_NodeId) * GTM_MAX_2PC_NODES);
+
+    if (nodecnt > 0)
+        memcpy(gtm_txninfo->gti_nodes, nodes, sizeof (PGXC_NodeId) * nodecnt);
+
+    GTM_RWLockRelease(&gtm_txninfo->gti_lock);
+
+    return STATUS_OK;
+}
+
+/*
+ * Prepare the transaction identified by a GXID: the GXID is translated to
+ * a handle and handed, with the node list, to GTM_PrepareTransaction().
+ */
+int
+GTM_PrepareTransactionGXID(GlobalTransactionId gxid,
+                           uint32 nodecnt,
+                           PGXC_NodeId nodes[])
+{
+    return GTM_PrepareTransaction(GTM_GXIDToHandle(gxid), nodecnt, nodes);
+}
+
+/*
+ * Return the current state (gti_state) of the transaction behind a handle.
+ *
+ * NOTE(review): the lookup result is dereferenced without a NULL check,
+ * while other callers of GTM_HandleToTransactionInfo() in this file do
+ * test for NULL -- confirm that callers only pass valid handles.
+ */
+GTM_TransactionStates
+GTM_GetStatus(GTM_TransactionHandle txn)
+{
+    return GTM_HandleToTransactionInfo(txn)->gti_state;
+}
+
+/*
+ * Return the state of the transaction identified by a GXID: the GXID is
+ * translated to a handle and handed to GTM_GetStatus().
+ */
+GTM_TransactionStates
+GTM_GetStatusGXID(GlobalTransactionId gxid)
+{
+    return GTM_GetStatus(GTM_GXIDToHandle(gxid));
+}
+
+/*
+ * Process MSG_TXN_BEGIN message
+ *
+ * Reads the isolation level and the read-only flag from the request,
+ * starts a transaction and answers with a TXN_BEGIN_RESULT message that
+ * carries the new transaction handle.  When the peer is a proxy the reply
+ * is prefixed with a proxy header and is not flushed here.
+ */
+void
+ProcessBeginTransactionCommand(Port *myport, StringInfo message)
+{
+    StringInfoData reply;
+    MemoryContext saved_context;
+    GTM_TransactionHandle handle;
+    GTM_IsolationLevel isolevel;
+    bool read_only;
+
+    /* Decode the request */
+    isolevel = pq_getmsgint(message, sizeof (GTM_IsolationLevel));
+    read_only = pq_getmsgbyte(message);
+
+    /*
+     * Start the transaction while TopMemoryContext is current.
+     *
+     * XXX Port should contain Coordinator Id - replace 0 with that
+     */
+    saved_context = MemoryContextSwitchTo(TopMemoryContext);
+
+    handle = GTM_BeginTransaction(0, isolevel, read_only);
+    if (handle == InvalidTransactionHandle)
+        ereport(ERROR,
+                (EINVAL,
+                 errmsg("Failed to start a new transaction")));
+
+    MemoryContextSwitchTo(saved_context);
+
+    /* Build the reply */
+    pq_beginmessage(&reply, 'S');
+    pq_sendint(&reply, TXN_BEGIN_RESULT, 4);
+    if (myport->is_proxy)
+    {
+        GTM_ProxyMsgHeader hdr;
+
+        hdr.ph_conid = myport->conn_id;
+        pq_sendbytes(&reply, (char *)&hdr, sizeof (GTM_ProxyMsgHeader));
+    }
+    pq_sendbytes(&reply, (char *)&handle, sizeof(handle));
+    pq_endmessage(myport, &reply);
+
+    /* Only direct (non-proxy) connections are flushed here */
+    if (myport->is_proxy)
+        return;
+
+    pq_flush(myport);
+}
+
+/*
+ * Process MSG_TXN_BEGIN_GETGXID message
+ *
+ * Reads the isolation level and the read-only flag from the request,
+ * starts a transaction, obtains a GXID for it and answers with a
+ * TXN_BEGIN_GETGXID_RESULT message that carries the GXID.  When the peer
+ * is a proxy the reply is prefixed with a proxy header and is not flushed
+ * here.
+ */
+void
+ProcessBeginTransactionGetGXIDCommand(Port *myport, StringInfo message)
+{
+    StringInfoData reply;
+    MemoryContext saved_context;
+    GTM_TransactionHandle handle;
+    GlobalTransactionId new_gxid;
+    GTM_IsolationLevel isolevel;
+    bool read_only;
+
+    /* Decode the request */
+    isolevel = pq_getmsgint(message, sizeof (GTM_IsolationLevel));
+    read_only = pq_getmsgbyte(message);
+
+    /*
+     * Start the transaction and fetch its GXID while TopMemoryContext is
+     * current.
+     *
+     * XXX Port should contain Coordinator Id - replace 0 with that
+     */
+    saved_context = MemoryContextSwitchTo(TopMemoryContext);
+
+    handle = GTM_BeginTransaction(0, isolevel, read_only);
+    if (handle == InvalidTransactionHandle)
+        ereport(ERROR,
+                (EINVAL,
+                 errmsg("Failed to start a new transaction")));
+
+    new_gxid = GTM_GetGlobalTransactionId(handle);
+    if (new_gxid == InvalidGlobalTransactionId)
+        ereport(ERROR,
+                (EINVAL,
+                 errmsg("Failed to get a new transaction id")));
+
+    MemoryContextSwitchTo(saved_context);
+
+    elog(LOG, "Sending transaction id %u", new_gxid);
+
+    /* Build the reply */
+    pq_beginmessage(&reply, 'S');
+    pq_sendint(&reply, TXN_BEGIN_GETGXID_RESULT, 4);
+    if (myport->is_proxy)
+    {
+        GTM_ProxyMsgHeader hdr;
+
+        hdr.ph_conid = myport->conn_id;
+        pq_sendbytes(&reply, (char *)&hdr, sizeof (GTM_ProxyMsgHeader));
+    }
+    pq_sendbytes(&reply, (char *)&new_gxid, sizeof(new_gxid));
+    pq_endmessage(myport, &reply);
+
+    /* Only direct (non-proxy) connections are flushed here */
+    if (myport->is_proxy)
+        return;
+
+    pq_flush(myport);
+}
+
+/*
+ * Process MSG_TXN_BEGIN_GETGXID_AUTOVACUUM message
+ *
+ * Same flow as the plain begin-and-get-GXID handler, except that the new
+ * transaction is additionally flagged through GTM_SetDoVacuum() and the
+ * reply type is TXN_BEGIN_GETGXID_AUTOVACUUM_RESULT.
+ */
+void
+ProcessBeginTransactionGetGXIDAutovacuumCommand(Port *myport, StringInfo message)
+{
+    GTM_IsolationLevel txn_isolation_level;
+    bool txn_read_only;
+    StringInfoData buf;
+    GTM_TransactionHandle txn;
+    GlobalTransactionId gxid;
+    MemoryContext oldContext;
+
+    elog(DEBUG3, "Inside ProcessBeginTransactionGetGXIDAutovacuumCommand");
+
+    txn_isolation_level = pq_getmsgint(message, sizeof (GTM_IsolationLevel));
+    txn_read_only = pq_getmsgbyte(message);
+
+    oldContext = MemoryContextSwitchTo(TopMemoryContext);
+
+    /*
+     * Start a new transaction
+     *
+     * XXX Port should contain Coordinator Id - replace 0 with that
+     */
+    txn = GTM_BeginTransaction(0, txn_isolation_level, txn_read_only);
+    if (txn == InvalidTransactionHandle)
+        ereport(ERROR,
+                (EINVAL,
+                 errmsg("Failed to start a new transaction")));
+
+    gxid = GTM_GetGlobalTransactionId(txn);
+    if (gxid == InvalidGlobalTransactionId)
+        ereport(ERROR,
+                (EINVAL,
+                 errmsg("Failed to get a new transaction id")));
+
+    /* Indicate that it is for autovacuum */
+    GTM_SetDoVacuum(txn);
+
+    MemoryContextSwitchTo(oldContext);
+
+    /* GXIDs are printed with %u everywhere else in this file */
+    elog(DEBUG3, "Sending transaction id %u", gxid);
+
+    pq_beginmessage(&buf, 'S');
+    pq_sendint(&buf, TXN_BEGIN_GETGXID_AUTOVACUUM_RESULT, 4);
+    if (myport->is_proxy)
+    {
+        GTM_ProxyMsgHeader proxyhdr;
+        proxyhdr.ph_conid = myport->conn_id;
+        pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+    }
+    pq_sendbytes(&buf, (char *)&gxid, sizeof(gxid));
+    pq_endmessage(myport, &buf);
+
+    /* Only direct (non-proxy) connections are flushed here */
+    if (!myport->is_proxy)
+        pq_flush(myport);
+    return;
+}
+
+/*
+ * Process MSG_TXN_BEGIN_GETGXID_MULTI message
+ *
+ * The request carries a transaction count followed, per transaction, by
+ * isolation level, read-only flag and proxy connection id.  All the
+ * transactions are started in one go and a GXID is obtained through
+ * GTM_GetGlobalTransactionIdMulti().  The TXN_BEGIN_GETGXID_MULTI_RESULT
+ * reply carries the count and that GXID.
+ */
+void
+ProcessBeginTransactionGetGXIDCommandMulti(Port *myport, StringInfo message)
+{
+    GTM_IsolationLevel txn_isolation_level[GTM_MAX_GLOBAL_TRANSACTIONS];
+    bool txn_read_only[GTM_MAX_GLOBAL_TRANSACTIONS];
+    int txn_count;
+    StringInfoData buf;
+    GTM_TransactionHandle txn[GTM_MAX_GLOBAL_TRANSACTIONS];
+    GlobalTransactionId gxid, end_gxid;
+    GTMProxy_ConnID txn_connid[GTM_MAX_GLOBAL_TRANSACTIONS];
+    MemoryContext oldContext;
+    int count;
+    int ii;
+
+    txn_count = pq_getmsgint(message, sizeof (int));
+
+    if (txn_count <= 0)
+        elog(PANIC, "Zero or less transaction count");
+
+    /*
+     * The count comes straight off the wire and indexes the fixed-size
+     * arrays above; refuse anything that would run past their end.
+     */
+    if (txn_count > GTM_MAX_GLOBAL_TRANSACTIONS)
+        ereport(ERROR,
+                (EINVAL,
+                 errmsg("Too many transactions (%d) in one request", txn_count)));
+
+    for (ii = 0; ii < txn_count; ii++)
+    {
+        txn_isolation_level[ii] = pq_getmsgint(message, sizeof (GTM_IsolationLevel));
+        txn_read_only[ii] = pq_getmsgbyte(message);
+        txn_connid[ii] = pq_getmsgint(message, sizeof (GTMProxy_ConnID));
+    }
+
+    oldContext = MemoryContextSwitchTo(TopMemoryContext);
+
+    /*
+     * Start the new transactions
+     *
+     * XXX Port should contain Coordinator Id - replace 0 with that
+     */
+    count = GTM_BeginTransactionMulti(0, txn_isolation_level, txn_read_only, txn_connid,
+                                      txn_count, txn);
+    if (count != txn_count)
+        ereport(ERROR,
+                (EINVAL,
+                 errmsg("Failed to start %d new transactions", txn_count)));
+
+    gxid = GTM_GetGlobalTransactionIdMulti(txn, txn_count);
+    if (gxid == InvalidGlobalTransactionId)
+        ereport(ERROR,
+                (EINVAL,
+                 errmsg("Failed to get a new transaction id")));
+
+    MemoryContextSwitchTo(oldContext);
+
+    /* end_gxid is only used for the log line below */
+    end_gxid = gxid + txn_count;
+    if (end_gxid < gxid)
+        end_gxid += FirstNormalGlobalTransactionId;
+
+    elog(LOG, "Sending transaction ids from %u to %u", gxid, end_gxid);
+
+    pq_beginmessage(&buf, 'S');
+    pq_sendint(&buf, TXN_BEGIN_GETGXID_MULTI_RESULT, 4);
+    if (myport->is_proxy)
+    {
+        GTM_ProxyMsgHeader proxyhdr;
+        proxyhdr.ph_conid = myport->conn_id;
+        pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+    }
+    pq_sendbytes(&buf, (char *)&txn_count, sizeof(txn_count));
+    pq_sendbytes(&buf, (char *)&gxid, sizeof(gxid));
+    pq_endmessage(myport, &buf);
+
+    /* Only direct (non-proxy) connections are flushed here */
+    if (!myport->is_proxy)
+        pq_flush(myport);
+    return;
+}
+
+/*
+ * Process MSG_TXN_COMMIT message
+ *
+ * The request starts with a flag byte telling whether a GXID or a
+ * transaction handle follows.  The transaction is committed and a
+ * TXN_COMMIT_RESULT reply carrying the GXID and the commit status is sent.
+ * When the request identified the transaction by handle, the GXID in the
+ * reply is InvalidGlobalTransactionId.
+ */
+void
+ProcessCommitTransactionCommand(Port *myport, StringInfo message)
+{
+    StringInfoData buf;
+    GTM_TransactionHandle txn;
+    /*
+     * Initialised because it is echoed in the reply even when the request
+     * carried a handle and therefore never assigned it.
+     */
+    GlobalTransactionId gxid = InvalidGlobalTransactionId;
+    int isgxid = 0;
+    MemoryContext oldContext;
+    int status = STATUS_OK;
+
+    isgxid = pq_getmsgbyte(message);
+
+    if (isgxid)
+    {
+        const char *data = pq_getmsgbytes(message, sizeof (gxid));
+        if (data == NULL)
+            ereport(ERROR,
+                    (EPROTO,
+                     errmsg("Message does not contain valid GXID")));
+        memcpy(&gxid, data, sizeof (gxid));
+        txn = GTM_GXIDToHandle(gxid);
+    }
+    else
+    {
+        const char *data = pq_getmsgbytes(message, sizeof (txn));
+        if (data == NULL)
+            ereport(ERROR,
+                    (EPROTO,
+                     errmsg("Message does not contain valid Transaction Handle")));
+        memcpy(&txn, data, sizeof (txn));
+    }
+
+    pq_getmsgend(message);
+
+    oldContext = MemoryContextSwitchTo(TopMemoryContext);
+
+    /*
+     * Commit the transaction
+     */
+    status = GTM_CommitTransaction(txn);
+
+    MemoryContextSwitchTo(oldContext);
+
+    pq_beginmessage(&buf, 'S');
+    pq_sendint(&buf, TXN_COMMIT_RESULT, 4);
+    if (myport->is_proxy)
+    {
+        GTM_ProxyMsgHeader proxyhdr;
+        proxyhdr.ph_conid = myport->conn_id;
+        pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+    }
+    pq_sendbytes(&buf, (char *)&gxid, sizeof(gxid));
+    pq_sendint(&buf, status, sizeof(status));
+    pq_endmessage(myport, &buf);
+
+    /* Only direct (non-proxy) connections are flushed here */
+    if (!myport->is_proxy)
+        pq_flush(myport);
+    return;
+}
+
+/*
+ * Process MSG_TXN_ROLLBACK message
+ *
+ * The request starts with a flag byte telling whether a GXID or a
+ * transaction handle follows.  The transaction is rolled back and a
+ * TXN_ROLLBACK_RESULT reply carrying the GXID and the rollback status is
+ * sent.  When the request identified the transaction by handle, the GXID
+ * in the reply is InvalidGlobalTransactionId.
+ */
+void
+ProcessRollbackTransactionCommand(Port *myport, StringInfo message)
+{
+    StringInfoData buf;
+    GTM_TransactionHandle txn;
+    /*
+     * Initialised because it is echoed in the reply even when the request
+     * carried a handle and therefore never assigned it.
+     */
+    GlobalTransactionId gxid = InvalidGlobalTransactionId;
+    int isgxid = 0;
+    MemoryContext oldContext;
+    int status = STATUS_OK;
+
+    isgxid = pq_getmsgbyte(message);
+
+    if (isgxid)
+    {
+        const char *data = pq_getmsgbytes(message, sizeof (gxid));
+        if (data == NULL)
+            ereport(ERROR,
+                    (EPROTO,
+                     errmsg("Message does not contain valid GXID")));
+        memcpy(&gxid, data, sizeof (gxid));
+        txn = GTM_GXIDToHandle(gxid);
+    }
+    else
+    {
+        const char *data = pq_getmsgbytes(message, sizeof (txn));
+        if (data == NULL)
+            ereport(ERROR,
+                    (EPROTO,
+                     errmsg("Message does not contain valid Transaction Handle")));
+        memcpy(&txn, data, sizeof (txn));
+    }
+
+    pq_getmsgend(message);
+
+    oldContext = MemoryContextSwitchTo(TopMemoryContext);
+
+    /*
+     * Roll back the transaction
+     */
+    status = GTM_RollbackTransaction(txn);
+
+    MemoryContextSwitchTo(oldContext);
+
+    pq_beginmessage(&buf, 'S');
+    pq_sendint(&buf, TXN_ROLLBACK_RESULT, 4);
+    if (myport->is_proxy)
+    {
+        GTM_ProxyMsgHeader proxyhdr;
+        proxyhdr.ph_conid = myport->conn_id;
+        pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+    }
+    pq_sendbytes(&buf, (char *)&gxid, sizeof(gxid));
+    pq_sendint(&buf, status, sizeof(status));
+    pq_endmessage(myport, &buf);
+
+    /* Only direct (non-proxy) connections are flushed here */
+    if (!myport->is_proxy)
+        pq_flush(myport);
+    return;
+}
+
+
+/*
+ * Process MSG_TXN_COMMIT_MULTI message
+ *
+ * The request carries a transaction count followed, per transaction, by a
+ * flag byte and either a GXID or a transaction handle.  All transactions
+ * are committed in one go; the TXN_COMMIT_MULTI_RESULT reply carries the
+ * count and one status per transaction.
+ */
+void
+ProcessCommitTransactionCommandMulti(Port *myport, StringInfo message)
+{
+    StringInfoData buf;
+    GTM_TransactionHandle txn[GTM_MAX_GLOBAL_TRANSACTIONS];
+    GlobalTransactionId gxid[GTM_MAX_GLOBAL_TRANSACTIONS];
+    int isgxid[GTM_MAX_GLOBAL_TRANSACTIONS];
+    MemoryContext oldContext;
+    int status[GTM_MAX_GLOBAL_TRANSACTIONS];
+    int txn_count;
+    int ii;
+
+    txn_count = pq_getmsgint(message, sizeof (int));
+
+    /*
+     * The count comes straight off the wire and indexes the fixed-size
+     * arrays above; refuse anything outside their bounds.
+     */
+    if (txn_count < 0 || txn_count > GTM_MAX_GLOBAL_TRANSACTIONS)
+        ereport(ERROR,
+                (EPROTO,
+                 errmsg("Message contains invalid transaction count %d", txn_count)));
+
+    for (ii = 0; ii < txn_count; ii++)
+    {
+        isgxid[ii] = pq_getmsgbyte(message);
+        if (isgxid[ii])
+        {
+            const char *data = pq_getmsgbytes(message, sizeof (gxid[ii]));
+            if (data == NULL)
+                ereport(ERROR,
+                        (EPROTO,
+                         errmsg("Message does not contain valid GXID")));
+            memcpy(&gxid[ii], data, sizeof (gxid[ii]));
+            txn[ii] = GTM_GXIDToHandle(gxid[ii]);
+            elog(DEBUG1, "ProcessCommitTransactionCommandMulti: gxid(%u), handle(%u)", gxid[ii], txn[ii]);
+        }
+        else
+        {
+            const char *data = pq_getmsgbytes(message, sizeof (txn[ii]));
+            if (data == NULL)
+                ereport(ERROR,
+                        (EPROTO,
+                         errmsg("Message does not contain valid Transaction Handle")));
+            memcpy(&txn[ii], data, sizeof (txn[ii]));
+            elog(DEBUG1, "ProcessCommitTransactionCommandMulti: handle(%u)", txn[ii]);
+        }
+    }
+
+    pq_getmsgend(message);
+
+    oldContext = MemoryContextSwitchTo(TopMemoryContext);
+
+    /*
+     * Commit the transactions; per-transaction results land in status[]
+     */
+    GTM_CommitTransactionMulti(txn, txn_count, status);
+
+    MemoryContextSwitchTo(oldContext);
+
+    pq_beginmessage(&buf, 'S');
+    pq_sendint(&buf, TXN_COMMIT_MULTI_RESULT, 4);
+    if (myport->is_proxy)
+    {
+        GTM_ProxyMsgHeader proxyhdr;
+        proxyhdr.ph_conid = myport->conn_id;
+        pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+    }
+    pq_sendbytes(&buf, (char *)&txn_count, sizeof(txn_count));
+    pq_sendbytes(&buf, (char *)status, sizeof(int) * txn_count);
+    pq_endmessage(myport, &buf);
+
+    /* Only direct (non-proxy) connections are flushed here */
+    if (!myport->is_proxy)
+        pq_flush(myport);
+    return;
+}
+
+/*
+ * Process MSG_TXN_ROLLBACK_MULTI message
+ *
+ * The request carries a transaction count followed, per transaction, by a
+ * flag byte and either a GXID or a transaction handle.  All transactions
+ * are rolled back in one go; the TXN_ROLLBACK_MULTI_RESULT reply carries
+ * the count and one status per transaction.
+ */
+void
+ProcessRollbackTransactionCommandMulti(Port *myport, StringInfo message)
+{
+    StringInfoData buf;
+    GTM_TransactionHandle txn[GTM_MAX_GLOBAL_TRANSACTIONS];
+    GlobalTransactionId gxid[GTM_MAX_GLOBAL_TRANSACTIONS];
+    int isgxid[GTM_MAX_GLOBAL_TRANSACTIONS];
+    MemoryContext oldContext;
+    int status[GTM_MAX_GLOBAL_TRANSACTIONS];
+    int txn_count;
+    int ii;
+
+    txn_count = pq_getmsgint(message, sizeof (int));
+
+    /*
+     * The count comes straight off the wire and indexes the fixed-size
+     * arrays above; refuse anything outside their bounds.
+     */
+    if (txn_count < 0 || txn_count > GTM_MAX_GLOBAL_TRANSACTIONS)
+        ereport(ERROR,
+                (EPROTO,
+                 errmsg("Message contains invalid transaction count %d", txn_count)));
+
+    for (ii = 0; ii < txn_count; ii++)
+    {
+        isgxid[ii] = pq_getmsgbyte(message);
+        if (isgxid[ii])
+        {
+            const char *data = pq_getmsgbytes(message, sizeof (gxid[ii]));
+            if (data == NULL)
+                ereport(ERROR,
+                        (EPROTO,
+                         errmsg("Message does not contain valid GXID")));
+            memcpy(&gxid[ii], data, sizeof (gxid[ii]));
+            txn[ii] = GTM_GXIDToHandle(gxid[ii]);
+            elog(DEBUG1, "ProcessRollbackTransactionCommandMulti: gxid(%u), handle(%u)", gxid[ii], txn[ii]);
+        }
+        else
+        {
+            const char *data = pq_getmsgbytes(message, sizeof (txn[ii]));
+            if (data == NULL)
+                ereport(ERROR,
+                        (EPROTO,
+                         errmsg("Message does not contain valid Transaction Handle")));
+            memcpy(&txn[ii], data, sizeof (txn[ii]));
+            elog(DEBUG1, "ProcessRollbackTransactionCommandMulti: handle(%u)", txn[ii]);
+        }
+    }
+
+    pq_getmsgend(message);
+
+    oldContext = MemoryContextSwitchTo(TopMemoryContext);
+
+    /*
+     * Roll back the transactions; per-transaction results land in status[]
+     */
+    GTM_RollbackTransactionMulti(txn, txn_count, status);
+
+    MemoryContextSwitchTo(oldContext);
+
+    pq_beginmessage(&buf, 'S');
+    pq_sendint(&buf, TXN_ROLLBACK_MULTI_RESULT, 4);
+    if (myport->is_proxy)
+    {
+        GTM_ProxyMsgHeader proxyhdr;
+        proxyhdr.ph_conid = myport->conn_id;
+        pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+    }
+    pq_sendbytes(&buf, (char *)&txn_count, sizeof(txn_count));
+    pq_sendbytes(&buf, (char *)status, sizeof(int) * txn_count);
+    pq_endmessage(myport, &buf);
+
+    /* Only direct (non-proxy) connections are flushed here */
+    if (!myport->is_proxy)
+        pq_flush(myport);
+    return;
+}
+
+/*
+ * Process MSG_TXN_PREPARE message
+ *
+ * The request starts with a flag byte telling whether a GXID or a
+ * transaction handle follows, then a node count and that many node ids.
+ * The transaction is prepared and a TXN_PREPARE_RESULT reply carrying the
+ * GXID is sent.  When the request identified the transaction by handle,
+ * the GXID in the reply is InvalidGlobalTransactionId.
+ */
+void
+ProcessPrepareTransactionCommand(Port *myport, StringInfo message)
+{
+    StringInfoData buf;
+    GTM_TransactionHandle txn;
+    /*
+     * Initialised because it is echoed in the reply even when the request
+     * carried a handle and therefore never assigned it.
+     */
+    GlobalTransactionId gxid = InvalidGlobalTransactionId;
+    int isgxid = 0;
+    int nodecnt;
+    int status;
+    PGXC_NodeId *nodes;
+    MemoryContext oldContext;
+
+    isgxid = pq_getmsgbyte(message);
+
+    if (isgxid)
+    {
+        const char *data = pq_getmsgbytes(message, sizeof (gxid));
+        if (data == NULL)
+            ereport(ERROR,
+                    (EPROTO,
+                     errmsg("Message does not contain valid GXID")));
+        memcpy(&gxid, data, sizeof (gxid));
+        txn = GTM_GXIDToHandle(gxid);
+    }
+    else
+    {
+        const char *data = pq_getmsgbytes(message, sizeof (txn));
+        if (data == NULL)
+            ereport(ERROR,
+                    (EPROTO,
+                     errmsg("Message does not contain valid Transaction Handle")));
+        memcpy(&txn, data, sizeof (txn));
+    }
+
+    /*
+     * The node count comes straight off the wire: a negative value would
+     * turn into a huge allocation size, and more than GTM_MAX_2PC_NODES
+     * entries do not fit the per-transaction node buffer.
+     */
+    nodecnt = pq_getmsgint(message, sizeof (nodecnt));
+    if (nodecnt < 0 || nodecnt > GTM_MAX_2PC_NODES)
+        ereport(ERROR,
+                (EPROTO,
+                 errmsg("Message contains invalid node count %d", nodecnt)));
+
+    nodes = (PGXC_NodeId *) palloc(sizeof (PGXC_NodeId) * nodecnt);
+    if (nodecnt > 0)
+    {
+        const char *data = pq_getmsgbytes(message, sizeof (PGXC_NodeId) * nodecnt);
+        if (data == NULL)
+            ereport(ERROR,
+                    (EPROTO,
+                     errmsg("Message does not contain valid node list")));
+        memcpy(nodes, data, sizeof (PGXC_NodeId) * nodecnt);
+    }
+
+    pq_getmsgend(message);
+
+    oldContext = MemoryContextSwitchTo(TopMemoryContext);
+
+    /*
+     * Prepare the transaction
+     */
+    status = GTM_PrepareTransaction(txn, nodecnt, nodes);
+
+    MemoryContextSwitchTo(oldContext);
+
+    /* The node list was copied by GTM_PrepareTransaction; release ours */
+    pfree(nodes);
+
+    if (status != STATUS_OK)
+        ereport(ERROR,
+                (EINVAL,
+                 errmsg("Failed to prepare the transaction")));
+
+    pq_beginmessage(&buf, 'S');
+    pq_sendint(&buf, TXN_PREPARE_RESULT, 4);
+    if (myport->is_proxy)
+    {
+        GTM_ProxyMsgHeader proxyhdr;
+        proxyhdr.ph_conid = myport->conn_id;
+        pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+    }
+    pq_sendbytes(&buf, (char *)&gxid, sizeof(gxid));
+    pq_endmessage(myport, &buf);
+
+    /* Only direct (non-proxy) connections are flushed here */
+    if (!myport->is_proxy)
+        pq_flush(myport);
+    return;
+}
+
+/*
+ * Process MSG_TXN_GET_GXID message
+ *
+ * The request carries a transaction handle.  The GXID of that transaction
+ * is obtained through GTM_GetGlobalTransactionId() and returned, together
+ * with the handle, in a TXN_GET_GXID_RESULT reply.  Raises ERROR when no
+ * valid GXID could be obtained.
+ */
+void
+ProcessGetGXIDTransactionCommand(Port *myport, StringInfo message)
+{
+    StringInfoData buf;
+    GTM_TransactionHandle txn;
+    GlobalTransactionId gxid;
+    const char *data;
+    MemoryContext oldContext;
+
+    elog(DEBUG3, "Inside ProcessGetGXIDTransactionCommand");
+
+    data = pq_getmsgbytes(message, sizeof (txn));
+    if (data == NULL)
+        ereport(ERROR,
+                (EPROTO,
+                 errmsg("Message does not contain valid Transaction Handle")));
+    memcpy(&txn, data, sizeof (txn));
+
+    pq_getmsgend(message);
+
+    oldContext = MemoryContextSwitchTo(TopMemoryContext);
+
+    /*
+     * Get the transaction id for the given global transaction.  The failure
+     * case is an *invalid* GXID; the test used to be inverted and raised
+     * the error whenever a valid id had been obtained.
+     */
+    gxid = GTM_GetGlobalTransactionId(txn);
+    if (!GlobalTransactionIdIsValid(gxid))
+        ereport(ERROR,
+                (EINVAL,
+                 errmsg("Failed to get the transaction id")));
+
+    MemoryContextSwitchTo(oldContext);
+
+    /* GXIDs are printed with %u everywhere else in this file */
+    elog(DEBUG3, "Sending transaction id %u", gxid);
+
+    pq_beginmessage(&buf, 'S');
+    pq_sendint(&buf, TXN_GET_GXID_RESULT, 4);
+    if (myport->is_proxy)
+    {
+        GTM_ProxyMsgHeader proxyhdr;
+        proxyhdr.ph_conid = myport->conn_id;
+        pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+    }
+    pq_sendbytes(&buf, (char *)&txn, sizeof(txn));
+    pq_sendbytes(&buf, (char *)&gxid, sizeof(gxid));
+    pq_endmessage(myport, &buf);
+
+    /* Only direct (non-proxy) connections are flushed here */
+    if (!myport->is_proxy)
+        pq_flush(myport);
+    return;
+}
+
+
+/*
+ * Mark GTM as shutting down. From this point onwards no new GXIDs are
+ * issued, to ensure that the last GXID recorded in the control file
+ * remains sane.
+ *
+ * The state flag is written while gt_XidGenLock is held in write mode,
+ * the same lock SetNextGlobalTransactionId() holds when it sets the state
+ * to GTM_RUNNING.
+ */
+void
+GTM_SetShuttingDown(void)
+{
+ GTM_RWLockAcquire(&GTMTransactions.gt_XidGenLock, GTM_LOCKMODE_WRITE);
+ GTMTransactions.gt_gtm_state = GTM_SHUTTING_DOWN;
+ GTM_RWLockRelease(&GTMTransactions.gt_XidGenLock);
+}
+
+/*
+ * Restore the next GXID at start-up.
+ *
+ * ctlfd     - descriptor of the control file, or -1 when it is not open
+ * next_gxid - GXID requested by the caller; when it is not valid, the
+ *             value saved in the control file is used instead
+ *
+ * A GXID-sized value is always read from ctlfd when it is open, so the
+ * file position ends up past the saved GXID even if next_gxid wins.  When
+ * neither source yields a valid GXID nothing is changed.
+ */
+void
+GTM_RestoreTxnInfo(int ctlfd, GlobalTransactionId next_gxid)
+{
+    GlobalTransactionId saved_gxid;
+
+    if (ctlfd != -1)
+    {
+        if ((read(ctlfd, &saved_gxid, sizeof (saved_gxid)) != sizeof (saved_gxid)) &&
+            (!GlobalTransactionIdIsValid(next_gxid)))
+            return;
+        if (!GlobalTransactionIdIsValid(next_gxid))
+            next_gxid = saved_gxid;
+    }
+
+    /*
+     * Nothing to restore (no control file and no explicit GXID, or an
+     * invalid saved value).  Bail out like the read-failure path above
+     * instead of storing "invalid - 1", which wraps around, as the latest
+     * completed XID.
+     */
+    if (!GlobalTransactionIdIsValid(next_gxid))
+        return;
+
+    elog(LOG, "Restoring last GXID to %u\n", next_gxid);
+
+    SetNextGlobalTransactionId(next_gxid);
+
+    /* Set this otherwise a strange snapshot might be returned for the first one */
+    GTMTransactions.gt_latestCompletedXid = next_gxid - 1;
+    return;
+}
+
+/*
+ * Save the next GXID to the control file.
+ *
+ * ctlfd - descriptor of the control file, opened for writing by the caller
+ *
+ * The value comes from ReadNewGlobalTransactionId().  A failed or short
+ * write is logged instead of being silently ignored.
+ */
+void
+GTM_SaveTxnInfo(int ctlfd)
+{
+    GlobalTransactionId next_gxid;
+
+    next_gxid = ReadNewGlobalTransactionId();
+
+    elog(LOG, "Saving transaction info - next_gxid: %u", next_gxid);
+
+    if (write(ctlfd, &next_gxid, sizeof (next_gxid)) != (ssize_t) sizeof (next_gxid))
+        elog(LOG, "Failed to save next_gxid %u to the control file", next_gxid);
+}
+/*
+ * TODO
+ */
+int GTM_GetAllTransactions(GTM_TransactionInfo txninfo[], uint32 txncnt);
+
+/*
+ * TODO
+ */
+uint32 GTM_GetAllPrepared(GlobalTransactionId gxids[], uint32 gxidcnt);
+
diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c
new file mode 100644
index 0000000000..0ef09c436a
--- /dev/null
+++ b/src/gtm/main/main.c
@@ -0,0 +1,1370 @@
+/*-------------------------------------------------------------------------
+ *
+ * main.c
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <time.h>
+#include <unistd.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <sys/select.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <getopt.h>
+#include <stdio.h>
+
+#include "gtm/gtm_c.h"
+#include "gtm/gtm.h"
+#include "gtm/elog.h"
+#include "gtm/memutils.h"
+#include "gtm/gtm_list.h"
+#include "gtm/libpq.h"
+#include "gtm/libpq-be.h"
+#include "gtm/pqsignal.h"
+#include "gtm/pqformat.h"
+#include "gtm/assert.h"
+#include "gtm/gtm_txn.h"
+#include "gtm/gtm_seq.h"
+#include "gtm/gtm_msg.h"
+
+extern int optind;
+extern char *optarg;
+
+#define GTM_MAX_PATH 1024
+#define GTM_DEFAULT_HOSTNAME "*"
+#define GTM_DEFAULT_PORT 6666
+#define GTM_CONTROL_FILE "gtm.control"
+#define GTM_PID_FILE "gtm.pid"
+#define GTM_LOG_FILE "gtm.log"
+
+static char *progname = "gtm";
+char *ListenAddresses;
+int GTMPortNumber;
+char GTMControlFile[GTM_MAX_PATH];
+char *GTMDataDir;
+
+/* The socket(s) we're listening to. */
+#define MAXLISTEN 64
+static int ListenSocket[MAXLISTEN];
+
+pthread_key_t threadinfo_key;
+static bool GTMAbortPending = false;
+
+static Port *ConnCreate(int serverFd);
+static int ServerLoop(void);
+static int initMasks(fd_set *rmask);
+void *GTM_ThreadMain(void *argp);
+static int GTMAddConnection(Port *port);
+static int ReadCommand(Port *myport, StringInfo inBuf);
+
+static void ProcessCommand(Port *myport, StringInfo input_message);
+static void ProcessCoordinatorCommand(Port *myport, GTM_MessageType mtype, StringInfo message);
+static void ProcessTransactionCommand(Port *myport, GTM_MessageType mtype, StringInfo message);
+static void ProcessSnapshotCommand(Port *myport, GTM_MessageType mtype, StringInfo message);
+static void ProcessSeqeunceCommand(Port *myport, GTM_MessageType mtype, StringInfo message);
+static void ProcessQueryCommand(Port *myport, GTM_MessageType mtype, StringInfo message);
+
+static void GTM_RegisterCoordinator(Port *myport, GTM_CoordinatorId coordinator_id);
+static void GTM_UnregisterCoordinator(Port *myport, GTM_CoordinatorId coordinator_id);
+
+static bool CreateOptsFile(int argc, char *argv[]);
+static void CreateDataDirLockFile(void);
+static void CreateLockFile(const char *filename, const char *refName);
+static void ChangeToDataDir(void);
+static void checkDataDir(void);
+static void DeleteLockFile(const char *filename);
+
+/*
+ * One-time initialization. It's called immediately after the main process
+ * starts.
+ *
+ * Creates the thread-info key, initialises the lock protecting the global
+ * thread table and allocates the main thread's GTM_ThreadInfo.  Returns
+ * the new structure; exits the process if it cannot be allocated or
+ * registered, since there is nothing sensible to continue with.
+ */
+static GTM_ThreadInfo *
+MainThreadInit()
+{
+    GTM_ThreadInfo *thrinfo;
+
+    pthread_key_create(&threadinfo_key, NULL);
+
+    /*
+     * Initialize the lock protecting the global threads info
+     */
+    GTM_RWLockInit(&GTMThreads->gt_lock);
+
+    /*
+     * We are called even before memory context management is setup. We must
+     * use malloc
+     */
+    thrinfo = (GTM_ThreadInfo *)malloc(sizeof (GTM_ThreadInfo));
+
+    if (thrinfo == NULL)
+    {
+        fprintf(stderr, "malloc failed: %d", errno);
+        fflush(stdout);
+        fflush(stderr);
+        /* Continuing would hand a NULL thread info to every later step */
+        exit(1);
+    }
+
+    if (SetMyThreadInfo(thrinfo))
+    {
+        fprintf(stderr, "SetMyThreadInfo failed: %d", errno);
+        fflush(stdout);
+        fflush(stderr);
+        exit(1);
+    }
+
+    return thrinfo;
+}
+
+/*
+ * Basic start-up initialization, run once from main() after option
+ * parsing: main thread info, memory contexts, data directory checks and
+ * lock file, control/log file names, debug log, transaction and sequence
+ * managers, and finally registration of the main thread.
+ */
+static void
+BaseInit()
+{
+    GTM_ThreadInfo *thrinfo;
+
+    thrinfo = MainThreadInit();
+
+    MyThreadID = pthread_self();
+
+    MemoryContextInit();
+
+    checkDataDir();
+    ChangeToDataDir();
+    CreateDataDirLockFile();
+
+    /* Bounded formatting: GTMDataDir comes from the command line */
+    snprintf(GTMControlFile, sizeof (GTMControlFile), "%s/%s", GTMDataDir, GTM_CONTROL_FILE);
+    if (GTMLogFile == NULL)
+    {
+        GTMLogFile = (char *) malloc(GTM_MAX_PATH);
+        if (GTMLogFile == NULL)
+        {
+            fprintf(stderr, "malloc failed: %d", errno);
+            fflush(stdout);
+            fflush(stderr);
+            exit(1);
+        }
+        snprintf(GTMLogFile, GTM_MAX_PATH, "%s/%s", GTMDataDir, GTM_LOG_FILE);
+    }
+
+    DebugFileOpen();
+
+    GTM_InitTxnManager();
+    GTM_InitSeqManager();
+
+    /*
+     * The memory context is now set up.
+     * Add the thrinfo structure in the global array
+     */
+    if (GTM_ThreadAdd(thrinfo) == -1)
+    {
+        fprintf(stderr, "GTM_ThreadAdd for main thread failed: %d", errno);
+        fflush(stdout);
+        fflush(stderr);
+    }
+}
+
+/*
+ * Signal handler installed by main().
+ *
+ * For SIGKILL, SIGTERM, SIGQUIT, SIGINT and SIGHUP it removes the pid
+ * file, blocks further signals and raises GTMAbortPending so that
+ * ServerLoop() can save state and exit.  Any other signal is only
+ * reported.
+ */
+static void
+GTM_SigleHandler(int signal)
+{
+    bool is_shutdown_signal;
+
+    fprintf(stderr, "Received signal %d", signal);
+
+    is_shutdown_signal = (signal == SIGKILL ||
+                          signal == SIGTERM ||
+                          signal == SIGQUIT ||
+                          signal == SIGINT ||
+                          signal == SIGHUP);
+
+    if (!is_shutdown_signal)
+    {
+        fprintf(stderr, "Unknown signal %d\n", signal);
+        return;
+    }
+
+    /*
+     * XXX We should do a clean shutdown here.
+     */
+
+    /* The pid file goes away before the shutdown is flagged */
+    DeleteLockFile(GTM_PID_FILE);
+
+    PG_SETMASK(&BlockSig);
+    GTMAbortPending = true;
+}
+
+/*
+ * help -- print the command-line usage summary to stdout.
+ *
+ * The option list shown here should match the options accepted by the
+ * getopt() call in main() ("h:p:x:D:l:").
+ *
+ * progname is interpolated into the "Usage:" line; note that the
+ * parameter shadows the file-level static of the same name.
+ */
+static void
+help(const char *progname)
+{
+ printf(_("This is the GTM server.\n\n"));
+ printf(_("Usage:\n %s [OPTION]...\n\n"), progname);
+ printf(_("Options:\n"));
+ printf(_(" -h hostname GTM server hostname/IP\n"));
+ printf(_(" -p port GTM server port number\n"));
+ printf(_(" -x xid Starting GXID \n"));
+ printf(_(" -D directory GTM working directory\n"));
+ printf(_(" -l filename GTM server log file name \n"));
+ printf(_(" --help show this help, then exit\n"));
+}
+
+/*
+ * GTM server entry point.
+ *
+ * Parses the command line, performs basic initialization, restores the
+ * last GXID and sequence state from the control file, opens the listen
+ * sockets, installs the signal handlers and enters ServerLoop().
+ *
+ * Exits with status 1 on invalid arguments; otherwise the exit status is
+ * derived from the value returned by ServerLoop().
+ */
+int
+main(int argc, char *argv[])
+{
+    int opt;
+    int status;
+    int i;
+    GlobalTransactionId next_gxid = InvalidGlobalTransactionId;
+    int ctlfd;
+
+    /*
+     * Catch standard options before doing much else
+     */
+    if (argc > 1)
+    {
+        if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
+        {
+            help(argv[0]);
+            exit(0);
+        }
+    }
+
+    ListenAddresses = GTM_DEFAULT_HOSTNAME;
+    GTMPortNumber = GTM_DEFAULT_PORT;
+
+    /*
+     * Parse the command line options and set variables
+     */
+    while ((opt = getopt(argc, argv, "h:p:x:D:l:")) != -1)
+    {
+        switch (opt)
+        {
+            case 'h':
+                ListenAddresses = strdup(optarg);
+                break;
+
+            case 'p':
+                GTMPortNumber = atoi(optarg);
+                break;
+
+            case 'x':
+                next_gxid = (GlobalTransactionId )atoll(optarg);
+                break;
+
+            case 'D':
+                GTMDataDir = strdup(optarg);
+                canonicalize_path(GTMDataDir);
+                break;
+
+            case 'l':
+                GTMLogFile = strdup(optarg);
+                break;
+
+            default:
+                /* Unknown option: do not start with a half-understood command line */
+                write_stderr("Try \"%s --help\" for more information.\n",
+                             progname);
+                exit(1);
+        }
+    }
+
+    if (GTMDataDir == NULL)
+    {
+        write_stderr("GTM data directory must be specified\n");
+        write_stderr("Try \"%s --help\" for more information.\n",
+                     progname);
+        exit(1);
+    }
+    /*
+     * GTM accepts no non-option switch arguments.
+     */
+    if (optind < argc)
+    {
+        write_stderr("%s: invalid argument: \"%s\"\n",
+                     progname, argv[optind]);
+        write_stderr("Try \"%s --help\" for more information.\n",
+                     progname);
+        exit(1);
+    }
+
+    /*
+     * Some basic initialization must happen before we do anything
+     * useful
+     */
+    BaseInit();
+
+    elog(DEBUG3, "Starting GTM server at (%s:%d) -- control file %s", ListenAddresses, GTMPortNumber, GTMControlFile);
+
+    /*
+     * Read the last GXID and start from there.  The control file may not
+     * exist yet, in which case ctlfd is -1 and must not be closed.
+     */
+    ctlfd = open(GTMControlFile, O_RDONLY);
+
+    GTM_RestoreTxnInfo(ctlfd, next_gxid);
+    GTM_RestoreSeqInfo(ctlfd);
+
+    if (ctlfd != -1)
+        close(ctlfd);
+
+    /*
+     * Establish input sockets.
+     */
+    for (i = 0; i < MAXLISTEN; i++)
+        ListenSocket[i] = -1;
+
+    if (ListenAddresses)
+    {
+        status = StreamServerPort(AF_UNSPEC, ListenAddresses,
+                                  (unsigned short) GTMPortNumber,
+                                  ListenSocket, MAXLISTEN);
+        if (status != STATUS_OK)
+            ereport(FATAL,
+                    (errmsg("could not create listen socket for \"%s\"",
+                            ListenAddresses)));
+    }
+
+    /*
+     * check that we have some socket to listen on
+     */
+    if (ListenSocket[0] == -1)
+        ereport(FATAL,
+                (errmsg("no socket created for listening")));
+
+    /*
+     * Record gtm options. We delay this till now to avoid recording
+     * bogus options
+     */
+    if (!CreateOptsFile(argc, argv))
+        exit(1);
+
+    /*
+     * NOTE(review): POSIX does not allow SIGKILL to be caught, so that
+     * registration is presumably a no-op; kept as in the original.
+     */
+    pqsignal(SIGHUP, GTM_SigleHandler);
+    pqsignal(SIGKILL, GTM_SigleHandler);
+    pqsignal(SIGQUIT, GTM_SigleHandler);
+    pqsignal(SIGTERM, GTM_SigleHandler);
+    pqsignal(SIGINT, GTM_SigleHandler);
+
+    pqinitmask();
+
+    /*
+     * Accept any new connections. Fork a new thread for each incoming
+     * connection
+     */
+    status = ServerLoop();
+
+    /*
+     * ServerLoop probably shouldn't ever return, but if it does, close down.
+     */
+    exit(status != STATUS_OK);
+
+    return 0; /* not reached */
+}
+
+/*
+ * ConnCreate -- create a local connection data structure
+ *
+ * Allocates a zeroed Port and accepts the pending connection on serverFd
+ * into it.  Returns the new Port with conn_id set to
+ * InvalidGTMProxyConnID, or NULL when the connection could not be
+ * established.  Exits the process when the Port cannot be allocated.
+ */
+static Port *
+ConnCreate(int serverFd)
+{
+    Port *port;
+
+    if (!(port = (Port *) calloc(1, sizeof(Port))))
+    {
+        ereport(LOG,
+                (ENOMEM,
+                 errmsg("out of memory")));
+        exit(1);
+    }
+
+    if (StreamConnection(serverFd, port) != STATUS_OK)
+    {
+        if (port->sock >= 0)
+            StreamClose(port->sock);
+        ConnFree(port);
+
+        /*
+         * Return right away: falling through would write conn_id through
+         * the pointer that has just been freed and set to NULL.
+         */
+        return NULL;
+    }
+
+    port->conn_id = InvalidGTMProxyConnID;
+    return port;
+}
+
+/*
+ * ConnFree -- free a local connection data structure
+ *
+ * Releases the Port with free(); ConnCreate() in this file allocates it
+ * with calloc().  The socket is not closed here: ConnCreate() closes it
+ * itself before calling this function.
+ */
+void
+ConnFree(Port *conn)
+{
+ free(conn);
+}
+
+/*
+ * Main idle loop of the GTM server
+ *
+ * Waits with select() on the listening sockets recorded in ListenSocket[]
+ * and, for every socket that becomes readable, builds a Port with
+ * ConnCreate() and hands it to GTMAddConnection().
+ *
+ * At the top of every iteration GTMAbortPending is checked; when it is set
+ * the control file is rewritten through GTM_SaveTxnInfo()/GTM_SaveSeqInfo()
+ * and the process exits.
+ *
+ * Returns STATUS_ERROR if select() fails with an errno other than EINTR or
+ * EWOULDBLOCK; there is no other return path.
+ */
+static int
+ServerLoop(void)
+{
+ fd_set readmask;
+ int nSockets;
+
+ nSockets = initMasks(&readmask);
+
+ for (;;)
+ {
+ fd_set rmask;
+ int selres;
+
+ //MemoryContextStats(TopMostMemoryContext);
+
+ /*
+ * Wait for a connection request to arrive.
+ *
+ * We wait at most one minute, so that the GTMAbortPending check below
+ * is re-evaluated even when no requests are arriving.
+ *
+ * select() overwrites the set it is given, hence the working copy.
+ */
+ memcpy((char *) &rmask, (char *) &readmask, sizeof(fd_set));
+
+ PG_SETMASK(&UnBlockSig);
+
+ if (GTMAbortPending)
+ {
+ int ctlfd;
+
+ /*
+ * XXX We should do a clean shutdown here. For the time being, just
+ * write the next GXID to be issued in the control file and exit
+ * gracefully
+ */
+
+ /*
+ * Tell GTM that we are shutting down so that no new GXIDs are
+ * issued this point onwards
+ */
+ GTM_SetShuttingDown();
+
+ ctlfd = open(GTMControlFile, O_WRONLY | O_TRUNC | O_CREAT,
+ S_IRUSR | S_IWUSR);
+ if (ctlfd == -1)
+ {
+ fprintf(stderr, "Failed to create/open the control file\n");
+ exit(2);
+ }
+
+ GTM_SaveTxnInfo(ctlfd);
+ GTM_SaveSeqInfo(ctlfd);
+
+ close(ctlfd);
+
+ /* NOTE(review): exit status is 1 even though the state was saved */
+ exit(1);
+ }
+
+ {
+ /* must set timeout each time; some OSes change it! */
+ struct timeval timeout;
+
+ timeout.tv_sec = 60;
+ timeout.tv_usec = 0;
+
+ selres = select(nSockets, &rmask, NULL, NULL, &timeout);
+ }
+
+ /*
+ * Block all signals until we wait again. (This makes it safe for our
+ * signal handlers to do nontrivial work.)
+ */
+ PG_SETMASK(&BlockSig);
+
+ /* Now check the select() result */
+ if (selres < 0)
+ {
+ /* EINTR/EWOULDBLOCK are not errors: just go round the loop again */
+ if (errno != EINTR && errno != EWOULDBLOCK)
+ {
+ ereport(LOG,
+ (EACCES,
+ errmsg("select() failed in postmaster: %m")));
+ return STATUS_ERROR;
+ }
+ }
+
+ /*
+ * New connection pending on any of our sockets? If so, create a Port
+ * for it and pass it to GTMAddConnection(), which starts a thread to
+ * serve it.
+ */
+ if (selres > 0)
+ {
+ int i;
+
+ for (i = 0; i < MAXLISTEN; i++)
+ {
+ /* the first -1 entry marks the end of the sockets in use */
+ if (ListenSocket[i] == -1)
+ break;
+ if (FD_ISSET(ListenSocket[i], &rmask))
+ {
+ Port *port;
+
+ port = ConnCreate(ListenSocket[i]);
+ if (port)
+ {
+ if (GTMAddConnection(port) != STATUS_OK)
+ {
+ /*
+ * NOTE(review): if elog(ERROR) does not return to
+ * this point, the two cleanup calls below are never
+ * executed -- confirm against the elog implementation.
+ */
+ elog(ERROR, "Too many connections");
+ StreamClose(port->sock);
+ ConnFree(port);
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+/*
+ * Build the select() read mask for the sockets we are listening on.
+ *
+ * rmask is cleared and then every descriptor in ListenSocket[], up to the
+ * first -1 entry (or MAXLISTEN entries), is added to it.
+ *
+ * Returns the highest descriptor added plus one, i.e. the value to pass as
+ * the first argument of select(); 0 if no socket is in use.
+ */
+static int
+initMasks(fd_set *rmask)
+{
+	int			highest = -1;
+	int			idx = 0;
+
+	FD_ZERO(rmask);
+
+	while (idx < MAXLISTEN && ListenSocket[idx] != -1)
+	{
+		int			sock = ListenSocket[idx];
+
+		FD_SET(sock, rmask);
+		if (highest < sock)
+			highest = sock;
+		idx++;
+	}
+
+	return highest + 1;
+}
+
+
+/*
+ * GTM_ThreadMain -- entry point of a per-connection worker thread
+ *
+ * argp is the GTM_ThreadInfo of the thread; its thr_conn->con_port is the
+ * client connection served here.
+ *
+ * The function
+ *  1. creates the MessageContext memory context,
+ *  2. reads the startup message (type byte 'A' followed by a
+ *     GTM_StartupPacket) and records the coordinator id and proxy flag,
+ *  3. answers with an empty 'R' message,
+ *  4. installs the sigsetjmp() error-recovery point, and
+ *  5. loops reading messages with ReadCommand(): 'C' is dispatched to
+ *     ProcessCommand(), 'F' flushes pending output, 'X' or EOF removes the
+ *     thread's transaction infos and ends the thread with pthread_exit(),
+ *     anything else is reported as FATAL.
+ *
+ * The trailing "return thrinfo" is not expected to be reached.
+ */
+void *
+GTM_ThreadMain(void *argp)
+{
+ GTM_ThreadInfo *thrinfo = (GTM_ThreadInfo *)argp;
+ int qtype;
+ StringInfoData input_message;
+ sigjmp_buf local_sigjmp_buf;
+
+ elog(DEBUG3, "Starting the connection helper thread");
+
+
+ /*
+ * Create the memory context we will use in the main loop.
+ *
+ * MessageContext is reset once per iteration of the main loop, ie, upon
+ * completion of processing of each command message from the client.
+ *
+ * This context is thread-specific
+ */
+ MessageContext = AllocSetContextCreate(TopMemoryContext,
+ "MessageContext",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE,
+ false);
+
+
+ {
+ /*
+ * We expect a startup message at the very start. The message type is
+ * REGISTER_COORD, followed by the 4 byte coordinator ID
+ */
+ /*
+ * NOTE(review): ReadCommand() keeps the result of pq_getbyte() in an
+ * int so that it can be compared with EOF; here it is narrowed to a
+ * char first -- consider using int as well.
+ */
+ char startup_type;
+ GTM_StartupPacket sp;
+ StringInfoData inBuf;
+
+ startup_type = pq_getbyte(thrinfo->thr_conn->con_port);
+
+ /*
+ * NOTE(review): PG_exception_stack is only pointed at
+ * local_sigjmp_buf further down, so the ERRORs raised in this block
+ * are not caught by the handler below -- confirm how they are handled.
+ */
+ if (startup_type != 'A')
+ ereport(ERROR,
+ (EPROTO,
+ errmsg("Expecting a startup message, but received %c",
+ startup_type)));
+
+ initStringInfo(&inBuf);
+
+ /*
+ * All frontend messages have a length word next
+ * after the type code; we can read the message contents independently of
+ * the type.
+ */
+ if (pq_getmessage(thrinfo->thr_conn->con_port, &inBuf, 0))
+ ereport(ERROR,
+ (EPROTO,
+ errmsg("Expecting coordinator ID, but received EOF")));
+
+ /* the packet is copied out of the message buffer as raw bytes */
+ memcpy(&sp,
+ pq_getmsgbytes(&inBuf, sizeof (GTM_StartupPacket)),
+ sizeof (GTM_StartupPacket));
+ pq_getmsgend(&inBuf);
+
+ GTM_RegisterCoordinator(thrinfo->thr_conn->con_port, sp.sp_cid);
+ thrinfo->thr_conn->con_port->is_proxy = sp.sp_isproxy;
+ }
+
+ {
+ /*
+ * Send a dummy authentication request message 'R' as the client
+ * expects that in the current protocol
+ */
+ StringInfoData buf;
+ pq_beginmessage(&buf, 'R');
+ pq_endmessage(thrinfo->thr_conn->con_port, &buf);
+ pq_flush(thrinfo->thr_conn->con_port);
+
+ elog(DEBUG3, "Sent connection authentication message to the client");
+ }
+
+ /*
+ * Get the input_message in the TopMemoryContext so that we don't need to
+ * free/palloc it for every incoming message. Unlike Postgres, we don't
+ * expect the incoming messages to be of arbitrary sizes
+ */
+
+ initStringInfo(&input_message);
+
+ /*
+ * POSTGRES main processing loop begins here
+ *
+ * If an exception is encountered, processing resumes here so we abort the
+ * current transaction and start a new one.
+ *
+ * You might wonder why this isn't coded as an infinite loop around a
+ * PG_TRY construct. The reason is that this is the bottom of the
+ * exception stack, and so with PG_TRY there would be no exception handler
+ * in force at all during the CATCH part. By leaving the outermost setjmp
+ * always active, we have at least some chance of recovering from an error
+ * during error recovery. (If we get into an infinite loop thereby, it
+ * will soon be stopped by overflow of elog.c's internal state stack.)
+ */
+
+ if (sigsetjmp(local_sigjmp_buf, 1) != 0)
+ {
+ /*
+ * NOTE: if you are tempted to add more code in this if-block,
+ * consider the high probability that it should be in
+ * AbortTransaction() instead. The only stuff done directly here
+ * should be stuff that is guaranteed to apply *only* for outer-level
+ * error recovery, such as adjusting the FE/BE protocol status.
+ */
+
+ /* Report the error to the client and/or server log */
+ if (thrinfo->thr_conn)
+ EmitErrorReport(thrinfo->thr_conn->con_port);
+ else
+ EmitErrorReport(NULL);
+
+ /*
+ * Now return to normal top-level context and clear ErrorContext for
+ * next time.
+ */
+ MemoryContextSwitchTo(TopMemoryContext);
+ FlushErrorState();
+ }
+
+ /* We can now handle ereport(ERROR) */
+ PG_exception_stack = &local_sigjmp_buf;
+
+
+ for (;;)
+ {
+ /*
+ * Release storage left over from prior query cycle, and create a new
+ * query input buffer in the cleared MessageContext.
+ */
+ MemoryContextSwitchTo(MessageContext);
+ MemoryContextResetAndDeleteChildren(MessageContext);
+
+ /*
+ * Just reset the input buffer to avoid repeated palloc/pfrees
+ *
+ * XXX We should consider resetting the MessageContext periodically to
+ * handle any memory leaks
+ */
+ resetStringInfo(&input_message);
+
+ /*
+ * (3) read a command (loop blocks here)
+ */
+ qtype = ReadCommand(thrinfo->thr_conn->con_port, &input_message);
+
+ switch(qtype)
+ {
+ case 'C':
+ ProcessCommand(thrinfo->thr_conn->con_port, &input_message);
+ break;
+
+ case 'X':
+ case EOF:
+ /*
+ * Connection termination request
+ * Remove all transactions opened within the thread
+ */
+ GTM_RemoveAllTransInfos(-1);
+ pthread_exit(thrinfo);
+ break;
+
+ case 'F':
+ /*
+ * Flush all the outgoing data on the wire. Consume the message
+ * type field for sanity
+ */
+ pq_getmsgint(&input_message, sizeof (GTM_MessageType));
+ pq_getmsgend(&input_message);
+ pq_flush(thrinfo->thr_conn->con_port);
+ break;
+
+ default:
+ /*
+ * Remove all transactions opened within the thread
+ */
+ GTM_RemoveAllTransInfos(-1);
+
+ ereport(FATAL,
+ (EPROTO,
+ errmsg("invalid frontend message type %d",
+ qtype)));
+ break;
+ }
+
+ }
+
+ /* can't get here because the above loop never exits */
+ Assert(false);
+
+ return thrinfo;
+}
+
+/*
+ * ProcessCommand -- dispatch one 'C' message to its handler
+ *
+ * When the connection belongs to a proxy (myport->is_proxy), the message
+ * starts with a GTM_ProxyMsgHeader whose ph_conid identifies the backend
+ * connection; otherwise InvalidGTMProxyConnID is used.  The id is stored
+ * in myport->conn_id, then the message type is read and the message is
+ * passed to the handler of the matching command family.
+ *
+ * An unknown message type is reported as FATAL.
+ */
+void
+ProcessCommand(Port *myport, StringInfo input_message)
+{
+	GTM_ProxyMsgHeader hdr;
+	GTM_MessageType msgtype;
+
+	if (!myport->is_proxy)
+		hdr.ph_conid = InvalidGTMProxyConnID;
+	else
+		pq_copymsgbytes(input_message, (char *)&hdr, sizeof (GTM_ProxyMsgHeader));
+
+	myport->conn_id = hdr.ph_conid;
+	msgtype = pq_getmsgint(input_message, sizeof (GTM_MessageType));
+
+	switch (msgtype)
+	{
+		/* coordinator registration */
+		case MSG_UNREGISTER_COORD:
+			ProcessCoordinatorCommand(myport, msgtype, input_message);
+			break;
+
+		/* transaction management */
+		case MSG_TXN_BEGIN:
+		case MSG_TXN_BEGIN_GETGXID:
+		case MSG_TXN_BEGIN_GETGXID_AUTOVACUUM:
+		case MSG_TXN_BEGIN_GETGXID_MULTI:
+		case MSG_TXN_PREPARE:
+		case MSG_TXN_COMMIT:
+		case MSG_TXN_COMMIT_MULTI:
+		case MSG_TXN_ROLLBACK:
+		case MSG_TXN_ROLLBACK_MULTI:
+		case MSG_TXN_GET_GXID:
+			ProcessTransactionCommand(myport, msgtype, input_message);
+			break;
+
+		/* snapshots */
+		case MSG_SNAPSHOT_GET:
+		case MSG_SNAPSHOT_GET_MULTI:
+		case MSG_SNAPSHOT_GXID_GET:
+			ProcessSnapshotCommand(myport, msgtype, input_message);
+			break;
+
+		/* sequences */
+		case MSG_SEQUENCE_INIT:
+		case MSG_SEQUENCE_GET_CURRENT:
+		case MSG_SEQUENCE_GET_NEXT:
+		case MSG_SEQUENCE_RESET:
+		case MSG_SEQUENCE_CLOSE:
+			ProcessSeqeunceCommand(myport, msgtype, input_message);
+			break;
+
+		/* status queries */
+		case MSG_TXN_GET_STATUS:
+		case MSG_TXN_GET_ALL_PREPARED:
+			ProcessQueryCommand(myport, msgtype, input_message);
+			break;
+
+		/* a backend went away: drop its transactions */
+		case MSG_BACKEND_DISCONNECT:
+			GTM_RemoveAllTransInfos(hdr.ph_conid);
+			break;
+
+		default:
+			ereport(FATAL,
+					(EPROTO,
+					 errmsg("invalid frontend message type %d",
+							msgtype)));
+	}
+}
+
+/*
+ * GTMAddConnection -- start a worker thread for a new client connection
+ *
+ * Allocates a GTM_ConnectionInfo, attaches port to it and asks
+ * GTM_ThreadCreate() to run GTM_ThreadMain for it.
+ *
+ * Returns STATUS_OK on success and STATUS_ERROR if the allocation or the
+ * thread creation fails (both failures are also reported at ERROR level).
+ */
+static int
+GTMAddConnection(Port *port)
+{
+	GTM_ConnectionInfo *info;
+
+	info = (GTM_ConnectionInfo *)palloc(sizeof (GTM_ConnectionInfo));
+	if (!info)
+	{
+		ereport(ERROR,
+				(ENOMEM,
+				 errmsg("Out of memory")));
+		return STATUS_ERROR;
+	}
+
+	elog(DEBUG3, "Started new connection");
+	info->con_port = port;
+
+	/* XXX Start the thread */
+	if (!GTM_ThreadCreate(info, GTM_ThreadMain))
+	{
+		elog(ERROR, "failed to create a new thread");
+		return STATUS_ERROR;
+	}
+
+	return STATUS_OK;
+}
+
+/* ----------------
+ * ReadCommand reads one message from the client connection
+ *
+ * The type byte is read first; the message body is then read into
+ * inBuf with pq_getmessage().
+ *
+ * Returns the message type code ('C', 'X' or 'F'), or EOF when the
+ * client has disconnected or the body cannot be read.  Any other
+ * type code is reported at ERROR level.
+ * ----------------
+ */
+static int
+ReadCommand(Port *myport, StringInfo inBuf)
+{
+	int			msgcode = pq_getbyte(myport);
+
+	/* frontend disconnected */
+	if (msgcode == EOF)
+	{
+		ereport(COMMERROR,
+				(EPROTO,
+				 errmsg("unexpected EOF on client connection")));
+		return EOF;
+	}
+
+	/*
+	 * Check the type code before looking at the body: if message boundary
+	 * sync has been lost, the next bytes are garbage and must not be
+	 * interpreted as a length word.
+	 */
+	if (msgcode != 'C' && msgcode != 'X' && msgcode != 'F')
+		ereport(ERROR,
+				(EPROTO,
+				 errmsg("invalid frontend message type %d", msgcode)));
+
+	/*
+	 * Every message carries a length word right after the type code, so
+	 * the body can be read without knowing the type.  pq_getmessage() has
+	 * already logged a suitable message if it fails.
+	 */
+	if (pq_getmessage(myport, inBuf, 0))
+		return EOF;
+
+	return msgcode;
+}
+
+/*
+ * Handle a coordinator-related message.
+ *
+ * The coordinator id is read from the message first; MSG_UNREGISTER_COORD
+ * is the only type handled, any other type trips an assertion.  The
+ * message is then checked for its end with pq_getmsgend().
+ */
+static void
+ProcessCoordinatorCommand(Port *myport, GTM_MessageType mtype, StringInfo message)
+{
+	GTM_CoordinatorId coord_id = pq_getmsgint(message, sizeof (GTM_CoordinatorId));
+
+	if (mtype == MSG_UNREGISTER_COORD)
+	{
+		GTM_UnregisterCoordinator(myport, coord_id);
+	}
+	else
+	{
+		Assert(0);		/* the dispatcher never sends anything else */
+	}
+
+	pq_getmsgend(message);
+}
+
+/*
+ * Route a transaction-related message to its dedicated handler.
+ *
+ * mtype selects the handler; message is passed on unread.  A message type
+ * outside the transaction family trips an assertion.
+ */
+static void
+ProcessTransactionCommand(Port *myport, GTM_MessageType mtype, StringInfo message)
+{
+	elog(DEBUG1, "ProcessTransactionCommand: mtype:%d", mtype);
+
+	switch (mtype)
+	{
+		/* --- begin --- */
+		case MSG_TXN_BEGIN:
+			ProcessBeginTransactionCommand(myport, message);
+			break;
+		case MSG_TXN_BEGIN_GETGXID:
+			ProcessBeginTransactionGetGXIDCommand(myport, message);
+			break;
+		case MSG_TXN_BEGIN_GETGXID_AUTOVACUUM:
+			ProcessBeginTransactionGetGXIDAutovacuumCommand(myport, message);
+			break;
+		case MSG_TXN_BEGIN_GETGXID_MULTI:
+			ProcessBeginTransactionGetGXIDCommandMulti(myport, message);
+			break;
+
+		/* --- GXID lookup --- */
+		case MSG_TXN_GET_GXID:
+			ProcessGetGXIDTransactionCommand(myport, message);
+			break;
+
+		/* --- prepare / commit --- */
+		case MSG_TXN_PREPARE:
+			ProcessPrepareTransactionCommand(myport, message);
+			break;
+		case MSG_TXN_COMMIT:
+			ProcessCommitTransactionCommand(myport, message);
+			break;
+		case MSG_TXN_COMMIT_MULTI:
+			ProcessCommitTransactionCommandMulti(myport, message);
+			break;
+
+		/* --- rollback --- */
+		case MSG_TXN_ROLLBACK:
+			ProcessRollbackTransactionCommand(myport, message);
+			break;
+		case MSG_TXN_ROLLBACK_MULTI:
+			ProcessRollbackTransactionCommandMulti(myport, message);
+			break;
+
+		default:
+			Assert(0);	/* not a transaction message; keeps the compiler quiet */
+	}
+}
+
+/*
+ * Route a snapshot-related message to its handler.
+ *
+ * MSG_SNAPSHOT_GET and MSG_SNAPSHOT_GXID_GET share one handler and differ
+ * only in the boolean passed as its last argument (true for the GXID
+ * variant).  Any other non-snapshot type trips an assertion.
+ */
+static void
+ProcessSnapshotCommand(Port *myport, GTM_MessageType mtype, StringInfo message)
+{
+	switch (mtype)
+	{
+		case MSG_SNAPSHOT_GET:
+		case MSG_SNAPSHOT_GXID_GET:
+			ProcessGetSnapshotCommand(myport, message,
+									  mtype == MSG_SNAPSHOT_GXID_GET);
+			break;
+
+		case MSG_SNAPSHOT_GET_MULTI:
+			ProcessGetSnapshotCommandMulti(myport, message);
+			break;
+
+		default:
+			Assert(0);	/* not a snapshot message; keeps the compiler quiet */
+	}
+}
+
+/*
+ * Route a sequence-related message to its handler.
+ *
+ * mtype selects one of the init / get-current / get-next / reset / close
+ * handlers; any other type trips an assertion.
+ */
+static void
+ProcessSeqeunceCommand(Port *myport, GTM_MessageType mtype, StringInfo message)
+{
+	if (mtype == MSG_SEQUENCE_INIT)
+	{
+		ProcessSequenceInitCommand(myport, message);
+	}
+	else if (mtype == MSG_SEQUENCE_GET_CURRENT)
+	{
+		ProcessSequenceGetCurrentCommand(myport, message);
+	}
+	else if (mtype == MSG_SEQUENCE_GET_NEXT)
+	{
+		ProcessSequenceGetNextCommand(myport, message);
+	}
+	else if (mtype == MSG_SEQUENCE_RESET)
+	{
+		ProcessSequenceResetCommand(myport, message);
+	}
+	else if (mtype == MSG_SEQUENCE_CLOSE)
+	{
+		ProcessSequenceCloseCommand(myport, message);
+	}
+	else
+	{
+		Assert(0);		/* not a sequence message */
+	}
+}
+
+/*
+ * Handle a status-query message.
+ *
+ * MSG_TXN_GET_STATUS and MSG_TXN_GET_ALL_PREPARED are accepted but nothing
+ * is done for them here; any other type trips an assertion.
+ */
+static void
+ProcessQueryCommand(Port *myport, GTM_MessageType mtype, StringInfo message)
+{
+	if (mtype != MSG_TXN_GET_STATUS && mtype != MSG_TXN_GET_ALL_PREPARED)
+	{
+		Assert(0);		/* not a query message */
+	}
+}
+
+/*
+ * Remember on the connection which coordinator it belongs to.
+ *
+ * cid is stored in myport->coordinator_id and logged at DEBUG3.
+ */
+static void
+GTM_RegisterCoordinator(Port *myport, GTM_CoordinatorId cid)
+{
+	elog(DEBUG3, "Registering coordinator with cid %d", cid);
+
+	myport->coordinator_id = cid;
+}
+
+
+/*
+ * Unregister a coordinator.
+ *
+ * Placeholder: a clean shutdown of the coordinator's state is meant to
+ * happen here, but nothing is done yet.
+ */
+static void
+GTM_UnregisterCoordinator(Port *myport, GTM_CoordinatorId cid)
+{
+	/* intentionally empty for now */
+}
+
+/*
+ * Validate the proposed data directory (GTMDataDir)
+ *
+ * The directory is created with mode 0700 if it does not exist.  It is a
+ * FATAL error if it cannot be created or examined, if the path is not a
+ * directory, or (except on Windows/Cygwin) if it is not owned by the
+ * effective user id of this process.
+ */
+static void
+checkDataDir(void)
+{
+	struct stat dirinfo;
+
+	Assert(GTMDataDir);
+
+	/* stat the directory, creating it first if it is missing */
+	for (;;)
+	{
+		if (stat(GTMDataDir, &dirinfo) == 0)
+			break;
+
+		if (errno != ENOENT)
+		{
+			ereport(FATAL,
+					(EPERM,
+					 errmsg("could not read permissions of directory \"%s\": %m",
+							GTMDataDir)));
+			break;
+		}
+
+		if (mkdir(GTMDataDir, 0700) != 0)
+		{
+			ereport(FATAL,
+					(errno,
+					 errmsg("failed to create the directory \"%s\"",
+							GTMDataDir)));
+		}
+		/* go round again and stat the directory just created */
+	}
+
+	/* a later chdir would fail anyway, but give a precise message now */
+	if (!S_ISDIR(dirinfo.st_mode))
+		ereport(FATAL,
+				(EINVAL,
+				 errmsg("specified data directory \"%s\" is not a directory",
+						GTMDataDir)));
+
+	/*
+	 * Reject a directory that belongs to another user.
+	 *
+	 * This ownership test is part of the interlock that keeps two servers
+	 * from starting in the same directory (see CreateLockFile()); do not
+	 * remove or weaken it.
+	 *
+	 * XXX can we safely enable this check on Windows?
+	 */
+#if !defined(WIN32) && !defined(__CYGWIN__)
+	if (geteuid() != dirinfo.st_uid)
+		ereport(FATAL,
+				(EINVAL,
+				 errmsg("data directory \"%s\" has wrong ownership",
+						GTMDataDir),
+				 errhint("The server must be started by the user that owns the data directory.")));
+#endif
+}
+
+/*
+ * Make GTMDataDir the current working directory.
+ *
+ * CreateDataDirLockFile() depends on this so that it can use a relative
+ * path for the lock file.  Failure to change directory is FATAL.
+ */
+static void
+ChangeToDataDir(void)
+{
+	int			rc = chdir(GTMDataDir);
+
+	if (rc < 0)
+		ereport(FATAL,
+				(EINVAL,
+				 errmsg("could not change directory to \"%s\": %m",
+						GTMDataDir)));
+}
+
+/*
+ * Create the lock file of the data directory.
+ *
+ * GTM_PID_FILE is passed as a relative name, so the working directory must
+ * already be the data directory when this is called; that way the lock is
+ * taken in the directory actually in use.  GTMDataDir is passed along as
+ * the name shown in error hints.
+ */
+static void
+CreateDataDirLockFile()
+{
+	CreateLockFile(GTM_PID_FILE, GTMDataDir);
+}
+
+/*
+ * Create a lockfile.
+ *
+ * filename is the name of the lockfile to create.
+ * refName is the data directory name quoted in the hint that is shown when
+ * the lockfile turns out to belong to a live process.
+ *
+ * The file is created exclusively.  If it already exists, the PID stored
+ * in it is read and probed with kill(pid, 0): a live owner is a FATAL
+ * error, otherwise the stale file is unlinked and creation is retried.
+ * On success the file contains this process's PID and GTMDataDir, one per
+ * line.  Every failure is reported as FATAL.
+ */
+static void
+CreateLockFile(const char *filename, const char *refName)
+{
+ int fd;
+ char buffer[MAXPGPATH + 100];
+ int ntries;
+ int len;
+ int encoded_pid;
+ pid_t other_pid;
+ pid_t my_pid = getpid();
+
+ /*
+ * We need a loop here because of race conditions. But don't loop forever
+ * (for example, a non-writable $PGDATA directory might cause a failure
+ * that won't go away). 100 tries seems like plenty.
+ */
+ for (ntries = 0;; ntries++)
+ {
+ /*
+ * Try to create the lock file --- O_EXCL makes this atomic.
+ *
+ * Think not to make the file protection weaker than 0600. See
+ * comments below.
+ */
+ fd = open(filename, O_RDWR | O_CREAT | O_EXCL, 0600);
+ if (fd >= 0)
+ break; /* Success; exit the retry loop */
+
+ /*
+ * Couldn't create the pid file. Probably it already exists.
+ */
+ if ((errno != EEXIST && errno != EACCES) || ntries > 100)
+ ereport(FATAL,
+ (EINVAL,
+ errmsg("could not create lock file \"%s\": %m",
+ filename)));
+
+ /*
+ * Read the file to get the old owner's PID. Note race condition
+ * here: file might have been deleted since we tried to create it.
+ */
+ fd = open(filename, O_RDONLY, 0600);
+ if (fd < 0)
+ {
+ if (errno == ENOENT)
+ continue; /* race condition; try again */
+ ereport(FATAL,
+ (EINVAL,
+ errmsg("could not open lock file \"%s\": %m",
+ filename)));
+ }
+ /* read at most sizeof(buffer) - 1 bytes so the terminator below fits */
+ if ((len = read(fd, buffer, sizeof(buffer) - 1)) < 0)
+ ereport(FATAL,
+ (EINVAL,
+ errmsg("could not read lock file \"%s\": %m",
+ filename)));
+ close(fd);
+
+ buffer[len] = '\0';
+ /* atoi() stops at the first non-digit, i.e. after the PID on line one */
+ encoded_pid = atoi(buffer);
+ other_pid = (pid_t) encoded_pid;
+
+ if (other_pid <= 0)
+ elog(FATAL, "bogus data in lock file \"%s\": \"%s\"",
+ filename, buffer);
+
+ /*
+ * Check to see if the other process still exists
+ *
+ * If the PID in the lockfile is our own PID or our parent's PID, then
+ * the file must be stale (probably left over from a previous system
+ * boot cycle). We need this test because of the likelihood that a
+ * reboot will assign exactly the same PID as we had in the previous
+ * reboot. Also, if there is just one more process launch in this
+ * reboot than in the previous one, the lockfile might mention our
+ * parent's PID. We can reject that since we'd never be launched
+ * directly by a competing postmaster. We can't detect grandparent
+ * processes unfortunately, but if the init script is written
+ * carefully then all but the immediate parent shell will be
+ * root-owned processes and so the kill test will fail with EPERM.
+ *
+ * We can treat the EPERM-error case as okay because that error
+ * implies that the existing process has a different userid than we
+ * do, which means it cannot be a competing postmaster. A postmaster
+ * cannot successfully attach to a data directory owned by a userid
+ * other than its own. (This is now checked directly in
+ * checkDataDir(), but has been true for a long time because of the
+ * restriction that the data directory isn't group- or
+ * world-accessible.) Also, since we create the lockfiles mode 600,
+ * we'd have failed above if the lockfile belonged to another userid
+ * --- which means that whatever process kill() is reporting about
+ * isn't the one that made the lockfile. (NOTE: this last
+ * consideration is the only one that keeps us from blowing away a
+ * Unix socket file belonging to an instance of Postgres being run by
+ * someone else, at least on machines where /tmp hasn't got a
+ * stickybit.)
+ *
+ * Windows hasn't got getppid(), but doesn't need it since it's not
+ * using real kill() either...
+ *
+ * Normally kill() will fail with ESRCH if the given PID doesn't
+ * exist.
+ */
+ if (other_pid != my_pid
+#ifndef WIN32
+ && other_pid != getppid()
+#endif
+ )
+ {
+ if (kill(other_pid, 0) == 0 ||
+ (errno != ESRCH && errno != EPERM))
+ {
+ /* lockfile belongs to a live process */
+ ereport(FATAL,
+ (EINVAL,
+ errmsg("lock file \"%s\" already exists",
+ filename),
+ errhint("Is another GTM (PID %d) running in data directory \"%s\"?",
+ (int) other_pid, refName)));
+ }
+ }
+
+ /*
+ * Looks like nobody's home. Unlink the file and try again to create
+ * it. Need a loop because of possible race condition against other
+ * would-be creators.
+ */
+ if (unlink(filename) < 0)
+ ereport(FATAL,
+ (EACCES,
+ errmsg("could not remove old lock file \"%s\": %m",
+ filename),
+ errhint("The file seems accidentally left over, but "
+ "it could not be removed. Please remove the file "
+ "by hand and try again.")));
+ }
+
+ /*
+ * Successfully created the file, now fill it.
+ */
+ snprintf(buffer, sizeof(buffer), "%d\n%s\n",
+ (int) my_pid, GTMDataDir);
+ /* cleared so that a short write which leaves errno untouched is detectable */
+ errno = 0;
+ if (write(fd, buffer, strlen(buffer)) != strlen(buffer))
+ {
+ int save_errno = errno;
+
+ close(fd);
+ unlink(filename);
+ /* if write didn't set errno, assume problem is no disk space */
+ errno = save_errno ? save_errno : ENOSPC;
+ ereport(FATAL,
+ (EACCES,
+ errmsg("could not write lock file \"%s\": %m", filename)));
+ }
+ if (close(fd))
+ {
+ /* keep close()'s errno for %m; unlink() may overwrite it */
+ int save_errno = errno;
+
+ unlink(filename);
+ errno = save_errno;
+ ereport(FATAL,
+ (EACCES,
+ errmsg("could not write lock file \"%s\": %m", filename)));
+ }
+}
+
+/*
+ * Record the command-line options in the opts file
+ *
+ * Writes argv[1] .. argv[argc - 1], each preceded by a space and wrapped
+ * in double quotes, followed by a newline, to "gtm.opts" in the current
+ * directory.
+ *
+ * Returns true on success; false (after logging) if the file cannot be
+ * created or closed.
+ */
+static bool
+CreateOptsFile(int argc, char *argv[])
+{
+#define OPTS_FILE	"gtm.opts"
+
+	FILE	   *out = fopen(OPTS_FILE, "w");
+	int			argno;
+
+	if (out == NULL)
+	{
+		elog(LOG, "could not create file \"%s\": %m", OPTS_FILE);
+		return false;
+	}
+
+	argno = 1;
+	while (argno < argc)
+	{
+		fprintf(out, " \"%s\"", argv[argno]);
+		argno++;
+	}
+	fputs("\n", out);
+
+	/* fclose() flushes, so this is where a write failure shows up */
+	if (fclose(out) != 0)
+	{
+		elog(LOG, "could not write file \"%s\": %m", OPTS_FILE);
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Remove the pid/lock file named by filename.
+ *
+ * Failure to unlink the file is reported as FATAL.
+ */
+static void
+DeleteLockFile(const char *filename)
+{
+	if (unlink(filename) >= 0)
+		return;
+
+	ereport(FATAL,
+			(EACCES,
+			 errmsg("could not remove old lock file \"%s\": %m",
+					filename),
+			 errhint("The file seems accidentally left over, but "
+					 "it could not be removed. Please remove the file "
+					 "by hand and try again.")));
+}
diff --git a/src/gtm/path/Makefile b/src/gtm/path/Makefile
new file mode 100644
index 0000000000..802ae3b9f9
--- /dev/null
+++ b/src/gtm/path/Makefile
@@ -0,0 +1,21 @@
+# Makefile for the gtmpath library (path.c).
+
+top_build_dir=../..
+include $(top_build_dir)/gtm/Makefile.global
+
+# Library name and version; presumably consumed by Makefile.shlib, which is
+# included below -- confirm there.
+NAME=gtmpath
+SO_MAJOR_VERSION= 1
+SO_MINOR_VERSION= 0
+
+OBJS=path.o
+
+# all-lib is not defined in this file; presumably it comes from Makefile.shlib.
+all:all-lib
+
+include $(top_build_dir)/Makefile.shlib
+
+# NOTE(review): the shared-library names below are spelled out by hand and
+# must be kept in step with NAME and the SO_*_VERSION values above.
+clean:
+ rm -f $(OBJS)
+ rm -f libgtmpath.so libgtmpath.so.1 libgtmpath.so.1.0
+
+distclean: clean
+
+maintainer-clean: distclean
+
diff --git a/src/gtm/path/path.c b/src/gtm/path/path.c
new file mode 100644
index 0000000000..ea0eb6dbf2
--- /dev/null
+++ b/src/gtm/path/path.c
@@ -0,0 +1,177 @@
+/*-------------------------------------------------------------------------
+ *
+ * path.c
+ * portable path handling routines
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "gtm/gtm_c.h"
+
+#include <ctype.h>
+#include <sys/stat.h>
+#include <string.h>
+#include <stdio.h>
+
+#include <gtm/path.h>
+
+#define IS_DIR_SEP(ch) ((ch) == '/' || (ch) == '\\')
+
+#define skip_drive(path) (path)
+
+static void trim_directory(char *path);
+static void trim_trailing_separator(char *path);
+
+/*
+ * Clean up path by:
+ * o remove trailing slash
+ * o remove duplicate adjacent separators
+ * o remove trailing '.'
+ * o process trailing '..' ourselves
+ *
+ * The string is rewritten in place; nothing is returned.
+ */
+void
+canonicalize_path(char *path)
+{
+ char *p,
+ *to_p;
+ char *spath;
+ bool was_sep = false;
+ int pending_strips;
+
+ /*
+ * Removing the trailing slash on a path means we never get ugly double
+ * trailing slashes. Also, Win32 can't stat() a directory with a trailing
+ * slash. Don't remove a leading slash, though.
+ */
+ trim_trailing_separator(path);
+
+ /*
+ * Remove duplicate adjacent separators
+ *
+ * p is the read position and to_p the write position; both walk the
+ * same buffer, with to_p never ahead of p.
+ *
+ * NOTE(review): only '/' is treated as a separator here, whereas the
+ * IS_DIR_SEP() macro used by the trim helpers also accepts '\\'.
+ */
+ p = path;
+
+ to_p = p;
+ for (; *p; p++, to_p++)
+ {
+ /* Handle many adjacent slashes, like "/a///b" */
+ while (*p == '/' && was_sep)
+ p++;
+ if (to_p != p)
+ *to_p = *p;
+ was_sep = (*p == '/');
+ }
+ *to_p = '\0';
+
+ /*
+ * Remove any trailing uses of "." and process ".." ourselves
+ *
+ * Note that "/../.." should reduce to just "/", while "../.." has to be
+ * kept as-is. In the latter case we put back mistakenly trimmed ".."
+ * components below. Also note that we want a Windows drive spec to be
+ * visible to trim_directory(), but it's not part of the logic that's
+ * looking at the name components; hence distinction between path and
+ * spath.
+ */
+ spath = skip_drive(path);
+ /* number of ".." components removed that have not yet cancelled a name */
+ pending_strips = 0;
+ for (;;)
+ {
+ int len = strlen(spath);
+
+ if (len >= 2 && strcmp(spath + len - 2, "/.") == 0)
+ trim_directory(path);
+ else if (strcmp(spath, ".") == 0)
+ {
+ /* Want to leave "." alone, but "./.." has to become ".." */
+ if (pending_strips > 0)
+ *spath = '\0';
+ break;
+ }
+ else if ((len >= 3 && strcmp(spath + len - 3, "/..") == 0) ||
+ strcmp(spath, "..") == 0)
+ {
+ trim_directory(path);
+ pending_strips++;
+ }
+ else if (pending_strips > 0 && *spath != '\0')
+ {
+ /* trim a regular directory name cancelled by ".." */
+ trim_directory(path);
+ pending_strips--;
+ /* foo/.. should become ".", not empty */
+ if (*spath == '\0')
+ strcpy(spath, ".");
+ }
+ else
+ break;
+ }
+
+ if (pending_strips > 0)
+ {
+ /*
+ * We could only get here if path is now totally empty (other than a
+ * possible drive specifier on Windows). We have to put back one or
+ * more ".."'s that we took off.
+ */
+ while (--pending_strips > 0)
+ strcat(path, "../");
+ strcat(path, "..");
+ }
+}
+
+/*
+ * trim_directory
+ *
+ * Cut the last component off path, in place: trailing separators, the
+ * final name and the separator(s) in front of it are removed.  A leading
+ * separator is always kept, and an empty path is left untouched.
+ */
+static void
+trim_directory(char *path)
+{
+	char	   *base = skip_drive(path);
+	char	   *cut;
+
+	if (base[0] == '\0')
+		return;
+
+	cut = base + strlen(base) - 1;
+
+	/* step back over trailing separator(s) */
+	while (IS_DIR_SEP(*cut) && cut > base)
+		cut--;
+
+	/* step back over the last name */
+	while (!IS_DIR_SEP(*cut) && cut > base)
+		cut--;
+
+	/* swallow any run of separators in front of that name */
+	while (cut > base && IS_DIR_SEP(*(cut - 1)))
+		cut--;
+
+	/* keep a leading separator */
+	if (cut == base && IS_DIR_SEP(*cut))
+		cut++;
+
+	*cut = '\0';
+}
+
+/*
+ * trim_trailing_separator
+ *
+ * Overwrite trailing separators of path with terminators, in place.  The
+ * first character is never removed, so a leading separator survives.
+ */
+static void
+trim_trailing_separator(char *path)
+{
+	char	   *start = skip_drive(path);
+	char	   *end = start + strlen(start);
+
+	if (end == start)
+		return;					/* empty path: nothing to trim */
+
+	end--;
+	while (end > start && IS_DIR_SEP(*end))
+	{
+		*end = '\0';
+		end--;
+	}
+}
diff --git a/src/gtm/proxy/Makefile b/src/gtm/proxy/Makefile
new file mode 100644
index 0000000000..3ed6ccce13
--- /dev/null
+++ b/src/gtm/proxy/Makefile
@@ -0,0 +1,22 @@
+# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+#
+# Makefile for the gtm_proxy executable.
+
+top_build_dir=../..
+include $(top_build_dir)/gtm/Makefile.global
+
+# Objects and static archives linked into gtm_proxy.
+# NOTE(review): because the archives of the sibling directories are part of
+# OBJS, "make clean" here removes them as well.
+OBJS=proxy_main.o proxy_thread.o ../common/libgtm.a ../libpq/libpqcomm.a ../client/libgtmclient.a ../path/libgtmpath.a
+LDFLAGS=-L$(top_build_dir)/common -L$(top_build_dir)/libpq
+
+LIBS=-lpthread
+
+.PHONY: all clean distclean maintainer-clean
+
+# $(LIBS) must follow the objects and archives: linkers resolve symbols
+# left to right, so a library named before the objects that need it may be
+# skipped (e.g. with --as-needed) and leave undefined references.
+gtm_proxy:$(OBJS)
+	$(CC) $(CFLAGS) $(LDFLAGS) $^ $(LIBS) -o gtm_proxy
+
+all:gtm_proxy
+
+clean:
+	rm -f $(OBJS)
+	rm -f gtm_proxy
+
+distclean: clean
+
+maintainer-clean: distclean
diff --git a/src/gtm/proxy/proxy_main.c b/src/gtm/proxy/proxy_main.c
new file mode 100644
index 0000000000..75c7baf063
--- /dev/null
+++ b/src/gtm/proxy/proxy_main.c
@@ -0,0 +1,2016 @@
+/*-------------------------------------------------------------------------
+ *
+ * proxy_main.c
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <time.h>
+#include <unistd.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <sys/select.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <getopt.h>
+
+#include "gtm/gtm_c.h"
+#include "gtm/gtm_proxy.h"
+#include "gtm/elog.h"
+#include "gtm/memutils.h"
+#include "gtm/gtm_list.h"
+#include "gtm/libpq.h"
+#include "gtm/libpq-be.h"
+#include "gtm/libpq-fe.h"
+#include "gtm/pqsignal.h"
+#include "gtm/pqformat.h"
+#include "gtm/assert.h"
+#include "gtm/gtm_txn.h"
+#include "gtm/gtm_seq.h"
+#include "gtm/gtm_msg.h"
+#include "gtm/libpq-int.h"
+
+extern int optind;
+extern char *optarg;
+
+#define GTM_MAX_PATH 1024
+#define GTM_PROXY_DEFAULT_HOSTNAME "*"
+#define GTM_PROXY_DEFAULT_PORT 6666
+#define GTM_PROXY_DEFAULT_WORKERS 2
+#define GTM_PID_FILE "gtm_proxy.pid"
+#define GTM_LOG_FILE "gtm_proxy.log"
+
+static char *progname = "gtm_proxy";
+char *ListenAddresses;
+int GTMProxyPortNumber;
+int GTMProxyWorkerThreads;
+char *GTMProxyDataDir;
+
+char *GTMServerHost;
+int GTMServerPortNumber;
+
+/* The socket(s) we're listening to. */
+#define MAXLISTEN 64
+static int ListenSocket[MAXLISTEN];
+
+pthread_key_t threadinfo_key;
+static bool GTMProxyAbortPending = false;
+
+static Port *ConnCreate(int serverFd);
+static void ConnFree(Port *conn);
+static int ServerLoop(void);
+static int initMasks(fd_set *rmask);
+void *GTMProxy_ThreadMain(void *argp);
+static int GTMProxyAddConnection(Port *port);
+static int ReadCommand(GTMProxy_ConnectionInfo *conninfo, StringInfo inBuf);
+static void GTMProxy_HandshakeConnection(GTMProxy_ConnectionInfo *conninfo);
+static void GTMProxy_HandleDisconnect(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn);
+
+static void GTMProxy_ProxyCommand(GTMProxy_ConnectionInfo *conninfo,
+ GTM_Conn *gtm_conn, GTM_MessageType mtype, StringInfo message);
+
+static void ProcessCommand(GTMProxy_ConnectionInfo *conninfo,
+ GTM_Conn *gtm_conn, StringInfo input_message);
+static void ProcessCoordinatorCommand(GTMProxy_ConnectionInfo *conninfo,
+ GTM_Conn *gtm_conn, GTM_MessageType mtype, StringInfo message);
+static void ProcessTransactionCommand(GTMProxy_ConnectionInfo *conninfo,
+ GTM_Conn *gtm_conn, GTM_MessageType mtype, StringInfo message);
+static void ProcessSnapshotCommand(GTMProxy_ConnectionInfo *conninfo,
+ GTM_Conn *gtm_conn, GTM_MessageType mtype, StringInfo message);
+static void ProcessSeqeunceCommand(GTMProxy_ConnectionInfo *conninfo,
+ GTM_Conn *gtm_conn, GTM_MessageType mtype, StringInfo message);
+
+static void GTMProxy_RegisterCoordinator(GTMProxy_ConnectionInfo *conninfo,
+ GTM_CoordinatorId coordinator_id);
+static void GTMProxy_UnregisterCoordinator(GTMProxy_ConnectionInfo *conninfo,
+ GTM_CoordinatorId coordinator_id);
+
+static void ProcessResponse(GTMProxy_ThreadInfo *thrinfo,
+ GTMProxy_CommandInfo *cmdinfo, GTM_Result *res);
+
+static void GTMProxy_ProcessPendingCommands(GTMProxy_ThreadInfo *thrinfo);
+static void GTMProxy_CommandPending(GTMProxy_ConnectionInfo *conninfo,
+ GTM_MessageType mtype, GTMProxy_CommandData cmd_data);
+
+static bool CreateOptsFile(int argc, char *argv[]);
+static void CreateDataDirLockFile(void);
+static void CreateLockFile(const char *filename, const char *refName);
+static void ChangeToDataDir(void);
+static void checkDataDir(void);
+static void DeleteLockFile(const char *filename);
+
+/*
+ * One-time initialization. It's called immediately after the main process
+ * starts.  Exits the process if the main thread's info block can't be allocated.
+ */
+static GTMProxy_ThreadInfo *
+MainThreadInit()
+{
+	GTMProxy_ThreadInfo *thrinfo;
+
+	pthread_key_create(&threadinfo_key, NULL);
+
+	/*
+	 * Initialize the lock protecting the global threads info
+	 */
+	GTM_RWLockInit(&GTMProxyThreads->gt_lock);
+
+	/*
+	 * We are called even before memory context management is setup. We must
+	 * use malloc
+	 */
+	thrinfo = (GTMProxy_ThreadInfo *)malloc(sizeof (GTMProxy_ThreadInfo));
+
+	if (thrinfo == NULL)
+	{
+		fprintf(stderr, "malloc failed: %d", errno);
+		fflush(stderr);
+		exit(1);	/* callers use the result unchecked; exit() also flushes stdout */
+	}
+
+	if (SetMyThreadInfo(thrinfo))
+	{
+		fprintf(stderr, "SetMyThreadInfo failed: %d", errno);
+		fflush(stdout);
+		fflush(stderr);
+	}
+
+	return thrinfo;
+}
+
+static void
+BaseInit()
+{
+	GTMProxy_ThreadInfo *thrinfo;
+	/* Brought up in this order: thread info, memory contexts, data directory, log file */
+	thrinfo = MainThreadInit();
+
+	MyThreadID = pthread_self();
+
+	MemoryContextInit();
+
+	checkDataDir();
+	ChangeToDataDir();
+	CreateDataDirLockFile();
+	/* No log file named yet: use GTM_LOG_FILE under the data dir, bounded to GTM_MAX_PATH */
+	if (GTMLogFile == NULL)
+	{
+		GTMLogFile = (char *) malloc(GTM_MAX_PATH);
+		snprintf(GTMLogFile, GTM_MAX_PATH, "%s/%s", GTMProxyDataDir, GTM_LOG_FILE);
+	}
+
+	DebugFileOpen();
+
+	/*
+	 * The memory context is now set up.
+	 * Add the thrinfo structure in the global array
+	 */
+	if (GTMProxy_ThreadAdd(thrinfo) == -1)
+	{
+		fprintf(stderr, "GTMProxy_ThreadAdd for main thread failed: %d", errno);
+		fflush(stdout);
+		fflush(stderr);
+	}
+}
+
+static void
+GTMProxy_SigleHandler(int signal)
+{
+ fprintf(stderr, "Received signal %d", signal);
+ /* Shutdown-signal handler: removes the pid file and flags ServerLoop() to exit; other signals are only reported */
+ switch (signal)
+ {
+ case SIGKILL: /* NOTE(review): POSIX does not allow SIGKILL to be caught -- confirm this case is needed */
+ case SIGTERM:
+ case SIGQUIT:
+ case SIGINT:
+ case SIGHUP:
+ break;
+
+ default:
+ fprintf(stderr, "Unknown signal %d\n", signal);
+ return;
+ }
+
+ /*
+ * XXX We should do a clean shutdown here.
+ */
+ /* Delete pid file before shutting down */
+ DeleteLockFile(GTM_PID_FILE);
+
+ PG_SETMASK(&BlockSig);
+ GTMProxyAbortPending = true; /* ServerLoop() calls exit(1) once it sees this flag */
+
+ return;
+}
+
+/*
+ * Help display should match the options accepted by the getopt() call in main()
+ */
+static void
+help(const char *progname) /* the parameter shadows the file-level progname */
+{
+ printf(_("This is the GTM proxy.\n\n"));
+ printf(_("Usage:\n %s [OPTION]...\n\n"), progname);
+ printf(_("Options:\n"));
+ printf(_(" -h hostname GTM proxy hostname/IP\n"));
+ printf(_(" -p port GTM proxy port number\n"));
+ printf(_(" -s hostname GTM server hostname/IP \n"));
+ printf(_(" -t port GTM server port number\n"));
+ printf(_(" -n count Number of worker threads\n"));
+ printf(_(" -D directory GTM proxy working directory\n"));
+ printf(_(" -l filename GTM proxy log file name \n"));
+ printf(_(" --help show this help, then exit\n"));
+}
+
+
+int
+main(int argc, char *argv[])
+{
+	int opt;
+	int status;
+	int i;
+	/* Flow: parse options, BaseInit(), open listen sockets, start workers, ServerLoop() */
+	/*
+	 * Catch standard options before doing much else
+	 */
+	if (argc > 1)
+	{
+		if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
+		{
+			help(argv[0]);
+			exit(0);
+		}
+	}
+
+	ListenAddresses = GTM_PROXY_DEFAULT_HOSTNAME;
+	GTMProxyPortNumber = GTM_PROXY_DEFAULT_PORT;
+	GTMProxyWorkerThreads = GTM_PROXY_DEFAULT_WORKERS;
+
+	/*
+	 * Parse the command line options and set variables
+	 */
+	while ((opt = getopt(argc, argv, "h:p:n:D:l:s:t:")) != -1)
+	{
+		switch (opt)
+		{
+			case 'h':
+				/* Listen address of the proxy */
+				ListenAddresses = strdup(optarg);
+				break;
+
+			case 'p':
+				/* Port number for the proxy to listen on */
+				GTMProxyPortNumber = atoi(optarg);
+				break;
+
+			case 'n':
+				/* Number of worker threads */
+				GTMProxyWorkerThreads = atoi(optarg);
+				break;
+
+			case 'D':
+				GTMProxyDataDir = strdup(optarg);
+				canonicalize_path(GTMProxyDataDir);
+				break;
+
+			case 'l':
+				/* The log file */
+				GTMLogFile = strdup(optarg);
+				break;
+
+			case 's':
+				/* GTM server host name */
+				GTMServerHost = strdup(optarg);
+				break;
+
+			case 't':
+				/* GTM server port number */
+				GTMServerPortNumber = atoi(optarg);
+				break;
+
+			default:
+				write_stderr("Try \"%s --help\" for more information.\n", progname);
+				exit(1);	/* unknown option or missing argument: do not start with a partial setup */
+		}
+	}
+
+	if (GTMProxyDataDir == NULL)
+	{
+		write_stderr("GTM Proxy data directory must be specified\n");
+		write_stderr("Try \"%s --help\" for more information.\n",
+					 progname);
+		exit(1);
+	}
+	/*
+	 * GTM accepts no non-option switch arguments.
+	 */
+	if (optind < argc)
+	{
+		write_stderr("%s: invalid argument: \"%s\"\n",
+					 progname, argv[optind]);
+		write_stderr("Try \"%s --help\" for more information.\n",
+					 progname);
+		exit(1);
+	}
+
+	/*
+	 * Some basic initialization must happen before we do anything
+	 * useful
+	 */
+	BaseInit();
+
+	elog(DEBUG3, "Starting GTM proxy at (%s:%d)", ListenAddresses, GTMProxyPortNumber);
+
+	/*
+	 * Establish input sockets.
+	 */
+	for (i = 0; i < MAXLISTEN; i++)
+		ListenSocket[i] = -1;
+
+	if (ListenAddresses)
+	{
+		int success = 0;
+
+		status = StreamServerPort(AF_UNSPEC, ListenAddresses,
+								  (unsigned short) GTMProxyPortNumber,
+								  ListenSocket, MAXLISTEN);
+		if (status == STATUS_OK)
+			success++;
+		else
+			ereport(FATAL,
+					(errmsg("could not create listen socket for \"%s\"",
+							ListenAddresses)));
+	}
+
+	/*
+	 * check that we have some socket to listen on
+	 */
+	if (ListenSocket[0] == -1)
+		ereport(FATAL,
+				(errmsg("no socket created for listening")));
+
+	/*
+	 * Record gtm proxy options. We delay this till now to avoid recording
+	 * bogus options
+	 */
+	if (!CreateOptsFile(argc, argv))
+		exit(1);
+
+	pqsignal(SIGHUP, GTMProxy_SigleHandler);
+	pqsignal(SIGKILL, GTMProxy_SigleHandler);
+	pqsignal(SIGQUIT, GTMProxy_SigleHandler);
+	pqsignal(SIGTERM, GTMProxy_SigleHandler);
+	pqsignal(SIGINT, GTMProxy_SigleHandler);
+
+	pqinitmask();
+
+	/*
+	 * Pre-fork so many worker threads
+	 */
+
+	for (i = 0; i < GTMProxyWorkerThreads; i++)
+	{
+		/*
+		 * XXX Start the worker thread
+		 */
+		if (GTMProxy_ThreadCreate(GTMProxy_ThreadMain) == NULL)
+		{
+			elog(ERROR, "failed to create a new thread");
+			return STATUS_ERROR;
+		}
+	}
+
+	/*
+	 * Accept any new connections. Add for each incoming connection to one of
+	 * the pre-forked threads.
+	 */
+	status = ServerLoop();
+
+	/*
+	 * ServerLoop probably shouldn't ever return, but if it does, close down.
+	 */
+	exit(status != STATUS_OK);
+
+	return 0; /* not reached */
+}
+
+/*
+ * ConnCreate -- create a local connection data structure for serverFd; returns NULL if StreamConnection() fails
+ */
+static Port *
+ConnCreate(int serverFd)
+{
+	Port *port;
+
+	if (!(port = (Port *) calloc(1, sizeof(Port))))
+	{
+		ereport(LOG,
+				(ENOMEM,
+				 errmsg("out of memory")));
+		exit(1);
+	}
+
+	if (StreamConnection(serverFd, port) != STATUS_OK)
+	{
+		if (port->sock >= 0)
+			StreamClose(port->sock);
+		ConnFree(port);
+		return NULL;	/* port is gone: must not reach the conn_id store below */
+	}
+
+	port->conn_id = InvalidGTMProxyConnID;
+
+	return port;
+}
+
+/*
+ * ConnFree -- free() a Port that ConnCreate() calloc'd; the socket is not closed here
+ */
+static void
+ConnFree(Port *conn)
+{
+ free(conn);
+}
+
+/*
+ * Main idle loop of the GTM proxy: accepts connections and hands each one to GTMProxyAddConnection()
+ */
+static int
+ServerLoop(void)
+{
+	fd_set readmask;
+	int nSockets;
+
+	nSockets = initMasks(&readmask);
+
+	for (;;)
+	{
+		fd_set rmask;
+		int selres;
+
+		/*
+		 * Wait for a connection request to arrive.
+		 *
+		 * We wait at most one minute, to ensure that the other background
+		 * tasks handled below get done even when no requests are arriving.
+		 */
+		memcpy((char *) &rmask, (char *) &readmask, sizeof(fd_set));
+
+		PG_SETMASK(&UnBlockSig);
+
+		if (GTMProxyAbortPending)
+		{
+			/*
+			 * Tell everybody that we are shutting down
+			 *
+			 * !! TODO
+			 */
+			exit(1);
+		}
+
+		{
+			/* must set timeout each time; some OSes change it! */
+			struct timeval timeout;
+
+			timeout.tv_sec = 60;
+			timeout.tv_usec = 0;
+
+			selres = select(nSockets, &rmask, NULL, NULL, &timeout);
+		}
+
+		/*
+		 * Block all signals until we wait again. (This makes it safe for our
+		 * signal handlers to do nontrivial work.)
+		 */
+		PG_SETMASK(&BlockSig);
+
+		/* Now check the select() result */
+		if (selres < 0)
+		{
+			if (errno != EINTR && errno != EWOULDBLOCK)
+			{
+				ereport(LOG,
+						(EACCES,
+						 errmsg("select() failed in postmaster: %m")));
+				return STATUS_ERROR;
+			}
+		}
+
+		/*
+		 * New connection pending on any of our sockets? If so, accept the
+		 * connection and add it to one of the worker threads.
+		 */
+		if (selres > 0)
+		{
+			int i;
+
+			for (i = 0; i < MAXLISTEN; i++)
+			{
+				if (ListenSocket[i] == -1)
+					break;
+				if (FD_ISSET(ListenSocket[i], &rmask))
+				{
+					Port *port;
+
+					port = ConnCreate(ListenSocket[i]);
+					if (port)
+					{
+						if (GTMProxyAddConnection(port) != STATUS_OK)
+						{
+							StreamClose(port->sock);	/* release before elog(ERROR), which may not return */
+							ConnFree(port);
+							elog(ERROR, "Too many connections");
+						}
+					}
+				}
+			}
+		}
+	}
+}
+
+/*
+ * Build the select() read mask from the ListenSocket[] entries in use.
+ * Returns one more than the highest descriptor added (0 if there are none).
+ */
+static int
+initMasks(fd_set *rmask)
+{
+	int highest = -1;
+	int slot;
+
+	FD_ZERO(rmask);
+
+	/* stop at the first unused (-1) entry */
+	for (slot = 0; slot < MAXLISTEN && ListenSocket[slot] != -1; slot++)
+	{
+		int sock = ListenSocket[slot];
+
+		FD_SET(sock, rmask);
+		if (highest < sock)
+			highest = sock;
+	}
+
+	/* the caller passes this value to select() as its nfds argument */
+	return highest + 1;
+}
+
+/*
+ * Worker thread main routine: polls its client connections, sends their commands to GTM and returns the replies
+ */
+void *
+GTMProxy_ThreadMain(void *argp)
+{
+	GTMProxy_ThreadInfo *thrinfo = (GTMProxy_ThreadInfo *)argp;
+	int qtype;
+	StringInfoData input_message;
+	sigjmp_buf local_sigjmp_buf;
+	int32 saved_seqno = -1;
+	int ii, nrfds;
+	char gtm_connect_string[1024];
+
+	elog(DEBUG3, "Starting the connection helper thread");
+
+
+	/*
+	 * Create the memory context we will use in the main loop.
+	 *
+	 * MessageContext is reset once per iteration of the main loop, ie, upon
+	 * completion of processing of each command message from the client.
+	 *
+	 * This context is thread-specific
+	 */
+	MessageContext = AllocSetContextCreate(TopMemoryContext,
+										   "MessageContext",
+										   ALLOCSET_DEFAULT_MINSIZE,
+										   ALLOCSET_DEFAULT_INITSIZE,
+										   ALLOCSET_DEFAULT_MAXSIZE,
+										   false);
+
+	/*
+	 * Set up connection with the GTM server (the string is bounded to the buffer)
+	 */
+	snprintf(gtm_connect_string, sizeof (gtm_connect_string), "host=%s port=%d coordinator_id=1 proxy=1",
+			 GTMServerHost, GTMServerPortNumber);
+
+	thrinfo->thr_gtm_conn = PQconnectGTM(gtm_connect_string);
+
+	if (thrinfo->thr_gtm_conn == NULL)
+		elog(FATAL, "GTM connection failed");
+
+	/*
+	 * Get the input_message in the TopMemoryContext so that we don't need to
+	 * free/palloc it for every incoming message. Unlike Postgres, we don't
+	 * expect the incoming messages to be of arbitrary sizes
+	 */
+
+	initStringInfo(&input_message);
+
+	/*
+	 * If an exception is encountered, processing resumes here so we abort the
+	 * current transaction and start a new one.
+	 *
+	 * You might wonder why this isn't coded as an infinite loop around a
+	 * PG_TRY construct. The reason is that this is the bottom of the
+	 * exception stack, and so with PG_TRY there would be no exception handler
+	 * in force at all during the CATCH part. By leaving the outermost setjmp
+	 * always active, we have at least some chance of recovering from an error
+	 * during error recovery. (If we get into an infinite loop thereby, it
+	 * will soon be stopped by overflow of elog.c's internal state stack.)
+	 */
+
+	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
+	{
+		/*
+		 * NOTE: if you are tempted to add more code in this if-block,
+		 * consider the high probability that it should be in
+		 * AbortTransaction() instead. The only stuff done directly here
+		 * should be stuff that is guaranteed to apply *only* for outer-level
+		 * error recovery, such as adjusting the FE/BE protocol status.
+		 */
+
+		/* Report the error to the client and/or server log */
+		if (thrinfo->thr_conn_count > 0)
+		{
+			for (ii = 0; ii < thrinfo->thr_conn_count; ii++)
+			{
+				GTMProxy_ConnectionInfo *conninfo = thrinfo->thr_all_conns[ii];
+				/*
+				 * Now clean up disconnected connections
+				 */
+				if (conninfo->con_disconnected)
+				{
+					GTMProxy_ThreadRemoveConnection(thrinfo, conninfo);
+					pfree(conninfo);
+					ii--;
+				}
+				else
+				{
+					/*
+					 * Consume all the pending data on this connection and send
+					 * error report
+					 */
+					if (conninfo->con_pending_msg != MSG_TYPE_INVALID)
+					{
+						conninfo->con_port->PqRecvPointer = conninfo->con_port->PqRecvLength = 0;
+						conninfo->con_pending_msg = MSG_TYPE_INVALID;
+						EmitErrorReport(conninfo->con_port);
+					}
+				}
+			}
+		}
+		else
+			EmitErrorReport(NULL);
+
+		/*
+		 * Now return to normal top-level context and clear ErrorContext for
+		 * next time.
+		 */
+		MemoryContextSwitchTo(TopMemoryContext);
+		FlushErrorState();
+	}
+
+	/* We can now handle ereport(ERROR) */
+	PG_exception_stack = &local_sigjmp_buf;
+
+	for (;;)
+	{
+		ListCell *elem = NULL;
+		GTM_Result *res = NULL;
+
+		/*
+		 * Release storage left over from prior query cycle, and create a new
+		 * query input buffer in the cleared MessageContext.
+		 */
+		MemoryContextSwitchTo(MessageContext);
+		MemoryContextResetAndDeleteChildren(MessageContext);
+
+		/*
+		 * Just reset the input buffer to avoid repeated palloc/pfrees
+		 *
+		 * XXX We should consider resetting the MessageContext periodically to
+		 * handle any memory leaks
+		 */
+		resetStringInfo(&input_message);
+
+		/*
+		 * Check if there are any changes to the connection array assigned to
+		 * this thread. If so, we need to rebuild the fd array.
+		 */
+		GTM_MutexLockAcquire(&thrinfo->thr_lock);
+		if (saved_seqno != thrinfo->thr_seqno)
+		{
+			saved_seqno = thrinfo->thr_seqno;
+
+			while (thrinfo->thr_conn_count <= 0)
+			{
+				/*
+				 * No connections assigned to the thread. Wait for at least one
+				 * connection to be assigned to us
+				 */
+				GTM_CVWait(&thrinfo->thr_cv, &thrinfo->thr_lock);
+			}
+
+			memset(thrinfo->thr_poll_fds, 0, sizeof (thrinfo->thr_poll_fds));
+
+			/*
+			 * Now grab all the open connections. We are holding the lock so no
+			 * new connections can be added.
+			 */
+			for (ii = 0; ii < thrinfo->thr_conn_count; ii++)
+			{
+				GTMProxy_ConnectionInfo *conninfo = thrinfo->thr_all_conns[ii];
+
+				/* Dropped connection: flag it and set its slot to fd -1, which poll() ignores */
+				if (conninfo->con_port == NULL)
+				{
+					conninfo->con_disconnected = true;
+					thrinfo->thr_poll_fds[ii].fd = -1;	/* the memset left 0, i.e. stdin */
+					continue;
+				}
+
+				/*
+				 * If this is a newly added connection, complete the handshake
+				 */
+				if (!conninfo->con_authenticated)
+					GTMProxy_HandshakeConnection(conninfo);
+
+				thrinfo->thr_poll_fds[ii].fd = conninfo->con_port->sock;
+				thrinfo->thr_poll_fds[ii].events = POLLIN;
+				thrinfo->thr_poll_fds[ii].revents = 0;
+			}
+		}
+		GTM_MutexLockRelease(&thrinfo->thr_lock);
+
+		while (true)
+		{
+			nrfds = poll(thrinfo->thr_poll_fds, thrinfo->thr_conn_count, 1000);
+
+			if (nrfds < 0)
+			{
+				if (errno == EINTR)
+					continue;
+				elog(FATAL, "poll returned with error %d", errno);
+			}
+			else
+				break;
+		}
+
+		if (nrfds == 0)
+			continue;
+
+		/*
+		 * Initialize the lists
+		 */
+		thrinfo->thr_processed_commands = NIL;
+		memset(thrinfo->thr_pending_commands, 0, sizeof (thrinfo->thr_pending_commands));
+
+		/*
+		 * Now, read command from each of the connections that has some data to
+		 * be read.
+		 */
+		for (ii = 0; ii < thrinfo->thr_conn_count; ii++)
+		{
+			GTMProxy_ConnectionInfo *conninfo = thrinfo->thr_all_conns[ii];
+			thrinfo->thr_conn = conninfo;
+
+			if (thrinfo->thr_poll_fds[ii].revents & POLLHUP)
+			{
+				/*
+				 * The fd has become invalid. The connection is broken. Add it
+				 * to the remove_list and cleanup at the end of this round of
+				 * cleanup.
+				 */
+				GTMProxy_HandleDisconnect(thrinfo->thr_conn, thrinfo->thr_gtm_conn);
+				continue;
+			}
+
+			if (thrinfo->thr_poll_fds[ii].revents & POLLIN)
+			{
+				/*
+				 * (3) read a command (loop blocks here)
+				 */
+				qtype = ReadCommand(thrinfo->thr_conn, &input_message);
+
+				switch(qtype)
+				{
+					case 'C':
+						ProcessCommand(thrinfo->thr_conn, thrinfo->thr_gtm_conn,
+									   &input_message);
+						break;
+
+					case 'X':
+					case EOF:
+						/*
+						 * Connection termination request
+						 *
+						 * Close the socket and remember the connection
+						 * as disconnected. All such connections will be
+						 * removed after the command processing is over. We
+						 * can't remove it just yet because we pass the slot id
+						 * to the server to quickly find the backend connection
+						 * while processing proxied messages.
+						 */
+						GTMProxy_HandleDisconnect(thrinfo->thr_conn, thrinfo->thr_gtm_conn);
+						break;
+					default:
+						/*
+						 * Also disconnect if protocol error
+						 */
+						GTMProxy_HandleDisconnect(thrinfo->thr_conn, thrinfo->thr_gtm_conn);
+						elog(ERROR, "Unexpected message, or client disconnected abruptly.");
+						break;
+				}
+
+			}
+		}
+
+		/*
+		 * Ok. All the commands are processed. Commands which can be proxied
+		 * directly have been already sent to the GTM server. Now, group the
+		 * remaining commands, send them to the server and flush the data.
+		 */
+		GTMProxy_ProcessPendingCommands(thrinfo);
+
+		/*
+		 * Add a special marker to tell the GTM server that we are done with
+		 * one round of messages and the GTM server should flush all the
+		 * pending responses after seeing this message.
+		 */
+		if (gtmpqPutMsgStart('F', true, thrinfo->thr_gtm_conn) ||
+			gtmpqPutInt(MSG_DATA_FLUSH, sizeof (GTM_MessageType), thrinfo->thr_gtm_conn) ||
+			gtmpqPutMsgEnd(thrinfo->thr_gtm_conn))
+			elog(ERROR, "Error sending flush message");
+
+		/*
+		 * Make sure everything is on wire now
+		 */
+		gtmpqFlush(thrinfo->thr_gtm_conn);
+
+		/*
+		 * Read back the responses and put them on to the right backend
+		 * connection.
+		 */
+		foreach(elem, thrinfo->thr_processed_commands)
+		{
+			GTMProxy_CommandInfo *cmdinfo = (GTMProxy_CommandInfo *)lfirst(elem);
+
+			/*
+			 * If this is a continuation of a multi-part command response, we
+			 * don't need to read another result from the stream. The previous
+			 * result contains our response and we should just read from it.
+			 */
+			if (cmdinfo->ci_res_index == 0)
+			{
+				if ((res = GTMPQgetResult(thrinfo->thr_gtm_conn)) == NULL)
+					elog(ERROR, "GTMPQgetResult failed");
+			}
+
+			ProcessResponse(thrinfo, cmdinfo, res);
+		}
+
+		list_free_deep(thrinfo->thr_processed_commands);
+		thrinfo->thr_processed_commands = NIL;
+
+		/*
+		 * Now clean up disconnected connections
+		 */
+		for (ii = 0; ii < thrinfo->thr_conn_count; ii++)
+		{
+			GTMProxy_ConnectionInfo *conninfo = thrinfo->thr_all_conns[ii];
+			if (conninfo->con_disconnected)
+			{
+				GTMProxy_ThreadRemoveConnection(thrinfo, conninfo);
+				pfree(conninfo);
+				ii--;
+			}
+		}
+	}
+
+	/* can't get here because the above loop never exits */
+	Assert(false);
+
+	return thrinfo;
+}
+
+/*
+ * Wrap an accepted Port in a new connection record and pass it to GTMProxy_ThreadAddConnection()
+ */
+static int
+GTMProxyAddConnection(Port *port)
+{
+	GTMProxy_ConnectionInfo *entry;
+
+	/* only con_port is filled in here; palloc0 is expected to zero the rest */
+	entry = (GTMProxy_ConnectionInfo *) palloc0(sizeof (*entry));
+	if (!entry)
+	{
+		ereport(ERROR,
+				(ENOMEM,
+				 errmsg("Out of memory")));
+		return STATUS_ERROR;
+	}
+
+	elog(DEBUG3, "Started new connection");
+	entry->con_port = port;
+
+	/*
+	 * The callee decides which worker thread receives the record (round-robin
+	 * according to the earlier note here; not visible in this function).
+	 */
+	GTMProxy_ThreadAddConnection(entry);
+	return STATUS_OK;
+}
+
+static void
+ProcessCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn,
+			   StringInfo input_message)
+{
+	GTM_MessageType mtype;
+	/* The message type is read first and selects the handler; it is then recorded as pending */
+	mtype = pq_getmsgint(input_message, sizeof (GTM_MessageType));
+
+	switch (mtype)
+	{
+		case MSG_UNREGISTER_COORD:
+			ProcessCoordinatorCommand(conninfo, gtm_conn, mtype, input_message);
+			break;
+
+		case MSG_TXN_BEGIN:
+		case MSG_TXN_BEGIN_GETGXID:
+		case MSG_TXN_BEGIN_GETGXID_AUTOVACUUM:
+		case MSG_TXN_PREPARE:
+		case MSG_TXN_COMMIT:
+		case MSG_TXN_ROLLBACK:
+		case MSG_TXN_GET_GXID:
+			ProcessTransactionCommand(conninfo, gtm_conn, mtype, input_message);
+			break;
+
+		case MSG_SNAPSHOT_GET:
+		case MSG_SNAPSHOT_GXID_GET:
+			ProcessSnapshotCommand(conninfo, gtm_conn, mtype, input_message);
+			break;
+
+		case MSG_SEQUENCE_INIT:
+		case MSG_SEQUENCE_GET_CURRENT:
+		case MSG_SEQUENCE_GET_NEXT:
+		case MSG_SEQUENCE_RESET:
+		case MSG_SEQUENCE_CLOSE:
+			ProcessSeqeunceCommand(conninfo, gtm_conn, mtype, input_message);
+			break;
+
+		default:
+			ereport(FATAL,
+					(EPROTO,
+					 errmsg("invalid frontend message type %d",
+							mtype)));
+	}
+
+	conninfo->con_pending_msg = mtype;
+}
+
+static void
+ProcessResponse(GTMProxy_ThreadInfo *thrinfo, GTMProxy_CommandInfo *cmdinfo,
+				GTM_Result *res)
+{
+	StringInfoData buf;
+	GlobalTransactionId gxid;
+	/* Turns one GTM result into the reply sent on the connection that issued cmdinfo */
+	switch (cmdinfo->ci_mtype)
+	{
+		case MSG_TXN_BEGIN_GETGXID:
+			/*
+			 * This is a grouped command. We send just the transaction count to
+			 * the GTM server which responds back with the start GXID. We
+			 * derive our GXID from the start GXID and the our position in the
+			 * command queue
+			 */
+			if (res->gr_status == 0)
+			{
+				if (res->gr_type != TXN_BEGIN_GETGXID_MULTI_RESULT)
+					elog(ERROR, "Wrong result");
+				if (cmdinfo->ci_res_index >= res->gr_resdata.grd_txn_get_multi.txn_count)
+					elog(ERROR, "Too few GXIDs");
+
+				gxid = res->gr_resdata.grd_txn_get_multi.start_gxid + cmdinfo->ci_res_index;
+
+				/* Handle wraparound */
+				if (gxid < res->gr_resdata.grd_txn_get_multi.start_gxid)
+					gxid += FirstNormalGlobalTransactionId;
+
+				pq_beginmessage(&buf, 'S');
+				pq_sendint(&buf, TXN_BEGIN_GETGXID_RESULT, 4);
+				pq_sendbytes(&buf, (char *)&gxid, sizeof (GlobalTransactionId));
+				pq_endmessage(cmdinfo->ci_conn->con_port, &buf);
+				pq_flush(cmdinfo->ci_conn->con_port);
+			}
+			else
+			{
+				pq_beginmessage(&buf, 'E');
+				pq_sendbytes(&buf, res->gr_proxy_data, res->gr_msglen);
+				pq_endmessage(cmdinfo->ci_conn->con_port, &buf);
+				pq_flush(cmdinfo->ci_conn->con_port);
+			}
+			cmdinfo->ci_conn->con_pending_msg = MSG_TYPE_INVALID;
+			break;
+
+		case MSG_TXN_COMMIT:
+			if (res->gr_type != TXN_COMMIT_MULTI_RESULT)
+				elog(ERROR, "Wrong result");
+			/*
+			 * These are grouped messages. We send an array of GXIDs to commit
+			 * or rollback and the server sends us back an array of status
+			 * codes.
+			 */
+			if (cmdinfo->ci_res_index >= res->gr_resdata.grd_txn_rc_multi.txn_count)
+				elog(ERROR, "Too few GXIDs");
+
+			if (res->gr_resdata.grd_txn_rc_multi.status[cmdinfo->ci_res_index] == STATUS_OK)
+			{
+				pq_beginmessage(&buf, 'S');
+				pq_sendint(&buf, TXN_COMMIT_RESULT, 4);
+				pq_sendbytes(&buf, (char *)&cmdinfo->ci_data.cd_rc.gxid, sizeof (GlobalTransactionId));
+				pq_endmessage(cmdinfo->ci_conn->con_port, &buf);
+				pq_flush(cmdinfo->ci_conn->con_port);
+			}
+			else
+				ereport(ERROR2, (EINVAL, errmsg("Transaction commit failed")));
+			cmdinfo->ci_conn->con_pending_msg = MSG_TYPE_INVALID;
+			break;
+
+		case MSG_TXN_ROLLBACK:
+			if (res->gr_type != TXN_ROLLBACK_MULTI_RESULT)
+				elog(ERROR, "Wrong result");
+			/*
+			 * These are grouped messages. We send an array of GXIDs to commit
+			 * or rollback and the server sends us back an array of status
+			 * codes.
+			 */
+			if (cmdinfo->ci_res_index >= res->gr_resdata.grd_txn_rc_multi.txn_count)
+				elog(ERROR, "Too few GXIDs");
+
+			if (res->gr_resdata.grd_txn_rc_multi.status[cmdinfo->ci_res_index] == STATUS_OK)
+			{
+				pq_beginmessage(&buf, 'S');
+				pq_sendint(&buf, TXN_ROLLBACK_RESULT, 4);
+				pq_sendbytes(&buf, (char *)&cmdinfo->ci_data.cd_rc.gxid, sizeof (GlobalTransactionId));
+				pq_endmessage(cmdinfo->ci_conn->con_port, &buf);
+				pq_flush(cmdinfo->ci_conn->con_port);
+			}
+			else
+				ereport(ERROR2, (EINVAL, errmsg("Transaction rollback failed")));
+			cmdinfo->ci_conn->con_pending_msg = MSG_TYPE_INVALID;
+			break;
+
+		case MSG_SNAPSHOT_GET:
+			if ((res->gr_type != SNAPSHOT_GET_RESULT) &&
+				(res->gr_type != SNAPSHOT_GET_MULTI_RESULT))
+				elog(ERROR, "Wrong result");
+
+			if (cmdinfo->ci_res_index >= res->gr_resdata.grd_txn_snap_multi.txn_count)
+				elog(ERROR, "Too few GXIDs");
+
+			if (res->gr_resdata.grd_txn_snap_multi.status[cmdinfo->ci_res_index] == STATUS_OK)
+			{
+				int txn_count = 1;
+				int status = STATUS_OK;
+
+				pq_beginmessage(&buf, 'S');
+				pq_sendint(&buf, SNAPSHOT_GET_RESULT, 4);
+				pq_sendbytes(&buf, (char *)&cmdinfo->ci_data.cd_snap.gxid, sizeof (GlobalTransactionId));
+				pq_sendbytes(&buf, (char *)&txn_count, sizeof (txn_count));
+				pq_sendbytes(&buf, (char *)&status, sizeof (status));
+				pq_sendbytes(&buf, (char *)&res->gr_snapshot.sn_xmin, sizeof (GlobalTransactionId));
+				pq_sendbytes(&buf, (char *)&res->gr_snapshot.sn_xmax, sizeof (GlobalTransactionId));
+				pq_sendbytes(&buf, (char *)&res->gr_snapshot.sn_recent_global_xmin, sizeof (GlobalTransactionId));
+				pq_sendint(&buf, res->gr_snapshot.sn_xcnt, sizeof (int));
+				pq_sendbytes(&buf, (char *)res->gr_snapshot.sn_xip,
+							 sizeof(GlobalTransactionId) * res->gr_snapshot.sn_xcnt);
+				pq_endmessage(cmdinfo->ci_conn->con_port, &buf);
+				pq_flush(cmdinfo->ci_conn->con_port);
+			}
+			else
+				ereport(ERROR2, (EINVAL, errmsg("snapshot request failed")));
+			cmdinfo->ci_conn->con_pending_msg = MSG_TYPE_INVALID;
+			break;
+
+		case MSG_TXN_BEGIN:
+		case MSG_TXN_BEGIN_GETGXID_AUTOVACUUM:
+		case MSG_TXN_PREPARE:
+		case MSG_TXN_GET_GXID:
+		case MSG_SNAPSHOT_GXID_GET:
+		case MSG_SEQUENCE_INIT:
+		case MSG_SEQUENCE_GET_CURRENT:
+		case MSG_SEQUENCE_GET_NEXT:
+		case MSG_SEQUENCE_RESET:
+		case MSG_SEQUENCE_CLOSE:
+			if ((res->gr_proxyhdr.ph_conid == InvalidGTMProxyConnID) ||
+				(res->gr_proxyhdr.ph_conid >= GTM_PROXY_MAX_CONNECTIONS) ||
+				(thrinfo->thr_all_conns[res->gr_proxyhdr.ph_conid] != cmdinfo->ci_conn))
+				elog(PANIC, "Invalid response or synchronization loss");
+
+			/*
+			 * These are just proxied messages.. so just forward the response
+			 * back after stripping the conid part.
+			 *
+			 * !!TODO As we start adding support for message grouping for
+			 * messages, those message types would be removed from the above
+			 * and handled separately.
+			 */
+			switch (res->gr_status)
+			{
+				case 0:
+					pq_beginmessage(&buf, 'S');
+					pq_sendint(&buf, res->gr_type, 4);
+					pq_sendbytes(&buf, res->gr_proxy_data, res->gr_msglen);
+					pq_endmessage(cmdinfo->ci_conn->con_port, &buf);
+					pq_flush(cmdinfo->ci_conn->con_port);
+					break;
+
+				default:
+					pq_beginmessage(&buf, 'E');
+					pq_sendbytes(&buf, res->gr_proxy_data, res->gr_msglen);
+					pq_endmessage(cmdinfo->ci_conn->con_port, &buf);
+					pq_flush(cmdinfo->ci_conn->con_port);
+					break;
+			}
+			cmdinfo->ci_conn->con_pending_msg = MSG_TYPE_INVALID;
+			break;
+
+		default:
+			ereport(FATAL,
+					(EPROTO,
+					 errmsg("invalid frontend message type %d",
+							cmdinfo->ci_mtype)));
+	}
+
+
+}
+
+/* ----------------
+ *		ReadCommand fetches one message from the connection's port:
+ *		the type byte is returned and the body is read into inBuf.
+ *		Only 'C' and 'X' are accepted; any other type raises ERROR.
+ *		EOF is returned if pq_getbyte() gives EOF or pq_getmessage() fails.
+ * ----------------
+ */
+static int
+ReadCommand(GTMProxy_ConnectionInfo *conninfo, StringInfo inBuf)
+{
+	int msgtype;
+
+	/*
+	 * Pull the message type code, the first byte of the message, from the
+	 * frontend.
+	 */
+	msgtype = pq_getbyte(conninfo->con_port);
+
+	/*
+	 * EOF here is taken to mean that the frontend disconnected.
+	 */
+	if (msgtype == EOF)
+	{
+		ereport(COMMERROR,
+				(EPROTO,
+				 errmsg("unexpected EOF on client connection")));
+		return EOF;
+	}
+
+	/*
+	 * Check the type code before touching the body.  If message sync has
+	 * been lost, whatever follows would be read as a length word, and
+	 * complaining about an unknown command beats attempting to allocate a
+	 * garbage number of bytes.
+	 *
+	 * 'C' (command) and 'X' (terminate) are the only codes let through here;
+	 * anything else is reported with ereport(ERROR).
+	 */
+	if (msgtype != 'C' && msgtype != 'X')
+	{
+		/*
+		 * Garbage from the frontend: message boundary sync is probably gone
+		 * and there is no good way to recover.
+		 */
+		ereport(ERROR,
+				(EPROTO,
+				 errmsg("invalid frontend message type %d", msgtype)));
+	}
+
+	/*
+	 * The length word comes right after the type code in every message, so
+	 * the body is read the same way whatever the type is.
+	 */
+	if (pq_getmessage(conninfo->con_port, inBuf, 0) != 0)
+	{
+		/* suitable message already logged */
+		return EOF;
+	}
+
+	/*
+	 * Both reads went through; the caller dispatches on the code.
+	 */
+
+	return msgtype;
+}
+
+static void
+ProcessCoordinatorCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn,
+						  GTM_MessageType mtype, StringInfo message)
+{
+	/* the coordinator id is read from the message before the type is looked at */
+	GTM_CoordinatorId coord = pq_getmsgint(message, sizeof (GTM_CoordinatorId));
+
+	/*
+	 * Unregister is the only coordinator message handled here; gtm_conn is
+	 * not used by it.
+	 */
+	if (mtype == MSG_UNREGISTER_COORD)
+		GTMProxy_UnregisterCoordinator(conninfo, coord);
+	else
+		Assert(0); /* Shouldn't come here.. keep compiler quiet */
+
+	/* presumably checks that nothing is left unread, as in PostgreSQL -- TODO confirm */
+	pq_getmsgend(message);
+}
+
+static void
+ProcessTransactionCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn,
+						  GTM_MessageType mtype, StringInfo message)
+{
+	GTMProxy_CommandData pending;
+	const char *raw;
+
+	switch (mtype)
+	{
+		case MSG_TXN_BEGIN:
+		case MSG_TXN_GET_GXID:
+			elog(FATAL, "Support not yet added for these message types");
+			break;
+
+		case MSG_TXN_BEGIN_GETGXID_AUTOVACUUM:
+		case MSG_TXN_PREPARE:
+			GTMProxy_ProxyCommand(conninfo, gtm_conn, mtype, message);
+			break;
+
+		case MSG_TXN_BEGIN_GETGXID:
+			/* isolation level, then the read-only flag; queued via GTMProxy_CommandPending() */
+			pending.cd_beg.iso_level = pq_getmsgint(message, sizeof (GTM_IsolationLevel));
+			pending.cd_beg.rdonly = pq_getmsgbyte(message);
+			GTMProxy_CommandPending(conninfo, mtype, pending);
+			break;
+
+		case MSG_TXN_COMMIT:
+		case MSG_TXN_ROLLBACK:
+			/* a flag byte says whether a GXID or a transaction handle follows */
+			pending.cd_rc.isgxid = pq_getmsgbyte(message);
+			if (!pending.cd_rc.isgxid)
+			{
+				raw = pq_getmsgbytes(message, sizeof (GTM_TransactionHandle));
+				if (raw == NULL)
+					ereport(ERROR,
+							(EPROTO,
+							 errmsg("Message does not contain valid Transaction Handle")));
+				memcpy(&pending.cd_rc.handle, raw, sizeof (GTM_TransactionHandle));
+			}
+			else
+			{
+				raw = pq_getmsgbytes(message, sizeof (GlobalTransactionId));
+				if (raw == NULL)
+					ereport(ERROR,
+							(EPROTO,
+							 errmsg("Message does not contain valid GXID")));
+				memcpy(&pending.cd_rc.gxid, raw, sizeof (GlobalTransactionId));
+			}
+			pq_getmsgend(message);
+			GTMProxy_CommandPending(conninfo, mtype, pending);
+			break;
+		default:
+			Assert(0); /* Shouldn't come here.. keep compiler quite */
+	}
+}
+
+/*
+ * Handle a snapshot-related command received from a client connection.
+ *
+ * For MSG_SNAPSHOT_GET the first byte of the message says whether the
+ * request may be grouped. Ungroupable requests are forwarded with
+ * GTMProxy_ProxyCommand(); groupable ones are decoded (GXID or transaction
+ * handle) and queued with GTMProxy_CommandPending().
+ */
+static void
+ProcessSnapshotCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn,
+		GTM_MessageType mtype, StringInfo message)
+{
+	GTMProxy_CommandData pending;
+	const char *raw;
+
+	if (mtype == MSG_SNAPSHOT_GXID_GET)
+	{
+		elog(ERROR, "Message not yet support");
+		return;
+	}
+
+	if (mtype != MSG_SNAPSHOT_GET)
+	{
+		Assert(0);		/* Shouldn't come here.. keep compiler quiet */
+		return;
+	}
+
+	/* Requests that cannot be grouped are passed through as they are */
+	if (!pq_getmsgbyte(message))
+	{
+		GTMProxy_ProxyCommand(conninfo, gtm_conn, mtype, message);
+		return;
+	}
+
+	pending.cd_snap.isgxid = pq_getmsgbyte(message);
+	if (!pending.cd_snap.isgxid)
+	{
+		raw = pq_getmsgbytes(message, sizeof (GTM_TransactionHandle));
+		if (raw == NULL)
+			ereport(ERROR,
+					(EPROTO,
+					 errmsg("Message does not contain valid Transaction Handle")));
+		memcpy(&pending.cd_snap.handle, raw, sizeof (GTM_TransactionHandle));
+	}
+	else
+	{
+		raw = pq_getmsgbytes(message, sizeof (GlobalTransactionId));
+		if (raw == NULL)
+			ereport(ERROR,
+					(EPROTO,
+					 errmsg("Message does not contain valid GXID")));
+		memcpy(&pending.cd_snap.gxid, raw, sizeof (GlobalTransactionId));
+	}
+
+	pq_getmsgend(message);
+	GTMProxy_CommandPending(conninfo, mtype, pending);
+}
+
+/*
+ * Handle a sequence-related command received from a client connection.
+ *
+ * Sequence messages are proxied as they are: GTMProxy_ProxyCommand() adds
+ * the connection identifier so that the response can be quickly sent back
+ * to the right backend. The message is written, but not flushed just yet.
+ */
+static void
+ProcessSeqeunceCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn,
+		GTM_MessageType mtype, StringInfo message)
+{
+	/*
+	 * Plain call: a "return <expr>;" is not allowed in a function returning
+	 * void, even when the expression itself has type void.
+	 */
+	GTMProxy_ProxyCommand(conninfo, gtm_conn, mtype, message);
+}
+
+/*
+ * Proxy the incoming message to the GTM server after adding our own identifier
+ * to it. The rest of the message is forwarded as it is without even reading
+ * its contents.
+ *
+ * conninfo - client connection the message came from; its con_id is sent in
+ *			  the proxy header
+ * gtm_conn - connection to the GTM server the message is written to
+ * mtype	- message type, forwarded right after the proxy header
+ * message	- incoming message; everything not yet consumed is forwarded
+ *
+ * The command is also appended to the calling thread's list of processed
+ * commands. Any failure to write is reported with elog(ERROR).
+ */
+static void
+GTMProxy_ProxyCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn,
+		GTM_MessageType mtype, StringInfo message)
+{
+	GTMProxy_CommandInfo *cmdinfo;
+	GTMProxy_ThreadInfo *thrinfo = GetMyThreadInfo;
+	GTM_ProxyMsgHeader proxyhdr;
+	int			unread_len;
+	const char *unread_data;
+
+	proxyhdr.ph_conid = conninfo->con_id;
+
+	/*
+	 * Fetch the unread length BEFORE consuming the bytes. The order in which
+	 * function arguments are evaluated is unspecified, so computing the
+	 * length as a sibling argument of pq_getmsgbytes() could observe the
+	 * cursor after it was advanced and forward zero bytes.
+	 */
+	unread_len = pq_getmsgunreadlen(message);
+	unread_data = pq_getmsgbytes(message, unread_len);
+
+	/* Start the message. */
+	if (gtmpqPutMsgStart('C', true, gtm_conn) ||
+		gtmpqPutnchar((char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader), gtm_conn) ||
+		gtmpqPutInt(mtype, sizeof (GTM_MessageType), gtm_conn) ||
+		gtmpqPutnchar(unread_data, unread_len, gtm_conn))
+		elog(ERROR, "Error proxing data");
+
+	/*
+	 * Add the message to the pending command list
+	 */
+	cmdinfo = palloc0(sizeof (GTMProxy_CommandInfo));
+	cmdinfo->ci_mtype = mtype;
+	cmdinfo->ci_conn = conninfo;
+	cmdinfo->ci_res_index = 0;
+	thrinfo->thr_processed_commands = lappend(thrinfo->thr_processed_commands, cmdinfo);
+
+	/* Finish the message. */
+	if (gtmpqPutMsgEnd(gtm_conn))
+		elog(ERROR, "Error finishing the message");
+}
+
+
+/*
+ * Queue an incoming command on the calling thread's per-message-type pending
+ * list. The commands collected for one type are later sent to the GTM server
+ * in a single message.
+ *
+ * conninfo - client connection that issued the command
+ * mtype	- message type; also the index of the pending list used
+ * cmd_data - decoded payload, copied into the queued entry
+ */
+static void
+GTMProxy_CommandPending(GTMProxy_ConnectionInfo *conninfo, GTM_MessageType mtype,
+		GTMProxy_CommandData cmd_data)
+{
+	GTMProxy_ThreadInfo *thrinfo = GetMyThreadInfo;
+	GTMProxy_CommandInfo *entry = palloc0(sizeof (GTMProxy_CommandInfo));
+
+	entry->ci_conn = conninfo;
+	entry->ci_mtype = mtype;
+	entry->ci_data = cmd_data;
+	entry->ci_res_index = 0;
+
+	thrinfo->thr_pending_commands[mtype] =
+		lappend(thrinfo->thr_pending_commands[mtype], entry);
+}
+
+/*
+ * Remember the coordinator ID announced by the client on its port structure.
+ */
+static void
+GTMProxy_RegisterCoordinator(GTMProxy_ConnectionInfo *conninfo, GTM_CoordinatorId cid)
+{
+	elog(DEBUG3, "Registering coordinator with cid %d", cid);
+
+	conninfo->con_port->coordinator_id = cid;
+}
+
+
+/*
+ * Unregister a coordinator. Currently a placeholder that does nothing; a
+ * clean shutdown of the connection is meant to happen here.
+ */
+static void
+GTMProxy_UnregisterCoordinator(GTMProxy_ConnectionInfo *conninfo, GTM_CoordinatorId cid)
+{
+	/* Do a clean shutdown (not implemented yet) */
+}
+
+
+/*
+ * Perform the initial handshake on a freshly accepted client connection.
+ *
+ * We expect a startup message at the very start: type byte 'A' followed by a
+ * length-prefixed body holding a GTM_StartupPacket. The coordinator ID from
+ * the packet is registered on the connection, a dummy authentication request
+ * ('R') is sent back because the client expects one in the current protocol,
+ * and the connection is marked authenticated.
+ *
+ * Any protocol violation (EOF, wrong type byte, short packet) is reported
+ * with ereport(ERROR).
+ */
+static void
+GTMProxy_HandshakeConnection(GTMProxy_ConnectionInfo *conninfo)
+{
+	/* int, not char, so that EOF stays distinguishable from a data byte */
+	int			startup_type;
+	const char *sp_data;
+	GTM_StartupPacket sp;
+	StringInfoData inBuf;
+	StringInfoData buf;
+
+	startup_type = pq_getbyte(conninfo->con_port);
+
+	if (startup_type == EOF)
+		ereport(ERROR,
+				(EPROTO,
+				 errmsg("Expecting a startup message, but received EOF")));
+
+	if (startup_type != 'A')
+		ereport(ERROR,
+				(EPROTO,
+				 errmsg("Expecting a startup message, but received %c",
+						startup_type)));
+
+	initStringInfo(&inBuf);
+
+	/*
+	 * All frontend messages have a length word next
+	 * after the type code; we can read the message contents independently of
+	 * the type.
+	 */
+	if (pq_getmessage(conninfo->con_port, &inBuf, 0))
+		ereport(ERROR,
+				(EPROTO,
+				 errmsg("Expecting coordinator ID, but received EOF")));
+
+	/*
+	 * Check the result before copying, as the other callers of
+	 * pq_getmsgbytes() in this file do.
+	 */
+	sp_data = pq_getmsgbytes(&inBuf, sizeof (GTM_StartupPacket));
+	if (sp_data == NULL)
+		ereport(ERROR,
+				(EPROTO,
+				 errmsg("Message does not contain valid startup packet")));
+	memcpy(&sp, sp_data, sizeof (GTM_StartupPacket));
+	pq_getmsgend(&inBuf);
+
+	GTMProxy_RegisterCoordinator(conninfo, sp.sp_cid);
+
+	/*
+	 * Send a dummy authentication request message 'R' as the client
+	 * expects that in the current protocol
+	 */
+	pq_beginmessage(&buf, 'R');
+	pq_endmessage(conninfo->con_port, &buf);
+	pq_flush(conninfo->con_port);
+
+	conninfo->con_authenticated = true;
+
+	elog(DEBUG3, "Sent connection authentication message to the client");
+}
+
+/*
+ * Tear down a client connection and tell the GTM server about it.
+ *
+ * The connection is flagged as disconnected, its socket is closed and its
+ * port structure freed. A MSG_BACKEND_DISCONNECT message carrying the
+ * connection's identifier is then written to gtm_conn.
+ */
+static void
+GTMProxy_HandleDisconnect(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn)
+{
+	GTM_ProxyMsgHeader hdr;
+
+	conninfo->con_disconnected = true;
+
+	/* Release the client side of the connection */
+	if (conninfo->con_port->sock > 0)
+	{
+		StreamClose(conninfo->con_port->sock);
+	}
+	ConnFree(conninfo->con_port);
+	conninfo->con_port = NULL;
+
+	hdr.ph_conid = conninfo->con_id;
+
+	/* Start the message. */
+	if (gtmpqPutMsgStart('C', true, gtm_conn) ||
+		gtmpqPutnchar((char *)&hdr, sizeof (GTM_ProxyMsgHeader), gtm_conn) ||
+		gtmpqPutInt(MSG_BACKEND_DISCONNECT, sizeof (GTM_MessageType), gtm_conn))
+	{
+		elog(ERROR, "Error proxing data");
+	}
+
+	/* Finish the message. */
+	if (gtmpqPutMsgEnd(gtm_conn))
+	{
+		elog(ERROR, "Error finishing the message");
+	}
+}
+
+/*
+ * Append one transaction identifier to the message being built on gtm_conn:
+ * a one-byte flag (true = GXID, false = transaction handle) followed by the
+ * raw bytes of the identifier selected by the flag.
+ */
+static void
+GTMProxy_PutTxnIdentifier(GTM_Conn *gtm_conn, bool isgxid,
+		GlobalTransactionId *gxid, GTM_TransactionHandle *handle)
+{
+	if (isgxid)
+	{
+		if (gtmpqPutc(true, gtm_conn) ||
+			gtmpqPutnchar((char *) gxid,
+						  sizeof (GlobalTransactionId), gtm_conn))
+			elog(ERROR, "Error sending data");
+	}
+	else
+	{
+		if (gtmpqPutc(false, gtm_conn) ||
+			gtmpqPutnchar((char *) handle,
+						  sizeof (GTM_TransactionHandle), gtm_conn))
+			elog(ERROR, "Error sending data");
+	}
+}
+
+/*
+ * Process all the pending messages now.
+ *
+ * For every message type that has queued commands, one grouped ("MULTI")
+ * message is written to the thread's GTM connection: proxy header with an
+ * invalid connection ID, the grouped message type, the number of commands,
+ * and then one entry per command. Each command records its position in the
+ * group in ci_res_index. Afterwards the whole pending list is moved to the
+ * thread's list of processed commands.
+ *
+ * Only BEGIN_GETGXID, COMMIT, ROLLBACK and SNAPSHOT_GET can be grouped; a
+ * pending command of any other type is reported with elog(ERROR) before
+ * anything is written for it.
+ */
+static void
+GTMProxy_ProcessPendingCommands(GTMProxy_ThreadInfo *thrinfo)
+{
+	int			ii;
+	GTM_Conn   *gtm_conn = thrinfo->thr_gtm_conn;
+
+	for (ii = 0; ii < MSG_TYPE_COUNT; ii++)
+	{
+		GTM_MessageType group_mtype;
+		GTM_ProxyMsgHeader proxyhdr;
+		ListCell   *elem = NULL;
+		int			res_index = 0;
+
+		if (list_length(thrinfo->thr_pending_commands[ii]) == 0)
+			continue;
+
+		/* Pick the grouped message type before starting to write */
+		switch (ii)
+		{
+			case MSG_TXN_BEGIN_GETGXID:
+				group_mtype = MSG_TXN_BEGIN_GETGXID_MULTI;
+				break;
+
+			case MSG_TXN_COMMIT:
+				group_mtype = MSG_TXN_COMMIT_MULTI;
+				break;
+
+			case MSG_TXN_ROLLBACK:
+				group_mtype = MSG_TXN_ROLLBACK_MULTI;
+				break;
+
+			case MSG_SNAPSHOT_GET:
+				group_mtype = MSG_SNAPSHOT_GET_MULTI;
+				break;
+
+			default:
+				elog(ERROR, "This message type (%d) can not be grouped together", ii);
+				continue;
+		}
+
+		/*
+		 * Start a new group message and fill in the headers
+		 */
+		proxyhdr.ph_conid = InvalidGTMProxyConnID;
+
+		if (gtmpqPutMsgStart('C', true, gtm_conn) ||
+			gtmpqPutnchar((char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader), gtm_conn))
+			elog(ERROR, "Error proxing data");
+
+		if (gtmpqPutInt(group_mtype, sizeof (GTM_MessageType), gtm_conn) ||
+			gtmpqPutInt(list_length(thrinfo->thr_pending_commands[ii]), sizeof(int), gtm_conn))
+			elog(ERROR, "Error sending data");
+
+		foreach (elem, thrinfo->thr_pending_commands[ii])
+		{
+			GTMProxy_CommandInfo *cmdinfo = (GTMProxy_CommandInfo *)lfirst(elem);
+
+			Assert(cmdinfo->ci_mtype == ii);
+			cmdinfo->ci_res_index = res_index++;
+
+			switch (ii)
+			{
+				case MSG_TXN_BEGIN_GETGXID:
+					if (gtmpqPutInt(cmdinfo->ci_data.cd_beg.iso_level,
+									sizeof (GTM_IsolationLevel), gtm_conn) ||
+						gtmpqPutc(cmdinfo->ci_data.cd_beg.rdonly, gtm_conn) ||
+						gtmpqPutInt(cmdinfo->ci_conn->con_id,
+									sizeof (GTMProxy_ConnID), gtm_conn))
+						elog(ERROR, "Error sending data");
+					break;
+
+				case MSG_SNAPSHOT_GET:
+					/*
+					 * Snapshot requests are stored in cd_snap by
+					 * ProcessSnapshotCommand(), so read them from there
+					 * rather than from cd_rc.
+					 */
+					GTMProxy_PutTxnIdentifier(gtm_conn,
+											  cmdinfo->ci_data.cd_snap.isgxid,
+											  &cmdinfo->ci_data.cd_snap.gxid,
+											  &cmdinfo->ci_data.cd_snap.handle);
+					break;
+
+				default:
+					/* MSG_TXN_COMMIT and MSG_TXN_ROLLBACK */
+					GTMProxy_PutTxnIdentifier(gtm_conn,
+											  cmdinfo->ci_data.cd_rc.isgxid,
+											  &cmdinfo->ci_data.cd_rc.gxid,
+											  &cmdinfo->ci_data.cd_rc.handle);
+					break;
+			}
+		}
+
+		/* Finish the message. */
+		if (gtmpqPutMsgEnd(gtm_conn))
+			elog(ERROR, "Error finishing the message");
+
+		/*
+		 * Move the entire list to the processed command
+		 */
+		thrinfo->thr_processed_commands = list_concat(thrinfo->thr_processed_commands,
+													  thrinfo->thr_pending_commands[ii]);
+		thrinfo->thr_pending_commands[ii] = NIL;
+	}
+}
+
+/*
+ * Validate the proposed data directory (GTMProxyDataDir).
+ *
+ * A missing directory is created with mode 0700. The path must then be a
+ * directory and, except on Windows/Cygwin, must be owned by the effective
+ * user. Every failure is reported as FATAL.
+ */
+static void
+checkDataDir(void)
+{
+	struct stat dir_stat;
+
+	Assert(GTMProxyDataDir);
+
+	/* Loop so that the directory is stat'ed again after it was created */
+	for (;;)
+	{
+		if (stat(GTMProxyDataDir, &dir_stat) == 0)
+			break;
+
+		if (errno == ENOENT)
+		{
+			if (mkdir(GTMProxyDataDir, 0700) != 0)
+			{
+				ereport(FATAL,
+						(errno,
+						 errmsg("failed to create the directory \"%s\"",
+								GTMProxyDataDir)));
+			}
+			continue;
+		}
+
+		ereport(FATAL,
+				(EPERM,
+				 errmsg("could not read permissions of directory \"%s\": %m",
+						GTMProxyDataDir)));
+		break;
+	}
+
+	/* eventual chdir would fail anyway, but let's test ... */
+	if (!S_ISDIR(dir_stat.st_mode))
+	{
+		ereport(FATAL,
+				(EINVAL,
+				 errmsg("specified data directory \"%s\" is not a directory",
+						GTMProxyDataDir)));
+	}
+
+	/*
+	 * Check that the directory belongs to my userid; if not, reject.
+	 *
+	 * This check is an essential part of the interlock that prevents two
+	 * servers from starting in the same directory (see CreateLockFile()).
+	 * Do not remove or weaken it.
+	 *
+	 * XXX can we safely enable this check on Windows?
+	 */
+#if !defined(WIN32) && !defined(__CYGWIN__)
+	if (geteuid() != dir_stat.st_uid)
+	{
+		ereport(FATAL,
+				(EINVAL,
+				 errmsg("data directory \"%s\" has wrong ownership",
+						GTMProxyDataDir),
+				 errhint("The server must be started by the user that owns the data directory.")));
+	}
+#endif
+}
+
+/*
+ * Make GTMProxyDataDir the current working directory, so that later code
+ * can use paths relative to the data directory. Failure is FATAL.
+ */
+static void
+ChangeToDataDir(void)
+{
+	int			rc = chdir(GTMProxyDataDir);
+
+	if (rc < 0)
+	{
+		ereport(FATAL,
+				(EINVAL,
+				 errmsg("could not change directory to \"%s\": %m",
+						GTMProxyDataDir)));
+	}
+}
+
+/*
+ * Create the data directory lockfile.
+ *
+ * When this is called, we must have already switched the working
+ * directory to DataDir, so we can just use a relative path. This
+ * helps ensure that we are locking the directory we should be.
+ */
+static void
+CreateDataDirLockFile(void)
+{
+	/* GTM_PID_FILE is relative; the directory name is only used in messages */
+	CreateLockFile(GTM_PID_FILE, GTMProxyDataDir);
+}
+
+/*
+ * Create a lockfile.
+ *
+ * filename is the name of the lockfile to create.
+ * refName is the data directory name quoted in the hint that is printed when
+ * the lockfile belongs to a live process.
+ *
+ * The file is created exclusively. If it already exists, the PID stored in
+ * it is read; when that process is still alive the call fails, otherwise the
+ * stale file is removed and creation is retried. On success the file
+ * contains our PID on the first line and GTMProxyDataDir on the second.
+ * Every failure is reported as FATAL.
+ */
+static void
+CreateLockFile(const char *filename, const char *refName)
+{
+ int fd;
+ char buffer[MAXPGPATH + 100];
+ int ntries;
+ int len;
+ int encoded_pid;
+ pid_t other_pid;
+ pid_t my_pid = getpid();
+
+ /*
+ * We need a loop here because of race conditions. But don't loop forever
+ * (for example, a non-writable $PGDATA directory might cause a failure
+ * that won't go away). 100 tries seems like plenty.
+ */
+ for (ntries = 0;; ntries++)
+ {
+ /*
+ * Try to create the lock file --- O_EXCL makes this atomic.
+ *
+ * Think not to make the file protection weaker than 0600. See
+ * comments below.
+ */
+ fd = open(filename, O_RDWR | O_CREAT | O_EXCL, 0600);
+ if (fd >= 0)
+ break; /* Success; exit the retry loop */
+
+ /*
+ * Couldn't create the pid file. Probably it already exists.
+ */
+ if ((errno != EEXIST && errno != EACCES) || ntries > 100)
+ ereport(FATAL,
+ (EINVAL,
+ errmsg("could not create lock file \"%s\": %m",
+ filename)));
+
+ /*
+ * Read the file to get the old owner's PID. Note race condition
+ * here: file might have been deleted since we tried to create it.
+ */
+ fd = open(filename, O_RDONLY, 0600);
+ if (fd < 0)
+ {
+ if (errno == ENOENT)
+ continue; /* race condition; try again */
+ ereport(FATAL,
+ (EINVAL,
+ errmsg("could not open lock file \"%s\": %m",
+ filename)));
+ }
+ /* Leave room for the terminating NUL added below */
+ if ((len = read(fd, buffer, sizeof(buffer) - 1)) < 0)
+ ereport(FATAL,
+ (EINVAL,
+ errmsg("could not read lock file \"%s\": %m",
+ filename)));
+ close(fd);
+
+ /* The PID is the leading decimal number in the file */
+ buffer[len] = '\0';
+ encoded_pid = atoi(buffer);
+ other_pid = (pid_t) encoded_pid;
+
+ if (other_pid <= 0)
+ elog(FATAL, "bogus data in lock file \"%s\": \"%s\"",
+ filename, buffer);
+
+ /*
+ * Check to see if the other process still exists
+ *
+ * If the PID in the lockfile is our own PID or our parent's PID, then
+ * the file must be stale (probably left over from a previous system
+ * boot cycle). We need this test because of the likelihood that a
+ * reboot will assign exactly the same PID as we had in the previous
+ * reboot. Also, if there is just one more process launch in this
+ * reboot than in the previous one, the lockfile might mention our
+ * parent's PID. We can reject that since we'd never be launched
+ * directly by a competing postmaster. We can't detect grandparent
+ * processes unfortunately, but if the init script is written
+ * carefully then all but the immediate parent shell will be
+ * root-owned processes and so the kill test will fail with EPERM.
+ *
+ * We can treat the EPERM-error case as okay because that error
+ * implies that the existing process has a different userid than we
+ * do, which means it cannot be a competing postmaster. A postmaster
+ * cannot successfully attach to a data directory owned by a userid
+ * other than its own. (This is now checked directly in
+ * checkDataDir(), but has been true for a long time because of the
+ * restriction that the data directory isn't group- or
+ * world-accessible.) Also, since we create the lockfiles mode 600,
+ * we'd have failed above if the lockfile belonged to another userid
+ * --- which means that whatever process kill() is reporting about
+ * isn't the one that made the lockfile. (NOTE: this last
+ * consideration is the only one that keeps us from blowing away a
+ * Unix socket file belonging to an instance of Postgres being run by
+ * someone else, at least on machines where /tmp hasn't got a
+ * stickybit.)
+ *
+ * Windows hasn't got getppid(), but doesn't need it since it's not
+ * using real kill() either...
+ *
+ * Normally kill() will fail with ESRCH if the given PID doesn't
+ * exist.
+ */
+ if (other_pid != my_pid
+#ifndef WIN32
+ && other_pid != getppid()
+#endif
+ )
+ {
+ /* Signal 0 only probes for the process; nothing is delivered */
+ if (kill(other_pid, 0) == 0 ||
+ (errno != ESRCH && errno != EPERM))
+ {
+ /* lockfile belongs to a live process */
+ ereport(FATAL,
+ (EINVAL,
+ errmsg("lock file \"%s\" already exists",
+ filename),
+ errhint("Is another GTM proxy (PID %d) running in data directory \"%s\"?",
+ (int) other_pid, refName)));
+ }
+ }
+
+ /*
+ * Looks like nobody's home. Unlink the file and try again to create
+ * it. Need a loop because of possible race condition against other
+ * would-be creators.
+ */
+ if (unlink(filename) < 0)
+ ereport(FATAL,
+ (EACCES,
+ errmsg("could not remove old lock file \"%s\": %m",
+ filename),
+ errhint("The file seems accidentally left over, but "
+ "it could not be removed. Please remove the file "
+ "by hand and try again.")));
+ }
+
+ /*
+ * Successfully created the file, now fill it.
+ */
+ snprintf(buffer, sizeof(buffer), "%d\n%s\n",
+ (int) my_pid, GTMProxyDataDir);
+ /* Clear errno so that a short write which leaves it unset can be detected */
+ errno = 0;
+ if (write(fd, buffer, strlen(buffer)) != strlen(buffer))
+ {
+ /* close() and unlink() may clobber errno; keep the write's value for %m */
+ int save_errno = errno;
+
+ close(fd);
+ unlink(filename);
+ /* if write didn't set errno, assume problem is no disk space */
+ errno = save_errno ? save_errno : ENOSPC;
+ ereport(FATAL,
+ (EACCES,
+ errmsg("could not write lock file \"%s\": %m", filename)));
+ }
+ if (close(fd))
+ {
+ /* Preserve close()'s errno across unlink() for the %m below */
+ int save_errno = errno;
+
+ unlink(filename);
+ errno = save_errno;
+ ereport(FATAL,
+ (EACCES,
+ errmsg("could not write lock file \"%s\": %m", filename)));
+ }
+
+}
+
+/*
+ * Create the opts file: write the command-line arguments (argv[1] onwards,
+ * each one double-quoted and preceded by a space) on a single line of
+ * "gtm_proxy.opts".
+ *
+ * Returns true on success. If the file cannot be created or closed, the
+ * problem is logged and false is returned.
+ */
+static bool
+CreateOptsFile(int argc, char *argv[])
+{
+	FILE	   *opts_fp;
+	int			argno;
+
+#define OPTS_FILE "gtm_proxy.opts"
+
+	opts_fp = fopen(OPTS_FILE, "w");
+	if (opts_fp == NULL)
+	{
+		elog(LOG, "could not create file \"%s\": %m", OPTS_FILE);
+		return false;
+	}
+
+	argno = 1;
+	while (argno < argc)
+	{
+		fprintf(opts_fp, " \"%s\"", argv[argno]);
+		argno++;
+	}
+	fputs("\n", opts_fp);
+
+	/* fclose() flushes, so a failure here means the data was not written */
+	if (fclose(opts_fp) != 0)
+	{
+		elog(LOG, "could not write file \"%s\": %m", OPTS_FILE);
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Delete the given lock (pid) file. Failure to remove it is FATAL.
+ */
+static void
+DeleteLockFile(const char *filename)
+{
+	int			rc = unlink(filename);
+
+	if (rc < 0)
+	{
+		ereport(FATAL,
+				(EACCES,
+				 errmsg("could not remove old lock file \"%s\": %m",
+						filename),
+				 errhint("The file seems accidentally left over, but "
+						 "it could not be removed. Please remove the file "
+						 "by hand and try again.")));
+	}
+}
diff --git a/src/gtm/proxy/proxy_thread.c b/src/gtm/proxy/proxy_thread.c
new file mode 100644
index 0000000000..844f2f70b4
--- /dev/null
+++ b/src/gtm/proxy/proxy_thread.c
@@ -0,0 +1,451 @@
+/*-------------------------------------------------------------------------
+ *
+ * proxy_thread.c
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <pthread.h>
+#include "gtm/gtm_proxy.h"
+#include "gtm/memutils.h"
+#include "gtm/libpq.h"
+
+/* File-local thread entry wrapper and the cleanup handler it installs */
+static void *GTMProxy_ThreadMainWrapper(void *argp);
+static void GTMProxy_ThreadCleanup(void *argp);
+
+/*
+ * Global registry of proxy threads. GTMProxyThreads always points at the
+ * single GTMProxyThreadsData instance.
+ */
+GTMProxy_Threads GTMProxyThreadsData;
+GTMProxy_Threads *GTMProxyThreads = &GTMProxyThreadsData;
+
+#define GTM_PROXY_MIN_THREADS 32 /* Provision for minimum threads */
+#define GTM_PROXY_MAX_THREADS 1024 /* Max threads allowed in the GTMProxy */
+/* True when the number of registered threads equals the array capacity */
+#define GTMProxyThreadsFull (GTMProxyThreads->gt_thread_count == GTMProxyThreads->gt_array_size)
+
+/*
+ * Add the given thrinfo structure to the global array, expanding it if
+ * necessary.
+ *
+ * The array starts at GTM_PROXY_MIN_THREADS entries and is doubled whenever
+ * it is full, up to GTM_PROXY_MAX_THREADS.
+ *
+ * Returns the slot index assigned to the thread, which is also stored in
+ * thrinfo->thr_localid. If no slot can be found -1 is returned; if the array
+ * is already at its maximum size an error is raised. In both cases the
+ * global lock is released first.
+ */
+int
+GTMProxy_ThreadAdd(GTMProxy_ThreadInfo *thrinfo)
+{
+	int			ii;
+	int			slot = -1;
+
+	GTM_RWLockAcquire(&GTMProxyThreads->gt_lock, GTM_LOCKMODE_WRITE);
+
+	if (GTMProxyThreadsFull)
+	{
+		GTMProxy_ThreadInfo **threads;
+		uint32		newsize;
+
+		/*
+		 * TODO Optimize lock management by not holding any locks during memory
+		 * allocation
+		 */
+		if (GTMProxyThreads->gt_array_size == GTM_PROXY_MAX_THREADS)
+		{
+			/* Don't leave the global lock held when bailing out */
+			GTM_RWLockRelease(&GTMProxyThreads->gt_lock);
+			elog(ERROR, "Too many threads active");
+			return -1;
+		}
+
+		if (GTMProxyThreads->gt_array_size == 0)
+			newsize = GTM_PROXY_MIN_THREADS;
+		else
+		{
+			/*
+			 * We ran out of the array size. Just double the size, bound by the
+			 * upper limit
+			 */
+			newsize = GTMProxyThreads->gt_array_size * 2;
+		}
+
+		/* Can't have more than GTM_PROXY_MAX_THREADS */
+		if (newsize > GTM_PROXY_MAX_THREADS)
+			newsize = GTM_PROXY_MAX_THREADS;
+
+		/* Allocate the zeroed, larger array and carry over the old entries */
+		threads = (GTMProxy_ThreadInfo **)palloc0(sizeof (GTMProxy_ThreadInfo *) * newsize);
+		if (GTMProxyThreads->gt_threads != NULL)
+		{
+			memcpy(threads, GTMProxyThreads->gt_threads,
+				   GTMProxyThreads->gt_array_size * sizeof (GTMProxy_ThreadInfo *));
+			pfree(GTMProxyThreads->gt_threads);
+		}
+
+		GTMProxyThreads->gt_threads = threads;
+		GTMProxyThreads->gt_array_size = newsize;
+	}
+
+	/*
+	 * Now that we have free entries in the array, find a free slot and add the
+	 * thrinfo pointer to it.
+	 *
+	 * TODO Optimize this later by tracking few free slots and reusing them.
+	 * The free slots can be updated when a thread exits and reused when a new
+	 * thread is added to the pool.
+	 */
+	for (ii = 0; ii < GTMProxyThreads->gt_array_size; ii++)
+	{
+		if (GTMProxyThreads->gt_threads[ii] == NULL)
+		{
+			GTMProxyThreads->gt_threads[ii] = thrinfo;
+			GTMProxyThreads->gt_thread_count++;
+
+			/*
+			 * Track the slot information in the thrinfo. This is useful to
+			 * quickly find the slot given the thrinfo structure. Done while
+			 * the lock is still held, since the entry is already visible in
+			 * the global array.
+			 */
+			thrinfo->thr_localid = ii;
+			slot = ii;
+			break;
+		}
+	}
+	GTM_RWLockRelease(&GTMProxyThreads->gt_lock);
+
+	/* -1 tells the caller that no slot was available */
+	return slot;
+}
+
+/*
+ * Remove a thread from the global array.
+ *
+ * XXX To be implemented; currently does nothing and returns 0.
+ */
+int
+GTMProxy_ThreadRemove(GTMProxy_ThreadInfo *thrinfo)
+{
+	int			result = 0;
+
+	return result;
+}
+
+/*
+ * Create a new proxy worker thread.
+ *
+ * This function allocates and initializes the thread's GTMProxy_ThreadInfo,
+ * registers it in the global thread array, sets up the thread's memory
+ * contexts and finally starts the thread.
+ *
+ * Upon successful creation, the thread will start running the given
+ * "startroutine" (through GTMProxy_ThreadMainWrapper). The thread
+ * information is returned to the caller.
+ */
+GTMProxy_ThreadInfo *
+GTMProxy_ThreadCreate(void *(* startroutine)(void *))
+{
+	GTMProxy_ThreadInfo *newthr;
+	int			rc;
+
+	/*
+	 * We are still running in the context of the main thread, so this
+	 * allocation lasts as long as the main thread exists or until the memory
+	 * is explicitly freed.
+	 */
+	newthr = (GTMProxy_ThreadInfo *)palloc0(sizeof (GTMProxy_ThreadInfo));
+
+	GTM_MutexLockInit(&newthr->thr_lock);
+	GTM_CVInit(&newthr->thr_cv);
+
+	/*
+	 * Start out as GTM_PROXY_THREAD_STARTING; the thread itself changes the
+	 * status once it actually starts executing.
+	 */
+	newthr->thr_status = GTM_PROXY_THREAD_STARTING;
+
+	/*
+	 * Install the ThreadInfo structure in the global array before starting
+	 * the thread.
+	 */
+	if (GTMProxy_ThreadAdd(newthr) == -1)
+	{
+		elog(ERROR, "Error starting a new thread");
+	}
+
+	/*
+	 * Set up the memory contexts before actually starting the thread.
+	 *
+	 * The thread's top context is a child of TopMemoryContext and lasts as
+	 * long as the main process or this thread lives. It is not shared with
+	 * other threads.
+	 */
+	newthr->thr_thread_context = AllocSetContextCreate(TopMemoryContext,
+													   "TopMemoryContext",
+													   ALLOCSET_DEFAULT_MINSIZE,
+													   ALLOCSET_DEFAULT_INITSIZE,
+													   ALLOCSET_DEFAULT_MAXSIZE,
+													   false);
+
+	/*
+	 * Since the thread is not yet started, TopMemoryContext still points to
+	 * the context of the calling thread.
+	 */
+	newthr->thr_parent_context = TopMemoryContext;
+
+	/*
+	 * Each thread gets its own ErrorContext, a child of the ErrorContext of
+	 * the main process. It is thread-specific and not shared with other
+	 * threads.
+	 */
+	newthr->thr_error_context = AllocSetContextCreate(ErrorContext,
+													  "ErrorContext",
+													  8 * 1024,
+													  8 * 1024,
+													  8 * 1024,
+													  false);
+
+	newthr->thr_startroutine = startroutine;
+
+	/*
+	 * Now start the thread. It runs the wrapper, which in turn calls the
+	 * given "startroutine" with the thrinfo structure as its argument. Any
+	 * additional parameters should be passed via the thrinfo structure.
+	 */
+	rc = pthread_create(&newthr->thr_id, NULL, GTMProxy_ThreadMainWrapper,
+						newthr);
+	if (rc)
+	{
+		ereport(ERROR,
+				(rc,
+				 errmsg("Failed to create a new thread: error %d", rc)));
+	}
+
+	return newthr;
+}
+
+/*
+ * Exit the current thread.
+ *
+ * XXX To be implemented; currently does nothing.
+ */
+void
+GTMProxy_ThreadExit(void)
+{
+	return;
+}
+
+/*
+ * Wait for the given thread to terminate.
+ *
+ * Returns the result of pthread_join(); the thread's exit value is
+ * discarded.
+ */
+int
+GTMProxy_ThreadJoin(GTMProxy_ThreadInfo *thrinfo)
+{
+	void	   *exit_value;
+
+	return pthread_join(thrinfo->thr_id, &exit_value);
+}
+
+/*
+ * Get thread information for the given thread, identified by the
+ * thread_id.
+ *
+ * Not implemented yet: always returns NULL.
+ */
+GTMProxy_ThreadInfo *
+GTMProxy_GetThreadInfo(GTM_ThreadID thrid)
+{
+	GTMProxy_ThreadInfo *found = NULL;
+
+	return found;
+}
+
+/*
+ * Cleanup routine for the thread, installed by GTMProxy_ThreadMainWrapper.
+ *
+ * argp is the thread's GTMProxy_ThreadInfo. The thread's connection socket
+ * is closed, its message, error and top memory contexts are deleted, and
+ * the thread-specific info pointer is reset.
+ */
+static void
+GTMProxy_ThreadCleanup(void *argp)
+{
+	GTMProxy_ThreadInfo *self = (GTMProxy_ThreadInfo *)argp;
+
+	elog(LOG, "Cleaning up thread state");
+
+	/*
+	 * TODO Close the open connection.
+	 */
+	StreamClose(self->thr_conn->con_port->sock);
+
+	/*
+	 * Switch to the memory context of the main process so that our own
+	 * memory contexts can be freed easily.
+	 *
+	 * XXX We don't setup cleanup handlers for the main process. So this
+	 * routine would never be called for the main process/thread
+	 */
+	MemoryContextSwitchTo(self->thr_parent_context);
+
+	/* Drop the per-thread contexts and forget the dangling pointers */
+	MemoryContextDelete(self->thr_message_context);
+	self->thr_message_context = NULL;
+
+	MemoryContextDelete(self->thr_error_context);
+	self->thr_error_context = NULL;
+
+	MemoryContextDelete(self->thr_thread_context);
+	self->thr_thread_context = NULL;
+
+	/*
+	 * TODO Now cleanup the thrinfo structure itself and remove it from the
+	 * global array.
+	 */
+
+	/*
+	 * Reset the thread-specific information. This should be done only after
+	 * we are sure that the memory contexts are not required any more.
+	 *
+	 * Note: elog calls need memory contexts, so no elog calls beyond this
+	 * point.
+	 */
+	SetMyThreadInfo(NULL);
+}
+
+/*
+ * A wrapper around the start routine of the thread. This helps us doing any
+ * initialization and setting up cleanup handlers before the main routine is
+ * started.
+ *
+ * argp is the GTMProxy_ThreadInfo of the new thread. The thread detaches
+ * itself, installs its thread info, switches to TopMemoryContext and runs
+ * thr_startroutine with GTMProxy_ThreadCleanup registered as cleanup
+ * handler; the handler is also run when the start routine returns normally.
+ *
+ * Returns the thread info pointer.
+ */
+static void *
+GTMProxy_ThreadMainWrapper(void *argp)
+{
+	GTMProxy_ThreadInfo *thrinfo = (GTMProxy_ThreadInfo *)argp;
+
+	/*
+	 * Detach using pthread_self() rather than thrinfo->thr_id: the creating
+	 * thread may not have stored the ID returned by pthread_create() yet when
+	 * we get to run.
+	 */
+	pthread_detach(pthread_self());
+
+	SetMyThreadInfo(thrinfo);
+	MemoryContextSwitchTo(TopMemoryContext);
+
+	pthread_cleanup_push(GTMProxy_ThreadCleanup, thrinfo);
+	thrinfo->thr_startroutine(thrinfo);
+	pthread_cleanup_pop(1);		/* non-zero: run the cleanup handler now */
+
+	return thrinfo;
+}
+
+/*
+ * Add the given connection info structure to a thread which is selected in a
+ * round-robin manner. The caller is responsible for only accepting the
+ * connection. Other things including the authentication are done by the
+ * worker thread when it finds a new entry in the connection list.
+ *
+ * Returns the reference to the GTMProxy_ThreadInfo structure of the thread
+ * which will be serving this connection. An error is raised if there is no
+ * worker thread to pick or if the chosen thread already serves
+ * GTM_PROXY_MAX_CONNECTIONS connections.
+ */
+GTMProxy_ThreadInfo *
+GTMProxy_ThreadAddConnection(GTMProxy_ConnectionInfo *conninfo)
+{
+	GTMProxy_ThreadInfo *thrinfo = NULL;
+
+	/*
+	 * Get the next thread in the queue
+	 */
+	GTM_RWLockAcquire(&GTMProxyThreads->gt_lock, GTM_LOCKMODE_WRITE);
+
+	/*
+	 * Always start with thread 1 because thread 0 is the main thread
+	 */
+	if (GTMProxyThreads->gt_next_worker == 0)
+		GTMProxyThreads->gt_next_worker = 1;
+
+	/* Only index the array when the slot actually exists */
+	if (GTMProxyThreads->gt_threads != NULL &&
+		GTMProxyThreads->gt_next_worker < GTMProxyThreads->gt_array_size)
+		thrinfo = GTMProxyThreads->gt_threads[GTMProxyThreads->gt_next_worker];
+
+	/*
+	 * Set the next worker thread before releasing the lock. Use >= so that
+	 * the index wraps around even if it somehow got past the thread count.
+	 */
+	GTMProxyThreads->gt_next_worker++;
+	if (GTMProxyThreads->gt_next_worker >= GTMProxyThreads->gt_thread_count)
+		GTMProxyThreads->gt_next_worker = 1;
+
+	GTM_RWLockRelease(&GTMProxyThreads->gt_lock);
+
+	/* No worker registered in the selected slot: nothing can serve this */
+	if (thrinfo == NULL)
+	{
+		elog(ERROR, "No worker thread available");
+		return NULL;
+	}
+
+	/*
+	 * Lock the thrinfo structure to safely add the new connection to the
+	 * thread structure. The thread will see the connection when it queries the
+	 * socket descriptor in the next cycle
+	 */
+	GTM_MutexLockAcquire(&thrinfo->thr_lock);
+
+	if (thrinfo->thr_conn_count >= GTM_PROXY_MAX_CONNECTIONS)
+	{
+		GTM_MutexLockRelease(&thrinfo->thr_lock);
+		elog(ERROR, "Too many connections");
+	}
+
+	/*
+	 * Save the array slotid in the conninfo structure. We send this to the GTM
+	 * server as an identifier which the GTM server sends us back in the
+	 * response. We use that information to route the response back to the
+	 * appropriate connection
+	 */
+	conninfo->con_id = thrinfo->thr_conn_count;
+	thrinfo->thr_all_conns[thrinfo->thr_conn_count] = conninfo;
+	thrinfo->thr_conn_count++;
+
+	/*
+	 * Now increment the seqno since a new connection is added to the array.
+	 * Before we do the next poll(), the fd array will be forced to be
+	 * reconstructed.
+	 */
+	thrinfo->thr_seqno++;
+
+	/*
+	 * Signal the worker thread if it is waiting for connections to be added
+	 * to its queue
+	 *
+	 * XXX May be we can first check the condition that this is the first
+	 * connection in the array and also use signal instead of a bcast since
+	 * only one thread is waiting on the cv.
+	 */
+	GTM_CVBcast(&thrinfo->thr_cv);
+	GTM_MutexLockRelease(&thrinfo->thr_lock);
+
+	return thrinfo;
+}
+
+/*
+ * Remove the connection from the thread's connection array and keep the
+ * array compact.
+ *
+ * The freed slot is filled with the last entry of the array, whose con_id
+ * is updated to its new position. Returns 0; an unknown connection is
+ * reported with elog(ERROR).
+ */
+int
+GTMProxy_ThreadRemoveConnection(GTMProxy_ThreadInfo *thrinfo, GTMProxy_ConnectionInfo *conninfo)
+{
+	int			slot;
+	int			last;
+
+	/*
+	 * Lock the thrinfo structure to safely remove the connection from the
+	 * thread structure.
+	 */
+	GTM_MutexLockAcquire(&thrinfo->thr_lock);
+
+	/* Locate the slot holding this connection */
+	slot = 0;
+	while (slot < thrinfo->thr_conn_count &&
+		   thrinfo->thr_all_conns[slot] != conninfo)
+		slot++;
+
+	if (slot >= thrinfo->thr_conn_count)
+	{
+		GTM_MutexLockRelease(&thrinfo->thr_lock);
+		elog(ERROR, "No such connection");
+	}
+
+	last = thrinfo->thr_conn_count - 1;
+
+	/*
+	 * Unless the connection already sits in the last slot, move the last
+	 * entry into the hole and adjust its con_id to reflect its new slot.
+	 */
+	if (slot < last)
+	{
+		thrinfo->thr_all_conns[slot] = thrinfo->thr_all_conns[last];
+		thrinfo->thr_all_conns[last] = NULL;
+		thrinfo->thr_all_conns[slot]->con_id = slot;
+	}
+	else
+	{
+		/* This is the last entry in the array. Just mark it free */
+		thrinfo->thr_all_conns[slot] = NULL;
+	}
+
+	thrinfo->thr_conn_count--;
+
+	/*
+	 * Increment the seqno to ensure that the next time before we poll, the fd
+	 * array is reconstructed.
+	 */
+	thrinfo->thr_seqno++;
+	GTM_MutexLockRelease(&thrinfo->thr_lock);
+
+	return 0;
+}
diff --git a/src/include/access/gtm.h b/src/include/access/gtm.h
new file mode 100644
index 0000000000..66ca3f12c6
--- /dev/null
+++ b/src/include/access/gtm.h
@@ -0,0 +1,33 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm.h
+ *
+ * Module interfacing with GTM definitions
+ *
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef ACCESS_GTM_H
+#define ACCESS_GTM_H
+
+#include "gtm/gtm_c.h"
+
+/* Configuration variables */
+extern char *GtmHost;
+extern int GtmPort;
+extern int GtmCoordinatorId;
+
+extern bool IsGTMConnected(void);
+extern void InitGTM(void);
+extern void CloseGTM(void);
+extern GlobalTransactionId BeginTranGTM(void);
+extern GlobalTransactionId BeginTranAutovacuumGTM(void);
+extern int CommitTranGTM(GlobalTransactionId gxid);
+extern int RollbackTranGTM(GlobalTransactionId gxid);
+extern GTM_Snapshot GetSnapshotGTM(GlobalTransactionId gxid, bool canbe_grouped);
+extern GTM_Sequence GetNextValGTM(char *seqname);
+extern int CreateSequenceGTM(char *seqname, GTM_Sequence increment,
+ GTM_Sequence minval, GTM_Sequence maxval, GTM_Sequence startval,
+ bool cycle);
+extern int DropSequenceGTM(char *seqname);
+#endif /* ACCESS_GTM_H */
diff --git a/src/include/access/transam.h b/src/include/access/transam.h
index b23a663c53..a7a8230595 100644
--- a/src/include/access/transam.h
+++ b/src/include/access/transam.h
@@ -6,6 +6,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* $PostgreSQL: pgsql/src/include/access/transam.h,v 1.68 2009/05/08 03:21:35 momjian Exp $
*
@@ -152,6 +153,11 @@ extern TransactionId TransactionIdLatest(TransactionId mainxid,
extern XLogRecPtr TransactionIdGetCommitLSN(TransactionId xid);
/* in transam/varsup.c */
+#ifdef PGXC /* PGXC_DATANODE */
+extern void SetNextTransactionId(TransactionId xid);
+extern void SetForceXidFromGTM(bool value);
+extern bool GetForceXidFromGTM(void);
+#endif /* PGXC */
extern TransactionId GetNewTransactionId(bool isSubXact);
extern TransactionId ReadNewTransactionId(void);
extern void SetTransactionIdLimit(TransactionId oldest_datfrozenxid,
diff --git a/src/include/access/xact.h b/src/include/access/xact.h
index 880b41b707..7cd8e165ec 100644
--- a/src/include/access/xact.h
+++ b/src/include/access/xact.h
@@ -6,6 +6,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* $PostgreSQL: pgsql/src/include/access/xact.h,v 1.98 2009/06/11 14:49:09 momjian Exp $
*
@@ -18,7 +19,9 @@
#include "nodes/pg_list.h"
#include "storage/relfilenode.h"
#include "utils/timestamp.h"
-
+#ifdef PGXC /* PGXC_COORD */
+#include "gtm/gtm_c.h"
+#endif
/*
* Xact isolation levels
@@ -145,6 +148,9 @@ extern TransactionId GetTopTransactionId(void);
extern TransactionId GetTopTransactionIdIfAny(void);
extern TransactionId GetCurrentTransactionId(void);
extern TransactionId GetCurrentTransactionIdIfAny(void);
+#ifdef PGXC /* PGXC_COORD */
+extern GlobalTransactionId GetCurrentGlobalTransactionId(void);
+#endif
extern SubTransactionId GetCurrentSubTransactionId(void);
extern CommandId GetCurrentCommandId(bool used);
extern TimestampTz GetCurrentTransactionStartTimestamp(void);
diff --git a/src/include/bootstrap/bootstrap.h b/src/include/bootstrap/bootstrap.h
index ab549eabb1..e8f96604ad 100644
--- a/src/include/bootstrap/bootstrap.h
+++ b/src/include/bootstrap/bootstrap.h
@@ -6,6 +6,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* $PostgreSQL: pgsql/src/include/bootstrap/bootstrap.h,v 1.51 2009/01/01 17:23:56 momjian Exp $
*
@@ -71,6 +72,9 @@ typedef enum
StartupProcess,
BgWriterProcess,
WalWriterProcess
+#ifdef PGXC
+ ,PoolerProcess
+#endif
} AuxProcType;
#endif /* BOOTSTRAP_H */
diff --git a/src/include/catalog/dependency.h b/src/include/catalog/dependency.h
index fe04aab964..b2af292585 100644
--- a/src/include/catalog/dependency.h
+++ b/src/include/catalog/dependency.h
@@ -6,6 +6,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* $PostgreSQL: pgsql/src/include/catalog/dependency.h,v 1.40 2009/06/11 14:49:09 momjian Exp $
*
@@ -146,6 +147,9 @@ typedef enum ObjectClass
OCLASS_FDW, /* pg_foreign_data_wrapper */
OCLASS_FOREIGN_SERVER, /* pg_foreign_server */
OCLASS_USER_MAPPING, /* pg_user_mapping */
+#ifdef PGXC
+ OCLASS_PGXC_CLASS, /* pgxc_class */
+#endif
MAX_OCLASS /* MUST BE LAST */
} ObjectClass;
diff --git a/src/include/catalog/heap.h b/src/include/catalog/heap.h
index 2d6eb3c34a..baa9ecaf49 100644
--- a/src/include/catalog/heap.h
+++ b/src/include/catalog/heap.h
@@ -6,6 +6,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* $PostgreSQL: pgsql/src/include/catalog/heap.h,v 1.91 2009/06/11 14:49:09 momjian Exp $
*
@@ -107,4 +108,11 @@ extern void CheckAttributeNamesTypes(TupleDesc tupdesc, char relkind);
extern void CheckAttributeType(const char *attname, Oid atttypid);
+#ifdef PGXC
+extern void AddRelationDistribution (Oid relid,
+ DistributeBy *distributeby,
+ List *parentOids,
+ TupleDesc descriptor);
+#endif
+
#endif /* HEAP_H */
diff --git a/src/include/catalog/indexing.h b/src/include/catalog/indexing.h
index ce117a8eec..5557021e30 100644
--- a/src/include/catalog/indexing.h
+++ b/src/include/catalog/indexing.h
@@ -7,6 +7,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* $PostgreSQL: pgsql/src/include/catalog/indexing.h,v 1.108 2009/06/11 14:49:09 momjian Exp $
*
@@ -267,6 +268,11 @@ DECLARE_UNIQUE_INDEX(pg_user_mapping_oid_index, 174, on pg_user_mapping using bt
DECLARE_UNIQUE_INDEX(pg_user_mapping_user_server_index, 175, on pg_user_mapping using btree(umuser oid_ops, umserver oid_ops));
#define UserMappingUserServerIndexId 175
+#ifdef PGXC
+DECLARE_UNIQUE_INDEX(pgxc_class_pcrelid_index, 9002, on pgxc_class using btree(pcrelid oid_ops));
+#define PgxcClassPgxcRelIdIndexId 9002
+#endif
+
/* last step of initialization script: build the indexes declared above */
BUILD_INDICES
diff --git a/src/include/catalog/pgxc_class.h b/src/include/catalog/pgxc_class.h
new file mode 100644
index 0000000000..2104e53e42
--- /dev/null
+++ b/src/include/catalog/pgxc_class.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2004-2007 EnterpriseDB Corporation. All Rights Reserved.
+ */
+#ifndef PGXC_CLASS_H
+#define PGXC_CLASS_H
+
+#include "nodes/parsenodes.h"
+
+#define PgxcClassRelationId 9001
+
+CATALOG(pgxc_class,9001) BKI_WITHOUT_OIDS
+{
+ Oid pcrelid;
+ char pclocatortype;
+ int2 pcattnum;
+ int2 pchashalgorithm;
+ int2 pchashbuckets;
+} FormData_pgxc_class;
+
+typedef FormData_pgxc_class *Form_pgxc_class;
+
+#define Natts_pgxc_class 5
+
+#define Anum_pgxc_class_pcrelid 1
+#define Anum_pgxc_class_pclocatortype 2
+#define Anum_pgxc_class_pcattnum 3
+#define Anum_pgxc_class_pchashalgorithm 4
+#define Anum_pgxc_class_pchashbuckets 5
+
+extern void PgxcClassCreate(Oid pcrelid,
+ char pclocatortype,
+ int pcattnum,
+ int pchashalgorithm,
+ int pchashbuckets);
+
+extern void RemovePgxcClass(Oid pcrelid);
+
+#endif /* PGXC_CLASS_H */
+
diff --git a/src/include/gtm/assert.h b/src/include/gtm/assert.h
new file mode 100644
index 0000000000..5c71363832
--- /dev/null
+++ b/src/include/gtm/assert.h
@@ -0,0 +1,72 @@
+/*-------------------------------------------------------------------------
+ *
+ * assert.h
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GTM_ASSERT_H
+#define GTM_ASSERT_H
+
+extern bool assert_enabled;
+
+/*
+ * USE_ASSERT_CHECKING, if defined, turns on all the assertions.
+ * - plai 9/5/90
+ *
+ * It should _NOT_ be defined in releases or in benchmark copies
+ */
+
+/*
+ * Trap
+ * Generates an exception if the given condition is true.
+ */
+#define Trap(condition, errorType) \
+ do { \
+ if ((assert_enabled) && (condition)) \
+ ExceptionalCondition(CppAsString(condition), (errorType), \
+ __FILE__, __LINE__); \
+ } while (0)
+
+/*
+ * TrapMacro is the same as Trap but it's intended for use in macros:
+ *
+ * #define foo(x) (AssertMacro(x != 0), bar(x))
+ *
+ * Isn't CPP fun?
+ */
+#define TrapMacro(condition, errorType) \
+ ((bool) ((! assert_enabled) || ! (condition) || \
+ (ExceptionalCondition(CppAsString(condition), (errorType), \
+ __FILE__, __LINE__))))
+
+#ifndef USE_ASSERT_CHECKING
+#define Assert(condition)
+#define AssertMacro(condition) ((void)true)
+#define AssertArg(condition)
+#define AssertState(condition)
+#else
+#define Assert(condition) \
+ Trap(!(condition), "FailedAssertion")
+
+#define AssertMacro(condition) \
+ ((void) TrapMacro(!(condition), "FailedAssertion"))
+
+#define AssertArg(condition) \
+ Trap(!(condition), "BadArgument")
+
+#define AssertState(condition) \
+ Trap(!(condition), "BadState")
+#endif /* USE_ASSERT_CHECKING */
+
+extern int ExceptionalCondition(const char *conditionName,
+ const char *errorType,
+ const char *fileName, int lineNumber);
+
+#endif
diff --git a/src/include/gtm/elog.h b/src/include/gtm/elog.h
new file mode 100644
index 0000000000..49c463fa3e
--- /dev/null
+++ b/src/include/gtm/elog.h
@@ -0,0 +1,253 @@
+/*-------------------------------------------------------------------------
+ *
+ * elog.h
+ * POSTGRES error reporting/logging definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/utils/elog.h,v 1.98 2009/01/01 17:24:02 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef ELOG_H
+#define ELOG_H
+
+/* Error level codes */
+#define DEBUG5 10 /* Debugging messages, in categories of
+ * decreasing detail. */
+#define DEBUG4 11
+#define DEBUG3 12
+#define DEBUG2 13
+#define DEBUG1 14 /* used by GUC debug_* variables */
+#define LOG 15 /* Server operational messages; sent only to
+ * server log by default. */
+#define COMMERROR 16 /* Client communication problems; same as LOG
+ * for server reporting, but never sent to
+ * client. */
+#define INFO 17 /* Messages specifically requested by user
+ * (eg VACUUM VERBOSE output); always sent to
+ * client regardless of client_min_messages,
+ * but by default not sent to server log. */
+#define NOTICE 18 /* Helpful messages to users about query
+ * operation; sent to client and server log
+ * by default. */
+#define WARNING 19 /* Warnings. NOTICE is for expected messages
+ * like implicit sequence creation by SERIAL.
+ * WARNING is for unexpected messages. */
+#define ERROR 20 /* user error - abort transaction; return to
+ * known state */
+#define ERROR2 21 /* user error - only send error message to the
+ * client */
+#define FATAL 22 /* fatal error - abort process */
+#define PANIC 23 /* take down the other backends with me */
+
+ /* #define DEBUG DEBUG1 */ /* Backward compatibility with pre-7.3 */
+
+
+/* Which __func__ symbol do we have, if any? */
+#ifdef HAVE_FUNCNAME__FUNC
+#define PG_FUNCNAME_MACRO __func__
+#else
+#ifdef HAVE_FUNCNAME__FUNCTION
+#define PG_FUNCNAME_MACRO __FUNCTION__
+#else
+#define PG_FUNCNAME_MACRO NULL
+#endif
+#endif
+
+/*
+ * ErrorData holds the data accumulated during any one ereport() cycle.
+ * Any non-NULL pointers must point to palloc'd data.
+ * (The const pointers are an exception; we assume they point at non-freeable
+ * constant strings.)
+ */
+typedef struct ErrorData
+{
+ int elevel; /* error level */
+ bool output_to_server; /* will report to server log? */
+ bool output_to_client; /* will report to client? */
+ bool show_funcname; /* true to force funcname inclusion */
+ const char *filename; /* __FILE__ of ereport() call */
+ int lineno; /* __LINE__ of ereport() call */
+ const char *funcname; /* __func__ of ereport() call */
+ const char *domain; /* message domain */
+ char *message; /* primary error message */
+ char *detail; /* detail error message */
+ char *detail_log; /* detail error message for server log only */
+ char *hint; /* hint message */
+ char *context; /* context message */
+ int saved_errno; /* errno at entry */
+} ErrorData;
+
+
+/*----------
+ * New-style error reporting API: to be used in this way:
+ * ereport(ERROR,
+ * (errcode(ERRCODE_UNDEFINED_CURSOR),
+ * errmsg("portal \"%s\" not found", stmt->portalname),
+ * ... other errxxx() fields as needed ...));
+ *
+ * The error level is required, and so is a primary error message (errmsg
+ * or errmsg_internal). All else is optional. errcode() defaults to
+ * ERRCODE_INTERNAL_ERROR if elevel is ERROR or more, ERRCODE_WARNING
+ * if elevel is WARNING, or ERRCODE_SUCCESSFUL_COMPLETION if elevel is
+ * NOTICE or below.
+ *
+ * ereport_domain() allows a message domain to be specified, for modules that
+ * wish to use a different message catalog from the default one. Here
+ * TEXTDOMAIN is defined as "GTM", and ereport() passes it to errstart as the
+ * message domain. Modules can either use
+ * ereport_domain() directly, or preferably they can override the TEXTDOMAIN
+ * macro.
+ *----------
+ */
+#define TEXTDOMAIN "GTM"
+
+#define ereport_domain(elevel, domain, rest) \
+ (errstart(elevel, __FILE__, __LINE__, PG_FUNCNAME_MACRO, domain) ? \
+ (errfinish rest) : (void) 0)
+
+#define ereport(level, rest) \
+ ereport_domain(level, TEXTDOMAIN, rest)
+
+
+#define PG_RE_THROW() pg_re_throw()
+
+extern bool errstart(int elevel, const char *filename, int lineno,
+ const char *funcname, const char *domain);
+extern void errfinish(int dummy,...);
+
+extern int
+errmsg(const char *fmt,...)
+/* This extension allows gcc to check the format string for consistency with
+ the supplied arguments. */
+__attribute__((format(printf, 1, 2)));
+
+extern int
+errmsg_internal(const char *fmt,...)
+/* This extension allows gcc to check the format string for consistency with
+ the supplied arguments. */
+__attribute__((format(printf, 1, 2)));
+
+extern int
+errdetail(const char *fmt,...)
+/* This extension allows gcc to check the format string for consistency with
+ the supplied arguments. */
+__attribute__((format(printf, 1, 2)));
+
+extern int
+errdetail_log(const char *fmt,...)
+/* This extension allows gcc to check the format string for consistency with
+ the supplied arguments. */
+__attribute__((format(printf, 1, 2)));
+
+extern int
+errhint(const char *fmt,...)
+/* This extension allows gcc to check the format string for consistency with
+ the supplied arguments. */
+__attribute__((format(printf, 1, 2)));
+
+/*----------
+ * Old-style error reporting API: to be used in this way:
+ * elog(ERROR, "portal \"%s\" not found", stmt->portalname);
+ *----------
+ */
+#define elog elog_start(__FILE__, __LINE__, PG_FUNCNAME_MACRO), elog_finish
+
+extern void elog_start(const char *filename, int lineno, const char *funcname);
+extern void
+elog_finish(int elevel, const char *fmt,...)
+/* This extension allows gcc to check the format string for consistency with
+ the supplied arguments. */
+__attribute__((format(printf, 2, 3)));
+
+/*----------
+ * API for catching ereport(ERROR) exits. Use these macros like so:
+ *
+ * PG_TRY();
+ * {
+ * ... code that might throw ereport(ERROR) ...
+ * }
+ * PG_CATCH();
+ * {
+ * ... error recovery code ...
+ * }
+ * PG_END_TRY();
+ *
+ * (The braces are not actually necessary, but are recommended so that
+ * pg_indent will indent the construct nicely.) The error recovery code
+ * can optionally do PG_RE_THROW() to propagate the same error outwards.
+ *
+ * Note: while the system will correctly propagate any new ereport(ERROR)
+ * occurring in the recovery section, there is a small limit on the number
+ * of levels this will work for. It's best to keep the error recovery
+ * section simple enough that it can't generate any new errors, at least
+ * not before popping the error stack.
+ *
+ * Note: an ereport(FATAL) will not be caught by this construct; control will
+ * exit straight through proc_exit(). Therefore, do NOT put any cleanup
+ * of non-process-local resources into the error recovery section, at least
+ * not without taking thought for what will happen during ereport(FATAL).
+ * The PG_ENSURE_ERROR_CLEANUP macros provided by storage/ipc.h may be
+ * helpful in such cases.
+ *----------
+ */
+#define PG_TRY() \
+ do { \
+ sigjmp_buf *save_exception_stack = PG_exception_stack; \
+ sigjmp_buf local_sigjmp_buf; \
+ if (sigsetjmp(local_sigjmp_buf, 0) == 0) \
+ { \
+ PG_exception_stack = &local_sigjmp_buf
+
+#define PG_CATCH() \
+ } \
+ else \
+ { \
+ PG_exception_stack = save_exception_stack; \
+
+#define PG_END_TRY() \
+ } \
+ PG_exception_stack = save_exception_stack; \
+ } while (0)
+
+int errfunction(const char *funcname);
+
+extern void EmitErrorReport(void *port);
+
+/* GUC-configurable parameters */
+
+typedef enum
+{
+ PGERROR_TERSE, /* single-line error messages */
+ PGERROR_DEFAULT, /* recommended style */
+ PGERROR_VERBOSE /* all the facts, ma'am */
+} PGErrorVerbosity;
+
+/* Log destination bitmap */
+#define LOG_DESTINATION_STDERR 1
+#define LOG_DESTINATION_SYSLOG 2
+#define LOG_DESTINATION_EVENTLOG 4
+#define LOG_DESTINATION_CSVLOG 8
+
+/* Other exported functions */
+extern void pg_re_throw(void);
+extern void DebugFileOpen(void);
+extern void FlushErrorState(void);
+
+
+/*
+ * Write errors to stderr (or by equal means when stderr is
+ * not available). Used before ereport/elog can be used
+ * safely (memory context, GUC load etc)
+ */
+extern void
+write_stderr(const char *fmt,...)
+/* This extension allows gcc to check the format string for consistency with
+ the supplied arguments. */
+__attribute__((format(printf, 1, 2)));
+
+#endif /* ELOG_H */
diff --git a/src/include/gtm/gtm.h b/src/include/gtm/gtm.h
new file mode 100644
index 0000000000..37e23a7ffa
--- /dev/null
+++ b/src/include/gtm/gtm.h
@@ -0,0 +1,140 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm.h
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef _GTM_H
+#define _GTM_H
+
+#include <setjmp.h>
+
+#include "gtm/gtm_c.h"
+#include "gtm/palloc.h"
+#include "gtm/gtm_lock.h"
+#include "gtm/gtm_conn.h"
+#include "gtm/elog.h"
+#include "gtm/gtm_list.h"
+
+extern char *GTMLogFile;
+
+typedef enum GTM_ThreadStatus
+{
+ GTM_THREAD_STARTING,
+ GTM_THREAD_RUNNING,
+ GTM_THREAD_EXITING,
+ /* Must be the last */
+ GTM_THREAD_INVALID
+} GTM_ThreadStatus;
+
+struct GTM_ConnectionInfo;
+
+#define ERRORDATA_STACK_SIZE 5
+
+typedef struct GTM_ThreadInfo
+{
+ /*
+ * Per-thread state. A thread reaches its own instance through GetMyThreadInfo (pthread_getspecific on threadinfo_key); the macros named in the field comments below are the aliases defined later in this header.
+ */
+ GTM_ThreadID thr_id; /* MyThreadID */
+ uint32 thr_localid;
+ void * (* thr_startroutine)(void *);
+
+ MemoryContext thr_thread_context; /* TopMemoryContext and ThreadTopContext */
+ MemoryContext thr_message_context; /* MessageContext */
+ MemoryContext thr_current_context; /* CurrentMemoryContext */
+ MemoryContext thr_error_context; /* ErrorContext */
+ MemoryContext thr_parent_context;
+
+ sigjmp_buf *thr_sigjmp_buf; /* PG_exception_stack; saved and restored by PG_TRY/PG_CATCH/PG_END_TRY */
+
+ ErrorData thr_error_data[ERRORDATA_STACK_SIZE]; /* errordata */
+ int thr_error_stack_depth; /* errordata_stack_depth */
+ int thr_error_recursion_depth; /* recursion_depth */
+ int thr_criticalsec_count; /* CritSectionCount; changed by START_CRIT_SECTION/END_CRIT_SECTION */
+
+ GTM_ThreadStatus thr_status;
+ GTM_ConnectionInfo *thr_conn; /* MyConnection; MyPort yields its con_port, or NULL when this is NULL */
+
+ GTM_RWLock thr_lock;
+ List *thr_cached_txninfo; /* GTM_CachedTransInfo */
+
+} GTM_ThreadInfo;
+
+typedef struct GTM_Threads
+{
+ uint32 gt_thread_count;
+ uint32 gt_array_size;
+ GTM_ThreadInfo **gt_threads;
+ GTM_RWLock gt_lock;
+} GTM_Threads;
+
+extern GTM_Threads *GTMThreads;
+
+int GTM_ThreadAdd(GTM_ThreadInfo *thrinfo);
+int GTM_ThreadRemove(GTM_ThreadInfo *thrinfo);
+int GTM_ThreadJoin(GTM_ThreadInfo *thrinfo);
+void GTM_ThreadExit(void);
+void ConnFree(Port *port);
+
+GTM_ThreadInfo *GTM_ThreadCreate(GTM_ConnectionInfo *conninfo,
+ void *(* startroutine)(void *));
+GTM_ThreadInfo * GTM_GetThreadInfo(GTM_ThreadID thrid);
+
+/*
+ * pthread keys to get thread specific information
+ */
+extern pthread_key_t threadinfo_key;
+extern MemoryContext TopMostMemoryContext;
+
+#define SetMyThreadInfo(thrinfo) pthread_setspecific(threadinfo_key, (thrinfo))
+#define GetMyThreadInfo ((GTM_ThreadInfo *)pthread_getspecific(threadinfo_key))
+
+#define TopMemoryContext (GetMyThreadInfo->thr_thread_context)
+#define ThreadTopContext (GetMyThreadInfo->thr_thread_context)
+#define MessageContext (GetMyThreadInfo->thr_message_context)
+#define CurrentMemoryContext (GetMyThreadInfo->thr_current_context)
+#define ErrorContext (GetMyThreadInfo->thr_error_context)
+#define errordata (GetMyThreadInfo->thr_error_data)
+#define recursion_depth (GetMyThreadInfo->thr_error_recursion_depth)
+#define errordata_stack_depth (GetMyThreadInfo->thr_error_stack_depth)
+#define CritSectionCount (GetMyThreadInfo->thr_criticalsec_count)
+
+#define PG_exception_stack (GetMyThreadInfo->thr_sigjmp_buf)
+#define MyConnection (GetMyThreadInfo->thr_conn)
+#define MyPort ((GetMyThreadInfo->thr_conn != NULL) ? \
+ GetMyThreadInfo->thr_conn->con_port : \
+ NULL)
+#define MyThreadID (GetMyThreadInfo->thr_id)
+
+#define GTM_CachedTransInfo (GetMyThreadInfo->thr_cached_txninfo)
+#define GTM_HaveFreeCachedTransInfo() (list_length(GTM_CachedTransInfo))
+
+#define GTM_MAX_CACHED_TRANSINFO 0
+#define GTM_HaveEnoughCachedTransInfo() (list_length(GTM_CachedTransInfo) >= GTM_MAX_CACHED_TRANSINFO)
+
+#define START_CRIT_SECTION() (CritSectionCount++)
+
+#define END_CRIT_SECTION() \
+ do { \
+ Assert(CritSectionCount > 0); \
+ CritSectionCount--; \
+ } while(0)
+
+
+#if 0
+
+/* Coordinator registration */
+int GTM_RegisterCoordinator(GTM_CoordInfo *cinfo);
+int GTM_UnregisterCoordinator(GTM_CoordinatorId cid);
+
+#endif
+
+#endif
diff --git a/src/include/gtm/gtm_c.h b/src/include/gtm/gtm_c.h
new file mode 100644
index 0000000000..1a04064b6d
--- /dev/null
+++ b/src/include/gtm/gtm_c.h
@@ -0,0 +1,101 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_c.h
+ * Fundamental GTM definitions: includes c.h and adds the basic GTM types
+ * (global transaction ids, sequences, snapshots, the startup packet).
+ *
+ * Note that the definitions here are not intended to be exposed to clients
+ * of the frontend interface libraries --- so we don't worry much about
+ * polluting the namespace with lots of stuff...
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/c.h,v 1.234 2009/01/01 17:23:55 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GTM_C_H
+#define GTM_C_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stddef.h>
+#include <stdarg.h>
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#include <sys/types.h>
+
+#include <errno.h>
+#include <pthread.h>
+#include "c.h"
+
+typedef uint32 GlobalTransactionId; /* 32-bit global transaction ids */
+typedef uint32 PGXC_NodeId;
+typedef uint32 GTM_CoordinatorId;
+typedef int16 GTMProxy_ConnID;
+
+#define InvalidGTMProxyConnID -1
+
+typedef pthread_t GTM_ThreadID;
+
+/*
+ * A unique handle to identify transaction at the GTM. It could just be
+ * an index in an array or a pointer to the structure
+ *
+ * Note: If we get rid of BEGIN transaction at the GTM, we can use GXID
+ * as a handle because we would never have a transaction state at the
+ * GTM without assigned GXID.
+ */
+typedef int32 GTM_TransactionHandle;
+
+#define InvalidTransactionHandle -1
+
+typedef int64 GTM_Sequence; /* a 64-bit sequence */
+typedef struct GTM_SequenceKeyData
+{
+ uint32 gsk_keylen;
+ char *gsk_key;
+} GTM_SequenceKeyData; /* Counter key, set by the client */
+
+typedef GTM_SequenceKeyData *GTM_SequenceKey;
+#define GTM_MAX_SEQKEY_LENGTH 1024
+
+#define InvalidSequenceValue 0x7fffffffffffffffLL
+#define SEQVAL_IS_VALID(v) ((v) != InvalidSequenceValue)
+
+#define GTM_MAX_GLOBAL_TRANSACTIONS 4096
+
+typedef enum GTM_IsolationLevel
+{
+ GTM_ISOLATION_SERIALIZABLE, /* serializable txn */
+ GTM_ISOLATION_RC /* read-committed txn */
+} GTM_IsolationLevel;
+
+typedef struct GTM_SnapshotData
+{
+ GlobalTransactionId sn_xmin;
+ GlobalTransactionId sn_xmax;
+ GlobalTransactionId sn_recent_global_xmin;
+ uint32 sn_xcnt;
+ GlobalTransactionId *sn_xip;
+} GTM_SnapshotData;
+
+typedef GTM_SnapshotData *GTM_Snapshot;
+
+typedef struct GTM_StartupPacket {
+ GTM_CoordinatorId sp_cid;
+ bool sp_isproxy;
+} GTM_StartupPacket;
+
+#define InvalidGlobalTransactionId ((GlobalTransactionId) 0)
+/* True iff gxid is not InvalidGlobalTransactionId; the expansion is fully parenthesized so it stays a single operand inside any enclosing expression. */
+#define GlobalTransactionIdIsValid(gxid) (((GlobalTransactionId) (gxid)) != InvalidGlobalTransactionId)
+
+#define _(x) gettext(x)
+
+#endif /* GTM_C_H */
diff --git a/src/include/gtm/gtm_client.h b/src/include/gtm/gtm_client.h
new file mode 100644
index 0000000000..29eeaf95f9
--- /dev/null
+++ b/src/include/gtm/gtm_client.h
@@ -0,0 +1,129 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_client.h
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GTM_CLIENT_H
+#define GTM_CLIENT_H
+
+#include "gtm/gtm_c.h"
+#include "gtm/gtm_msg.h"
+#include "gtm/libpq-fe.h"
+
+typedef union GTM_ResultData
+{
+ GTM_TransactionHandle grd_txnhandle; /* TXN_BEGIN */
+ GlobalTransactionId grd_gxid; /* TXN_BEGIN_GETGXID
+ * TXN_PREPARE
+ * TXN_COMMIT
+ * TXN_ROLLBACK
+ */
+
+ struct
+ {
+ GTM_TransactionHandle txnhandle;
+ GlobalTransactionId gxid;
+ } grd_txn; /* TXN_GET_GXID
+ * SNAPSHOT_GET
+ * SNAPSHOT_GXID_GET */
+
+ GTM_SequenceKeyData grd_seqkey; /* SEQUENCE_INIT
+ * SEQUENCE_RESET
+ * SEQUENCE_CLOSE */
+ struct
+ {
+ GTM_SequenceKeyData seqkey;
+ GTM_Sequence seqval;
+ } grd_seq; /* SEQUENCE_GET_CURRENT
+ SEQUENCE_GET_NEXT */
+
+ struct
+ {
+ int txn_count; /* TXN_BEGIN_GETGXID_MULTI */
+ GlobalTransactionId start_gxid;
+ } grd_txn_get_multi;
+
+ struct
+ {
+ int txn_count; /* TXN_COMMIT_MULTI */
+ int status[GTM_MAX_GLOBAL_TRANSACTIONS];
+ } grd_txn_rc_multi;
+
+ struct
+ {
+ int txn_count; /* GET_SNAPSHOT_MULTI */
+ int status[GTM_MAX_GLOBAL_TRANSACTIONS];
+ } grd_txn_snap_multi;
+
+ /*
+ * TODO
+ * TXN_GET_STATUS
+ * TXN_GET_ALL_PREPARED
+ */
+} GTM_ResultData;
+
+typedef struct GTM_Result
+{
+ GTM_ResultType gr_type;
+ int gr_msglen;
+ int gr_status;
+ GTM_ProxyMsgHeader gr_proxyhdr;
+ GTM_ResultData gr_resdata;
+ /*
+ * We keep these two items outside the union to avoid repeated malloc/free
+ * of the xip array. If these items are pushed inside the union, they may
+ * get overwritten by other members in the union
+ */
+ int gr_xip_size;
+ GTM_SnapshotData gr_snapshot;
+
+ /*
+ * Similarly, keep the buffer for proxying data outside the union
+ */
+ char *gr_proxy_data;
+ int gr_proxy_datalen;
+} GTM_Result;
+
+/*
+ * Connection Management API
+ */
+GTM_Conn *connect_gtm(const char *connect_string);
+void disconnect_gtm(GTM_Conn *conn);
+
+/*
+ * Transaction Management API
+ */
+GlobalTransactionId begin_transaction(GTM_Conn *conn, GTM_IsolationLevel isolevel);
+GlobalTransactionId begin_transaction_autovacuum(GTM_Conn *conn, GTM_IsolationLevel isolevel);
+int commit_transaction(GTM_Conn *conn, GlobalTransactionId gxid);
+int abort_transaction(GTM_Conn *conn, GlobalTransactionId gxid);
+int prepare_transaction(GTM_Conn *conn, GlobalTransactionId gxid,
+ int nodecnt, PGXC_NodeId nodes[]);
+
+/*
+ * Snapshot Management API
+ */
+GTM_SnapshotData *get_snapshot(GTM_Conn *conn, GlobalTransactionId gxid,
+ bool canbe_grouped);
+
+/*
+ * Sequence Management API
+ */
+int open_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence increment,
+ GTM_Sequence minval, GTM_Sequence maxval,
+ GTM_Sequence startval, bool cycle);
+int close_sequence(GTM_Conn *conn, GTM_SequenceKey key);
+GTM_Sequence get_current(GTM_Conn *conn, GTM_SequenceKey key);
+GTM_Sequence get_next(GTM_Conn *conn, GTM_SequenceKey key);
+int reset_sequence(GTM_Conn *conn, GTM_SequenceKey key);
+
+
+#endif
diff --git a/src/include/gtm/gtm_conn.h b/src/include/gtm/gtm_conn.h
new file mode 100644
index 0000000000..911a345c4f
--- /dev/null
+++ b/src/include/gtm/gtm_conn.h
@@ -0,0 +1,38 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_conn.h
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GTM_CONN_H
+#define GTM_CONN_H
+
+#include "gtm/libpq-be.h"
+
+struct GTM_ThreadInfo;
+
+typedef struct GTM_ConnectionInfo
+{
+ /* Port contains all the vital information about this connection */
+ Port *con_port;
+ struct GTM_ThreadInfo *con_thrinfo;
+ bool con_authenticated;
+} GTM_ConnectionInfo;
+
+typedef struct GTM_Connections
+{
+ uint32 gc_conn_count;
+ uint32 gc_array_size;
+ GTM_ConnectionInfo *gc_connections;
+ GTM_RWLock gc_lock;
+} GTM_Connections;
+
+
+#endif
diff --git a/src/include/gtm/gtm_ext.h b/src/include/gtm/gtm_ext.h
new file mode 100644
index 0000000000..b492941779
--- /dev/null
+++ b/src/include/gtm/gtm_ext.h
@@ -0,0 +1,31 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_ext.h
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GTM_EXT_H
+#define GTM_EXT_H
+
+/*
+ * Identifiers of error message fields. Kept here to keep common
+ * between frontend and backend, and also to export them to libpq
+ * applications.
+ */
+#define PG_DIAG_SEVERITY 'S'
+#define PG_DIAG_MESSAGE_PRIMARY 'M'
+#define PG_DIAG_MESSAGE_DETAIL 'D'
+#define PG_DIAG_MESSAGE_HINT 'H'
+#define PG_DIAG_SOURCE_FILE 'F'
+#define PG_DIAG_SOURCE_LINE 'L'
+#define PG_DIAG_SOURCE_FUNCTION 'R'
+
+
+#endif
diff --git a/src/include/gtm/gtm_ip.h b/src/include/gtm/gtm_ip.h
new file mode 100644
index 0000000000..30da3081d3
--- /dev/null
+++ b/src/include/gtm/gtm_ip.h
@@ -0,0 +1,50 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_ip.h
+ * Definitions for IPv6-aware network access.
+ *
+ * These definitions are used by both frontend and backend code. Be careful
+ * what you include here!
+ *
+ * Copyright (c) 2003-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/libpq/ip.h,v 1.20 2008/01/01 19:45:58 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GTM_IP_H
+#define GTM_IP_H
+/* Guard is GTM_IP_H rather than IP_H: gtm/ip.h uses IP_H, and sharing it would hide one header from the other. */
+#include "gtm/pqcomm.h"
+
+
+extern int gtm_getaddrinfo_all(const char *hostname, const char *servname,
+ const struct addrinfo * hintp,
+ struct addrinfo ** result);
+extern void gtm_freeaddrinfo_all(int hint_ai_family, struct addrinfo * ai);
+
+extern int gtm_getnameinfo_all(const struct sockaddr_storage * addr, int salen,
+ char *node, int nodelen,
+ char *service, int servicelen,
+ int flags);
+
+extern int gtm_range_sockaddr(const struct sockaddr_storage * addr,
+ const struct sockaddr_storage * netaddr,
+ const struct sockaddr_storage * netmask);
+
+extern int gtm_sockaddr_cidr_mask(struct sockaddr_storage * mask,
+ char *numbits, int family);
+
+#ifdef HAVE_IPV6
+extern void gtm_promote_v4_to_v6_addr(struct sockaddr_storage * addr);
+extern void gtm_promote_v4_to_v6_mask(struct sockaddr_storage * addr);
+#endif
+
+#ifdef HAVE_UNIX_SOCKETS
+#define IS_AF_UNIX(fam) ((fam) == AF_UNIX)
+#else
+#define IS_AF_UNIX(fam) (0)
+#endif
+
+#endif /* GTM_IP_H */
diff --git a/src/include/gtm/gtm_list.h b/src/include/gtm/gtm_list.h
new file mode 100644
index 0000000000..6a5727f36a
--- /dev/null
+++ b/src/include/gtm/gtm_list.h
@@ -0,0 +1,280 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_list.h
+ * interface for PostgreSQL generic linked list package
+ *
+ * This package implements singly-linked homogeneous lists.
+ *
+ * It is important to have constant-time length, append, and prepend
+ * operations. To achieve this, we deal with two distinct data
+ * structures:
+ *
+ * 1. A set of "list cells": each cell contains a data field and
+ * a link to the next cell in the list or NULL.
+ * 2. A single structure containing metadata about the list:
+ * pointers to the head and tail cells, and the length of
+ * the list.
+ *
+ * We support two kinds of lists:
+ *
+ * lists of pointers
+ * (declared as "void *" to minimize casting annoyances)
+ * lists of integers
+ *
+ * Unlike the PostgreSQL original (pg_list.h), the List header here
+ * carries no type tag and there is no Oid variant, so callers must
+ * keep track themselves of which kind of list they are holding and
+ * use the matching pointer or "_int" accessors.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/nodes/pg_list.h,v 1.59 2008/08/14 18:48:00 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GTM_LIST_H
+#define GTM_LIST_H
+
+
+typedef struct ListCell ListCell;
+
+typedef struct List
+{
+ int length;
+ ListCell *head;
+ ListCell *tail;
+} List;
+
+struct ListCell
+{
+ union
+ {
+ void *ptr_value;
+ int int_value;
+ } data;
+ ListCell *next;
+};
+
+/*
+ * The *only* valid representation of an empty list is NIL; in other
+ * words, a non-NIL list is guaranteed to have length >= 1 and
+ * head/tail != NULL
+ */
+#define NIL ((List *) NULL)
+
+/*
+ * These routines are used frequently. However, we can't implement
+ * them as macros, since we want to avoid double-evaluation of macro
+ * arguments. Therefore, we implement them using GCC inline functions,
+ * and as regular functions with non-GCC compilers.
+ */
+#ifdef __GNUC__
+
+static __inline__ ListCell *
+list_head(List *l)
+{
+ return l ? l->head : NULL;
+}
+
+static __inline__ ListCell *
+list_tail(List *l)
+{
+ return l ? l->tail : NULL;
+}
+
+static __inline__ int
+list_length(List *l)
+{
+ return l ? l->length : 0;
+}
+#else
+
+extern ListCell *list_head(List *l);
+extern ListCell *list_tail(List *l);
+extern int list_length(List *l);
+#endif /* __GNUC__ */
+
+/*
+ * NB: There is an unfortunate legacy from a previous incarnation of
+ * the List API: the macro lfirst() was used to mean "the data in this
+ * cons cell". To avoid changing every usage of lfirst(), that meaning
+ * has been kept. As a result, lfirst() takes a ListCell and returns
+ * the data it contains; to get the data in the first cell of a
+ * List, use linitial(). Worse, lsecond() is more closely related to
+ * linitial() than lfirst(): given a List, lsecond() returns the data
+ * in the second cons cell.
+ */
+
+#define lnext(lc) ((lc)->next)
+#define lfirst(lc) ((lc)->data.ptr_value)
+#define lfirst_int(lc) ((lc)->data.int_value)
+
+#define linitial(l) lfirst(list_head(l))
+#define linitial_int(l) lfirst_int(list_head(l))
+
+#define lsecond(l) lfirst(lnext(list_head(l)))
+#define lsecond_int(l) lfirst_int(lnext(list_head(l)))
+
+#define lthird(l) lfirst(lnext(lnext(list_head(l))))
+#define lthird_int(l) lfirst_int(lnext(lnext(list_head(l))))
+
+#define lfourth(l) lfirst(lnext(lnext(lnext(list_head(l)))))
+#define lfourth_int(l) lfirst_int(lnext(lnext(lnext(list_head(l)))))
+
+#define llast(l) lfirst(list_tail(l))
+#define llast_int(l) lfirst_int(list_tail(l))
+
+/*
+ * Convenience macros for building fixed-length lists
+ */
+#define list_make1(x1) lcons(x1, NIL)
+#define list_make2(x1,x2) lcons(x1, list_make1(x2))
+#define list_make3(x1,x2,x3) lcons(x1, list_make2(x2, x3))
+#define list_make4(x1,x2,x3,x4) lcons(x1, list_make3(x2, x3, x4))
+
+#define list_make1_int(x1) lcons_int(x1, NIL)
+#define list_make2_int(x1,x2) lcons_int(x1, list_make1_int(x2))
+#define list_make3_int(x1,x2,x3) lcons_int(x1, list_make2_int(x2, x3))
+#define list_make4_int(x1,x2,x3,x4) lcons_int(x1, list_make3_int(x2, x3, x4))
+
+/*
+ * foreach -
+ * a convenience macro which loops through the list
+ */
+#define foreach(cell, l) \
+ for ((cell) = list_head(l); (cell) != NULL; (cell) = lnext(cell))
+
+/*
+ * for_each_cell -
+ * a convenience macro which loops through a list starting from a
+ * specified cell
+ */
+#define for_each_cell(cell, initcell) \
+ for ((cell) = (initcell); (cell) != NULL; (cell) = lnext(cell))
+
+/*
+ * forboth -
+ * a convenience macro for advancing through two linked lists
+ * simultaneously. This macro loops through both lists at the same
+ * time, stopping when either list runs out of elements. Depending
+ * on the requirements of the call site, it may also be wise to
+ * assert that the lengths of the two lists are equal.
+ */
+#define forboth(cell1, list1, cell2, list2) \
+ for ((cell1) = list_head(list1), (cell2) = list_head(list2); \
+ (cell1) != NULL && (cell2) != NULL; \
+ (cell1) = lnext(cell1), (cell2) = lnext(cell2))
+
+extern List *lappend(List *list, void *datum);
+extern List *lappend_int(List *list, int datum);
+
+extern ListCell *lappend_cell(List *list, ListCell *prev, void *datum);
+extern ListCell *lappend_cell_int(List *list, ListCell *prev, int datum);
+
+extern List *lcons(void *datum, List *list);
+extern List *lcons_int(int datum, List *list);
+
+extern List *list_concat(List *list1, List *list2);
+extern List *list_truncate(List *list, int new_size);
+
+extern void *list_nth(List *list, int n);
+extern int list_nth_int(List *list, int n);
+
+extern bool list_member(List *list, void *datum);
+extern bool list_member_ptr(List *list, void *datum);
+extern bool list_member_int(List *list, int datum);
+
+extern List *list_delete(List *list, void *datum);
+extern List *list_delete_ptr(List *list, void *datum);
+extern List *list_delete_int(List *list, int datum);
+extern List *list_delete_first(List *list);
+extern List *list_delete_cell(List *list, ListCell *cell, ListCell *prev);
+
+extern List *list_union(List *list1, List *list2);
+extern List *list_union_ptr(List *list1, List *list2);
+extern List *list_union_int(List *list1, List *list2);
+
+extern List *list_intersection(List *list1, List *list2);
+/* currently, there's no need for list_intersection_int etc */
+
+extern List *list_difference(List *list1, List *list2);
+extern List *list_difference_ptr(List *list1, List *list2);
+extern List *list_difference_int(List *list1, List *list2);
+
+extern List *list_append_unique(List *list, void *datum);
+extern List *list_append_unique_ptr(List *list, void *datum);
+extern List *list_append_unique_int(List *list, int datum);
+
+extern List *list_concat_unique(List *list1, List *list2);
+extern List *list_concat_unique_ptr(List *list1, List *list2);
+extern List *list_concat_unique_int(List *list1, List *list2);
+
+extern void list_free(List *list);
+extern void list_free_deep(List *list);
+
+extern List *list_copy(List *list);
+extern List *list_copy_tail(List *list, int nskip);
+
+/*
+ * To ease migration to the new list API, a set of compatibility
+ * macros are provided that reduce the impact of the list API changes
+ * as far as possible. Until client code has been rewritten to use the
+ * new list API, the ENABLE_LIST_COMPAT symbol can be defined before
+ * including this header (gtm_list.h)
+ */
+#ifdef ENABLE_LIST_COMPAT
+
+#define lfirsti(lc) lfirst_int(lc)
+
+#define makeList1(x1) list_make1(x1)
+#define makeList2(x1, x2) list_make2(x1, x2)
+#define makeList3(x1, x2, x3) list_make3(x1, x2, x3)
+#define makeList4(x1, x2, x3, x4) list_make4(x1, x2, x3, x4)
+
+#define makeListi1(x1) list_make1_int(x1)
+#define makeListi2(x1, x2) list_make2_int(x1, x2)
+
+#define lconsi(datum, list) lcons_int(datum, list)
+
+#define lappendi(list, datum) lappend_int(list, datum)
+
+#define nconc(l1, l2) list_concat(l1, l2)
+
+#define nth(n, list) list_nth(list, n)
+
+#define member(datum, list) list_member(list, datum)
+#define ptrMember(datum, list) list_member_ptr(list, datum)
+#define intMember(datum, list) list_member_int(list, datum)
+
+/*
+ * Note that the old lremove() determined equality via pointer
+ * comparison, whereas the new list_delete() uses equal(); in order to
+ * keep the same behavior, we therefore need to map lremove() calls to
+ * list_delete_ptr() rather than list_delete()
+ */
+#define lremove(elem, list) list_delete_ptr(list, elem)
+#define LispRemove(elem, list) list_delete(list, elem)
+#define lremovei(elem, list) list_delete_int(list, elem)
+
+#define ltruncate(n, list) list_truncate(list, n)
+
+#define set_union(l1, l2) list_union(l1, l2)
+#define set_ptrUnion(l1, l2) list_union_ptr(l1, l2)
+
+#define set_difference(l1, l2) list_difference(l1, l2)
+#define set_ptrDifference(l1, l2) list_difference_ptr(l1, l2)
+
+#define equali(l1, l2) equal(l1, l2)
+#define equalo(l1, l2) equal(l1, l2)
+
+#define freeList(list) list_free(list)
+
+#define listCopy(list) list_copy(list)
+
+extern int length(List *list);
+#endif /* ENABLE_LIST_COMPAT */
+
+#endif /* GTM_LIST_H */
diff --git a/src/include/gtm/gtm_lock.h b/src/include/gtm/gtm_lock.h
new file mode 100644
index 0000000000..f4a5e025ba
--- /dev/null
+++ b/src/include/gtm/gtm_lock.h
@@ -0,0 +1,59 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_lock.h
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef GTM_LOCK_H
+#define GTM_LOCK_H
+
+#include <pthread.h>
+
+typedef struct GTM_RWLock
+{
+ pthread_rwlock_t lk_lock;
+} GTM_RWLock;
+
+typedef struct GTM_MutexLock
+{
+ pthread_mutex_t lk_lock;
+} GTM_MutexLock;
+
+typedef enum GTM_LockMode
+{
+ GTM_LOCKMODE_WRITE,
+ GTM_LOCKMODE_READ
+} GTM_LockMode;
+
+typedef struct GTM_CV
+{
+ pthread_cond_t cv_condvar;
+} GTM_CV;
+
+extern bool GTM_RWLockAcquire(GTM_RWLock *lock, GTM_LockMode mode);
+extern bool GTM_RWLockRelease(GTM_RWLock *lock);
+extern int GTM_RWLockInit(GTM_RWLock *lock);
+extern int GTM_RWLockDestroy(GTM_RWLock *lock);
+extern bool GTM_RWLockConditionalAcquire(GTM_RWLock *lock, GTM_LockMode mode);
+
+extern bool GTM_MutexLockAcquire(GTM_MutexLock *lock);
+extern bool GTM_MutexLockRelease(GTM_MutexLock *lock);
+extern int GTM_MutexLockInit(GTM_MutexLock *lock);
+extern int GTM_MutexLockDestroy(GTM_MutexLock *lock);
+extern bool GTM_MutexLockConditionalAcquire(GTM_MutexLock *lock);
+
+extern int GTM_CVInit(GTM_CV *cv);
+extern int GTM_CVDestroy(GTM_CV *cv);
+extern int GTM_CVSignal(GTM_CV *cv);
+extern int GTM_CVBcast(GTM_CV *cv);
+extern int GTM_CVWait(GTM_CV *cv, GTM_MutexLock *lock);
+
+#endif
diff --git a/src/include/gtm/gtm_msg.h b/src/include/gtm/gtm_msg.h
new file mode 100644
index 0000000000..cae061437d
--- /dev/null
+++ b/src/include/gtm/gtm_msg.h
@@ -0,0 +1,88 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_msg.h
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GTM_MSG_H
+#define GTM_MSG_H
+
+typedef enum GTM_MessageType
+{
+ MSG_TYPE_INVALID,
+ MSG_REGISTER_COORD, /* Register a Coordinator with GTM */
+ MSG_UNREGISTER_COORD, /* Unregister a Coordinator with GTM */
+ MSG_TXN_BEGIN, /* Start a new transaction */
+ MSG_TXN_BEGIN_GETGXID, /* Start a new transaction and get GXID */
+ MSG_TXN_BEGIN_GETGXID_MULTI, /* Start multiple new transactions and get GXIDs */
+ MSG_TXN_PREPARE, /* Prepare a transaction for commit */
+ MSG_TXN_COMMIT, /* Commit a running or prepared transaction */
+ MSG_TXN_COMMIT_MULTI, /* Commit multiple running or prepared transactions */
+ MSG_TXN_ROLLBACK, /* Rollback a transaction */
+ MSG_TXN_ROLLBACK_MULTI, /* Rollback multiple transactions */
+ MSG_TXN_GET_GXID, /* Get a GXID for a transaction */
+ MSG_SNAPSHOT_GET, /* Get a global snapshot */
+ MSG_SNAPSHOT_GET_MULTI, /* Get multiple global snapshots */
+ MSG_SNAPSHOT_GXID_GET, /* Get GXID and snapshot together */
+ MSG_SEQUENCE_INIT, /* Initialize a new global sequence */
+ MSG_SEQUENCE_GET_CURRENT,/* Get the current value of sequence */
+ MSG_SEQUENCE_GET_NEXT, /* Get the next sequence value of sequence */
+ MSG_SEQUENCE_RESET, /* Reset the sequence */
+ MSG_SEQUENCE_CLOSE, /* Close a previously inited sequence */
+ MSG_TXN_GET_STATUS, /* Get status of a given transaction */
+ MSG_TXN_GET_ALL_PREPARED, /* Get information about all outstanding
+ * prepared transactions */
+ MSG_TXN_BEGIN_GETGXID_AUTOVACUUM, /* Start a new transaction and get GXID for autovacuum */
+ MSG_DATA_FLUSH, /* flush pending data */
+ MSG_BACKEND_DISCONNECT, /* tell GTM that the backend disconnected from the proxy */
+
+ /*
+ * Must be at the end
+ */
+ MSG_TYPE_COUNT /* A dummy entry just to count the message types */
+} GTM_MessageType;
+
+typedef enum GTM_ResultType
+{
+ TXN_BEGIN_RESULT,
+ TXN_BEGIN_GETGXID_RESULT,
+ TXN_BEGIN_GETGXID_MULTI_RESULT,
+ TXN_PREPARE_RESULT,
+ TXN_COMMIT_RESULT,
+ TXN_COMMIT_MULTI_RESULT,
+ TXN_ROLLBACK_RESULT,
+ TXN_ROLLBACK_MULTI_RESULT,
+ TXN_GET_GXID_RESULT,
+ SNAPSHOT_GET_RESULT,
+ SNAPSHOT_GET_MULTI_RESULT,
+ SNAPSHOT_GXID_GET_RESULT,
+ SEQUENCE_INIT_RESULT,
+ SEQUENCE_GET_CURRENT_RESULT,
+ SEQUENCE_GET_NEXT_RESULT,
+ SEQUENCE_RESET_RESULT,
+ SEQUENCE_CLOSE_RESULT,
+ TXN_GET_STATUS_RESULT,
+ TXN_GET_ALL_PREPARED_RESULT,
+ TXN_BEGIN_GETGXID_AUTOVACUUM_RESULT /* last entry: no trailing comma, which C89 compilers reject */
+} GTM_ResultType;
+
+/*
+ * Special message header for the messages exchanged between the GTM server and
+ * the proxy.
+ *
+ * ph_conid: connection identifier which is used to route
+ * the messages to the right backend.
+ */
+typedef struct GTM_ProxyMsgHeader
+{
+ GTMProxy_ConnID ph_conid;
+} GTM_ProxyMsgHeader;
+
+#endif
diff --git a/src/include/gtm/gtm_proxy.h b/src/include/gtm/gtm_proxy.h
new file mode 100644
index 0000000000..8dc16bca0e
--- /dev/null
+++ b/src/include/gtm/gtm_proxy.h
@@ -0,0 +1,221 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_proxy.h
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef _GTM_PROXY_H
+#define _GTM_PROXY_H
+
+#include <setjmp.h>
+#include <poll.h>
+
+#include "gtm/gtm_c.h"
+#include "gtm/palloc.h"
+#include "gtm/gtm_lock.h"
+#include "gtm/gtm_conn.h"
+#include "gtm/elog.h"
+#include "gtm/gtm_list.h"
+#include "gtm/gtm_msg.h"
+#include "gtm/libpq-fe.h"
+
+extern char *GTMProxyLogFile;
+
+typedef enum GTMProxy_ThreadStatus
+{
+ GTM_PROXY_THREAD_STARTING,
+ GTM_PROXY_THREAD_RUNNING,
+ GTM_PROXY_THREAD_EXITING,
+ /* Must be the last */
+ GTM_PROXY_THREAD_INVALID
+} GTMProxy_ThreadStatus;
+
+typedef struct GTMProxy_ConnectionInfo
+{
+ /* Port contains all the vital information about this connection */
+ Port *con_port;
+ struct GTMProxy_ThreadInfo *con_thrinfo;
+ bool con_authenticated;
+ bool con_disconnected;
+ GTMProxy_ConnID con_id;
+
+ GTM_MessageType con_pending_msg;
+ GlobalTransactionId con_txid;
+ GTM_TransactionHandle con_handle;
+} GTMProxy_ConnectionInfo;
+
+typedef struct GTMProxy_Connections
+{
+ uint32 gc_conn_count;
+ uint32 gc_array_size;
+ GTMProxy_ConnectionInfo *gc_connections;
+ GTM_RWLock gc_lock;
+} GTMProxy_Connections;
+
+#define ERRORDATA_STACK_SIZE 5
+#define GTM_PROXY_MAX_CONNECTIONS 1024
+
+typedef struct GTMProxy_ThreadInfo
+{
+ /*
+ * Thread specific information such as connection(s) served by it
+ */
+ GTM_ThreadID thr_id;
+ uint32 thr_localid;
+ void * (* thr_startroutine)(void *);
+
+ MemoryContext thr_thread_context;
+ MemoryContext thr_message_context;
+ MemoryContext thr_current_context;
+ MemoryContext thr_error_context;
+ MemoryContext thr_parent_context;
+
+ sigjmp_buf *thr_sigjmp_buf;
+
+ ErrorData thr_error_data[ERRORDATA_STACK_SIZE];
+ int thr_error_stack_depth;
+ int thr_error_recursion_depth;
+ int thr_criticalsec_count;
+
+ GTMProxy_ThreadStatus thr_status;
+ GTMProxy_ConnectionInfo *thr_conn; /* Current active */
+
+ /*
+ * The structure member type/sequence up to this point must match the
+ * GTM_ThreadInfo structure in gtm.h since they are shared in some common
+ * library routines such as elog.c. Keeping them in sync helps us use the
+ * same library for the proxy as well as the server.
+ */
+ GTM_MutexLock thr_lock;
+ GTM_CV thr_cv;
+
+ /*
+ * We use a sequence number to track the state of connection/fd array.
+ * Whenever a new connection is added or an existing connection is deleted
+ * from the connection array, the sequence number is incremented. The
+ * thread main routine can then reconstruct the fd array again.
+ */
+ int32 thr_seqno;
+
+ /* number of connections served by this thread */
+ uint32 thr_conn_count;
+
+ /* connection array */
+ GTMProxy_ConnectionInfo *thr_all_conns[GTM_PROXY_MAX_CONNECTIONS];
+ struct pollfd thr_poll_fds[GTM_PROXY_MAX_CONNECTIONS];
+ List *thr_processed_commands;
+ List *thr_pending_commands[MSG_TYPE_COUNT];
+
+ GTM_Conn *thr_gtm_conn;
+
+} GTMProxy_ThreadInfo;
+
+typedef struct GTMProxy_Threads
+{
+ uint32 gt_thread_count;
+ uint32 gt_array_size;
+ uint32 gt_next_worker;
+ GTMProxy_ThreadInfo **gt_threads;
+ GTM_RWLock gt_lock;
+} GTMProxy_Threads;
+
+extern GTMProxy_Threads *GTMProxyThreads;
+
+int GTMProxy_ThreadAdd(GTMProxy_ThreadInfo *thrinfo);
+int GTMProxy_ThreadRemove(GTMProxy_ThreadInfo *thrinfo);
+int GTMProxy_ThreadJoin(GTMProxy_ThreadInfo *thrinfo);
+void GTMProxy_ThreadExit(void);
+
+extern GTMProxy_ThreadInfo *GTMProxy_ThreadCreate(void *(* startroutine)(void *));
+extern GTMProxy_ThreadInfo * GTMProxy_GetThreadInfo(GTM_ThreadID thrid);
+extern GTMProxy_ThreadInfo *GTMProxy_ThreadAddConnection(GTMProxy_ConnectionInfo *conninfo);
+extern int GTMProxy_ThreadRemoveConnection(GTMProxy_ThreadInfo *thrinfo,
+ GTMProxy_ConnectionInfo *conninfo);
+
+/*
+ * Command data - the only relevant information right now is the XID
+ */
+typedef union GTMProxy_CommandData
+{
+ struct
+ {
+ bool rdonly;
+ GTM_IsolationLevel iso_level;
+ } cd_beg;
+
+ struct
+ {
+ bool isgxid;
+ GlobalTransactionId gxid;
+ GTM_TransactionHandle handle;
+ } cd_rc;
+
+ struct
+ {
+ bool isgxid;
+ GlobalTransactionId gxid;
+ GTM_TransactionHandle handle;
+ } cd_snap;
+} GTMProxy_CommandData;
+
+/*
+ * Structures to be used for message proxying. There will be one such entry for
+ * each pending command from a backend. To keep it simple, we have a separate
+ * entry even if the commands are grouped together.
+ *
+ * An array of these entries is maintained which is sorted by the order in
+ * which the commands are sent to the GTM server. We expect the GTM server to
+ * respond back in the same order and the sorted array helps us in
+ * matching/confirming the responses.
+ */
+typedef struct GTMProxy_CommandInfo
+{
+ GTM_MessageType ci_mtype;
+ int ci_res_index;
+ GTMProxy_CommandData ci_data;
+ GTMProxy_ConnectionInfo *ci_conn;
+} GTMProxy_CommandInfo;
+
+/*
+ * pthread keys to get thread specific information
+ */
+extern pthread_key_t threadinfo_key;
+extern MemoryContext TopMostMemoryContext;
+extern char *GTMLogFile;
+
+#define SetMyThreadInfo(thrinfo) pthread_setspecific(threadinfo_key, (thrinfo))
+#define GetMyThreadInfo ((GTMProxy_ThreadInfo *)pthread_getspecific(threadinfo_key))
+
+#define TopMemoryContext (GetMyThreadInfo->thr_thread_context)
+#define ThreadTopContext (GetMyThreadInfo->thr_thread_context)
+#define MessageContext (GetMyThreadInfo->thr_message_context)
+#define CurrentMemoryContext (GetMyThreadInfo->thr_current_context)
+#define ErrorContext (GetMyThreadInfo->thr_error_context)
+#define errordata (GetMyThreadInfo->thr_error_data)
+#define recursion_depth (GetMyThreadInfo->thr_error_recursion_depth)
+#define errordata_stack_depth (GetMyThreadInfo->thr_error_stack_depth)
+#define CritSectionCount (GetMyThreadInfo->thr_criticalsec_count)
+
+#define PG_exception_stack (GetMyThreadInfo->thr_sigjmp_buf)
+#define MyConnection (GetMyThreadInfo->thr_conn)
+#define MyPort ((GetMyThreadInfo->thr_conn != NULL) ? \
+ GetMyThreadInfo->thr_conn->con_port : \
+ NULL)
+#define MyThreadID (GetMyThreadInfo->thr_id)
+
+#define START_CRIT_SECTION() (CritSectionCount++)
+
+#define END_CRIT_SECTION() \
+ do { \
+ Assert(CritSectionCount > 0); \
+ CritSectionCount--; \
+ } while(0)
+
+#endif
diff --git a/src/include/gtm/gtm_seq.h b/src/include/gtm/gtm_seq.h
new file mode 100644
index 0000000000..6cb8cb3027
--- /dev/null
+++ b/src/include/gtm/gtm_seq.h
@@ -0,0 +1,75 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_seq.h
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GTM_SEQ_H
+#define GTM_SEQ_H
+
+#include "gtm/stringinfo.h"
+
+/* Global sequence related structures */
+
+typedef struct GTM_SeqInfo
+{
+ GTM_SequenceKey gs_key;
+ GTM_Sequence gs_value;
+ GTM_Sequence gs_init_value;
+ GTM_Sequence gs_increment_by;
+ GTM_Sequence gs_min_value;
+ GTM_Sequence gs_max_value;
+ bool gs_cycle;
+ bool gs_called;
+
+ int32 gs_ref_count;
+ int32 gs_state;
+ GTM_RWLock gs_lock;
+} GTM_SeqInfo;
+
+#define SEQ_STATE_ACTIVE 1
+#define SEQ_STATE_DELETED 2
+
+#define SEQ_IS_ASCENDING(s) ((s)->gs_increment_by > 0)
+#define SEQ_IS_CYCLE(s) ((s)->gs_cycle)
+#define SEQ_IS_CALLED(s) ((s)->gs_called)
+
+#define SEQ_DEF_MAX_SEQVAL_ASCEND 0x7ffffffffffffffeLL
+#define SEQ_DEF_MIN_SEQVAL_ASCEND 0x1
+
+#define SEQ_DEF_MAX_SEQVAL_DESCEND (-0x1) /* negative limits are parenthesized so the sign stays bound wherever the macro expands */
+#define SEQ_DEF_MIN_SEQVAL_DESCEND (-0x7ffffffffffffffeLL)
+
+#define SEQ_MAX_REFCOUNT 1024
+
+/* SEQUENCE Management */
+void GTM_InitSeqManager(void);
+int GTM_SeqOpen(GTM_SequenceKey seqkey,
+ GTM_Sequence increment_by,
+ GTM_Sequence minval,
+ GTM_Sequence maxval,
+ GTM_Sequence startval,
+ bool cycle);
+int GTM_SeqClose(GTM_SequenceKey sqkey);
+GTM_Sequence GTM_SeqGetNext(GTM_SequenceKey seqkey);
+GTM_Sequence GTM_SeqGetCurrent(GTM_SequenceKey seqkey);
+int GTM_SeqReset(GTM_SequenceKey seqkey);
+
+
+void ProcessSequenceInitCommand(Port *myport, StringInfo message);
+void ProcessSequenceGetCurrentCommand(Port *myport, StringInfo message);
+void ProcessSequenceGetNextCommand(Port *myport, StringInfo message);
+void ProcessSequenceResetCommand(Port *myport, StringInfo message);
+void ProcessSequenceCloseCommand(Port *myport, StringInfo message);
+
+void GTM_SaveSeqInfo(int ctlfd);
+void GTM_RestoreSeqInfo(int ctlfd);
+
+#endif
diff --git a/src/include/gtm/gtm_txn.h b/src/include/gtm/gtm_txn.h
new file mode 100644
index 0000000000..2d789463f7
--- /dev/null
+++ b/src/include/gtm/gtm_txn.h
@@ -0,0 +1,235 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_txn.h
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef _GTM_TXN_H
+#define _GTM_TXN_H
+
+#include "gtm/gtm_c.h"
+#include "gtm/gtm_lock.h"
+#include "gtm/gtm_list.h"
+#include "gtm/stringinfo.h"
+
+/* ----------------
+ * Special transaction ID values
+ *
+ * BootstrapGlobalTransactionId is the XID for "bootstrap" operations, and
+ * FrozenGlobalTransactionId is used for very old tuples. Both should
+ * always be considered valid.
+ *
+ * FirstNormalGlobalTransactionId is the first "normal" transaction id.
+ * Note: if you need to change it, you must change pg_class.h as well.
+ * ----------------
+ */
+#define BootstrapGlobalTransactionId ((GlobalTransactionId) 1)
+#define FrozenGlobalTransactionId ((GlobalTransactionId) 2)
+#define FirstNormalGlobalTransactionId ((GlobalTransactionId) 3)
+#define MaxGlobalTransactionId ((GlobalTransactionId) 0xFFFFFFFF)
+
+/* ----------------
+ * transaction ID manipulation macros
+ * ----------------
+ */
+#define GlobalTransactionIdIsNormal(xid) ((xid) >= FirstNormalGlobalTransactionId)
+#define GlobalTransactionIdEquals(id1, id2) ((id1) == (id2))
+#define GlobalTransactionIdStore(xid, dest) (*(dest) = (xid))
+#define StoreInvalidGlobalTransactionId(dest) (*(dest) = InvalidGlobalTransactionId)
+
+/* advance a transaction ID variable, handling wraparound correctly */
+#define GlobalTransactionIdAdvance(dest) \
+ do { \
+ (dest)++; \
+ if ((dest) < FirstNormalGlobalTransactionId) \
+ (dest) = FirstNormalGlobalTransactionId; \
+ } while(0)
+
+/* back up a transaction ID variable, handling wraparound: decrement repeatedly until the value is no longer below FirstNormalGlobalTransactionId */
+#define GlobalTransactionIdRetreat(dest) \
+ do { \
+ (dest)--; \
+ } while ((dest) < FirstNormalGlobalTransactionId) /* a real loop, not a do-while(0) wrapper; the caller supplies the ';' */
+
+typedef int XidStatus;
+
+#define TRANSACTION_STATUS_IN_PROGRESS 0x00
+#define TRANSACTION_STATUS_COMMITTED 0x01
+#define TRANSACTION_STATUS_ABORTED 0x02
+
+/*
+ * prototypes for functions in transam/transam.c
+ */
+extern bool GlobalTransactionIdDidCommit(GlobalTransactionId transactionId);
+extern bool GlobalTransactionIdDidAbort(GlobalTransactionId transactionId);
+extern void GlobalTransactionIdAbort(GlobalTransactionId transactionId);
+extern bool GlobalTransactionIdPrecedes(GlobalTransactionId id1, GlobalTransactionId id2);
+extern bool GlobalTransactionIdPrecedesOrEquals(GlobalTransactionId id1, GlobalTransactionId id2);
+extern bool GlobalTransactionIdFollows(GlobalTransactionId id1, GlobalTransactionId id2);
+extern bool GlobalTransactionIdFollowsOrEquals(GlobalTransactionId id1, GlobalTransactionId id2);
+
+/* in transam/varsup.c */
+extern GlobalTransactionId GTM_GetGlobalTransactionId(GTM_TransactionHandle handle);
+extern GlobalTransactionId GTM_GetGlobalTransactionIdMulti(GTM_TransactionHandle handle[], int txn_count);
+extern GlobalTransactionId ReadNewGlobalTransactionId(void);
+extern void SetGlobalTransactionIdLimit(GlobalTransactionId oldest_datfrozenxid);
+extern void SetNextGlobalTransactionId(GlobalTransactionId gxid);
+extern void GTM_SetShuttingDown(void);
+
+typedef enum GTM_States
+{
+ GTM_STARTING,
+ GTM_RUNNING,
+ GTM_SHUTTING_DOWN
+} GTM_States;
+
+/* Global transaction states at the GTM */
+typedef enum GTM_TransactionStates
+{
+ GTM_TXN_STARTING,
+ GTM_TXN_IN_PROGRESS,
+ GTM_TXN_PREPARE_IN_PROGRESS,
+ GTM_TXN_PREPARED,
+ GTM_TXN_COMMIT_IN_PROGRESS,
+ GTM_TXN_COMMITTED,
+ GTM_TXN_ABORT_IN_PROGRESS,
+ GTM_TXN_ABORTED
+} GTM_TransactionStates;
+
+typedef struct GTM_TransactionInfo
+{
+ GTM_TransactionHandle gti_handle;
+ GTM_ThreadID gti_thread_id;
+
+ bool gti_in_use;
+ GlobalTransactionId gti_gxid;
+ GTM_TransactionStates gti_state;
+ PGXC_NodeId gti_coordid;
+ GlobalTransactionId gti_xmin;
+ GTM_IsolationLevel gti_isolevel;
+ bool gti_readonly;
+ GTMProxy_ConnID gti_backend_id;
+ uint32 gti_nodecount;
+ PGXC_NodeId *gti_nodes;
+
+ GTM_SnapshotData gti_current_snapshot;
+ bool gti_snapshot_set;
+
+ GTM_RWLock gti_lock;
+ bool gti_vacuum;
+} GTM_TransactionInfo;
+
+#define GTM_MAX_2PC_NODES 16
+#define GTM_CheckTransactionHandle(x) ((x) >= 0 && (x) < GTM_MAX_GLOBAL_TRANSACTIONS)
+#define GTM_IsTransSerializable(x) ((x)->gti_isolevel == GTM_ISOLATION_SERIALIZABLE)
+
+typedef struct GTM_Transactions
+{
+ uint32 gt_txn_count;
+ GTM_States gt_gtm_state;
+
+ GTM_RWLock gt_XidGenLock;
+
+ /*
+ * These fields are protected by XidGenLock
+ */
+ GlobalTransactionId gt_nextXid; /* next XID to assign */
+
+ GlobalTransactionId gt_oldestXid; /* cluster-wide minimum datfrozenxid */
+ GlobalTransactionId gt_xidVacLimit; /* start forcing autovacuums here */
+ GlobalTransactionId gt_xidWarnLimit; /* start complaining here */
+ GlobalTransactionId gt_xidStopLimit; /* refuse to advance nextXid beyond here */
+ GlobalTransactionId gt_xidWrapLimit; /* where the world ends */
+
+ /*
+ * These fields are protected by TransArrayLock.
+ */
+ GlobalTransactionId gt_latestCompletedXid; /* newest XID that has committed or
+ * aborted */
+
+ GlobalTransactionId gt_recent_global_xmin;
+
+ int32 gt_lastslot;
+ GTM_TransactionInfo gt_transactions_array[GTM_MAX_GLOBAL_TRANSACTIONS];
+ List *gt_open_transactions;
+
+ GTM_RWLock gt_TransArrayLock;
+} GTM_Transactions;
+
+extern GTM_Transactions GTMTransactions;
+
+#define GTM_CountOpenTransactions() (list_length(GTMTransactions.gt_open_transactions))
+
+/*
+ * Two hash tables will be maintained to quickly find the
+ * GTM_TransactionInfo block given either the GXID or the GTM_TransactionHandle.
+ */
+
+GTM_TransactionInfo *GTM_HandleToTransactionInfo(GTM_TransactionHandle handle);
+GTM_TransactionHandle GTM_GXIDToHandle(GlobalTransactionId gxid);
+
+/* Transaction Control */
+void GTM_InitTxnManager(void);
+GTM_TransactionHandle GTM_BeginTransaction(GTM_CoordinatorId coord_id,
+ GTM_IsolationLevel isolevel,
+ bool readonly);
+int GTM_BeginTransactionMulti(GTM_CoordinatorId coord_id,
+ GTM_IsolationLevel isolevel[],
+ bool readonly[],
+ GTMProxy_ConnID connid[],
+ int txn_count,
+ GTM_TransactionHandle txns[]);
+int GTM_RollbackTransaction(GTM_TransactionHandle txn);
+int GTM_RollbackTransactionMulti(GTM_TransactionHandle txn[], int txn_count, int status[]);
+int GTM_RollbackTransactionGXID(GlobalTransactionId gxid);
+int GTM_CommitTransaction(GTM_TransactionHandle txn);
+int GTM_CommitTransactionMulti(GTM_TransactionHandle txn[], int txn_count, int status[]);
+int GTM_CommitTransactionGXID(GlobalTransactionId gxid);
+int GTM_PrepareTransaction(GTM_TransactionHandle txn,
+ uint32 nodecnt,
+ PGXC_NodeId nodes[]);
+int GTM_PrepareTransactionGXID(GlobalTransactionId gxid,
+ uint32 nodecnt,
+ PGXC_NodeId nodes[]);
+uint32 GTM_GetAllPrepared(GlobalTransactionId gxids[], uint32 gxidcnt);
+GTM_TransactionStates GTM_GetStatus(GTM_TransactionHandle txn);
+GTM_TransactionStates GTM_GetStatusGXID(GlobalTransactionId gxid);
+int GTM_GetAllTransactions(GTM_TransactionInfo txninfo[], uint32 txncnt);
+void GTM_RemoveAllTransInfos(int backend_id);
+
+GTM_Snapshot GTM_GetSnapshotData(GTM_TransactionInfo *my_txninfo,
+ GTM_Snapshot snapshot);
+GTM_Snapshot GTM_GetTransactionSnapshot(GTM_TransactionHandle handle[],
+ int txn_count, int *status);
+void GTM_FreeCachedTransInfo(void);
+
+void ProcessBeginTransactionCommand(Port *myport, StringInfo message);
+void ProcessBeginTransactionCommandMulti(Port *myport, StringInfo message);
+void ProcessBeginTransactionGetGXIDCommand(Port *myport, StringInfo message);
+void ProcessCommitTransactionCommand(Port *myport, StringInfo message);
+void ProcessRollbackTransactionCommand(Port *myport, StringInfo message);
+void ProcessPrepareTransactionCommand(Port *myport, StringInfo message);
+void ProcessGetGXIDTransactionCommand(Port *myport, StringInfo message);
+
+void ProcessBeginTransactionGetGXIDAutovacuumCommand(Port *myport, StringInfo message);
+void ProcessBeginTransactionGetGXIDCommandMulti(Port *myport, StringInfo message);
+void ProcessCommitTransactionCommandMulti(Port *myport, StringInfo message);
+void ProcessRollbackTransactionCommandMulti(Port *myport, StringInfo message) ;
+
+void GTM_SaveTxnInfo(int ctlfd);
+void GTM_RestoreTxnInfo(int ctlfd, GlobalTransactionId next_gxid);
+
+/*
+ * In gtm_snap.c
+ */
+void ProcessGetSnapshotCommand(Port *myport, StringInfo message, bool get_gxid);
+void ProcessGetSnapshotCommandMulti(Port *myport, StringInfo message);
+void GTM_FreeSnapshotData(GTM_Snapshot snapshot);
+#endif
diff --git a/src/include/gtm/ip.h b/src/include/gtm/ip.h
new file mode 100644
index 0000000000..c5d975298b
--- /dev/null
+++ b/src/include/gtm/ip.h
@@ -0,0 +1,50 @@
+/*-------------------------------------------------------------------------
+ *
+ * ip.h
+ * Definitions for IPv6-aware network access.
+ *
+ * These definitions are used by both frontend and backend code. Be careful
+ * what you include here!
+ *
+ * Copyright (c) 2003-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/libpq/ip.h,v 1.20 2008/01/01 19:45:58 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef IP_H
+#define IP_H
+
+#include "gtm/pqcomm.h"
+
+
+extern int pg_getaddrinfo_all(const char *hostname, const char *servname,
+ const struct addrinfo * hintp,
+ struct addrinfo ** result);
+extern void pg_freeaddrinfo_all(int hint_ai_family, struct addrinfo * ai);
+
+extern int pg_getnameinfo_all(const struct sockaddr_storage * addr, int salen,
+ char *node, int nodelen,
+ char *service, int servicelen,
+ int flags);
+
+extern int pg_range_sockaddr(const struct sockaddr_storage * addr,
+ const struct sockaddr_storage * netaddr,
+ const struct sockaddr_storage * netmask);
+
+extern int pg_sockaddr_cidr_mask(struct sockaddr_storage * mask,
+ char *numbits, int family);
+
+#ifdef HAVE_IPV6
+extern void pg_promote_v4_to_v6_addr(struct sockaddr_storage * addr);
+extern void pg_promote_v4_to_v6_mask(struct sockaddr_storage * addr);
+#endif
+
+#ifdef HAVE_UNIX_SOCKETS
+#define IS_AF_UNIX(fam) ((fam) == AF_UNIX)
+#else
+#define IS_AF_UNIX(fam) (0)
+#endif
+
+#endif /* IP_H */
diff --git a/src/include/gtm/libpq-be.h b/src/include/gtm/libpq-be.h
new file mode 100644
index 0000000000..0a795def67
--- /dev/null
+++ b/src/include/gtm/libpq-be.h
@@ -0,0 +1,86 @@
+/*-------------------------------------------------------------------------
+ *
+ * libpq-be.h
+ * This file contains definitions for structures and externs used
+ * by the postmaster during client authentication.
+ *
+ * Note that this is backend-internal and is NOT exported to clients.
+ * Structs that need to be client-visible are in pqcomm.h.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/libpq/libpq-be.h,v 1.69 2009/01/01 17:23:59 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LIBPQ_BE_H
+#define LIBPQ_BE_H
+
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+#ifdef HAVE_NETINET_TCP_H
+#include <netinet/tcp.h>
+#endif
+
+#include "gtm/pqcomm.h"
+
+/*
+ * This is used by the postmaster in its communication with frontends. It
+ * contains all state information needed during this communication before the
+ * backend is run. The Port structure is kept in malloc'd memory and is
+ * still available when a backend is running (see MyProcPort). The data
+ * it points to must also be malloc'd, or else palloc'd in TopMostMemoryContext,
+ * so that it survives into GTM_ThreadMain execution!
+ */
+
+typedef struct Port
+{
+ int sock; /* File descriptor */
+ SockAddr laddr; /* local addr (postmaster) */
+ SockAddr raddr; /* remote addr (client) */
+ char *remote_host; /* name (or ip addr) of remote host */
+ char *remote_port; /* text rep of remote port */
+
+ GTMProxy_ConnID conn_id; /* RequestID of this command */
+
+ GTM_CoordinatorId coordinator_id; /* Coordinator ID */
+ bool is_proxy; /* Is this a connection from GTM proxy ? */
+#define PQ_BUFFER_SIZE 8192
+
+ char PqSendBuffer[PQ_BUFFER_SIZE];
+ int PqSendPointer; /* Next index to store a byte in PqSendBuffer */
+
+ char PqRecvBuffer[PQ_BUFFER_SIZE];
+ int PqRecvPointer; /* Next index to read a byte from PqRecvBuffer */
+ int PqRecvLength; /* End of data available in PqRecvBuffer */
+
+ /*
+ * TCP keepalive settings.
+ *
+ * default values are 0 if AF_UNIX or not yet known; current values are 0
+ * if AF_UNIX or using the default. Also, -1 in a default value means we
+ * were unable to find out the default (getsockopt failed).
+ */
+ int default_keepalives_idle;
+ int default_keepalives_interval;
+ int default_keepalives_count;
+ int keepalives_idle;
+ int keepalives_interval;
+ int keepalives_count;
+} Port;
+
+/* TCP keepalives configuration. These are no-ops on an AF_UNIX socket. */
+
+extern int pq_getkeepalivesidle(Port *port);
+extern int pq_getkeepalivesinterval(Port *port);
+extern int pq_getkeepalivescount(Port *port);
+
+extern int pq_setkeepalivesidle(int idle, Port *port);
+extern int pq_setkeepalivesinterval(int interval, Port *port);
+extern int pq_setkeepalivescount(int count, Port *port);
+
+#endif /* LIBPQ_BE_H */
diff --git a/src/include/gtm/libpq-fe.h b/src/include/gtm/libpq-fe.h
new file mode 100644
index 0000000000..2c5c2c4e04
--- /dev/null
+++ b/src/include/gtm/libpq-fe.h
@@ -0,0 +1,138 @@
+/*-------------------------------------------------------------------------
+ *
+ * libpq-fe.h
+ * This file contains definitions for structures and
+ * externs for functions used by frontend postgres applications.
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/interfaces/libpq/libpq-fe.h,v 1.145 2009/01/01 17:24:03 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef LIBPQ_FE_H
+#define LIBPQ_FE_H
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include <stdio.h>
+
+/*
+ * Inherited from PostgreSQL, where postgres_ext.h defines the backend's
+ * externally visible types, such as Oid; GTM includes gtm_ext.h instead.
+ */
+#include "gtm/gtm_ext.h"
+
+/*
+ * Option flags for PQcopyResult
+ */
+#define PG_COPYRES_ATTRS 0x01
+#define PG_COPYRES_TUPLES 0x02 /* Implies PG_COPYRES_ATTRS */
+#define PG_COPYRES_EVENTS 0x04
+#define PG_COPYRES_NOTICEHOOKS 0x08
+
+/* Application-visible enum types */
+
+typedef enum
+{
+ /*
+ * Although it is okay to add to this list, values which become unused
+ * should never be removed, nor should constants be redefined - that would
+ * break compatibility with existing code.
+ */
+ CONNECTION_OK,
+ CONNECTION_BAD,
+ /* Non-blocking mode only below here */
+
+ /*
+ * The existence of these should never be relied upon - they should only
+ * be used for user feedback or similar purposes.
+ */
+ CONNECTION_STARTED, /* Waiting for connection to be made. */
+ CONNECTION_MADE, /* Connection OK; waiting to send. */
+ CONNECTION_AWAITING_RESPONSE, /* Waiting for a response from the
+ * postmaster. */
+ CONNECTION_AUTH_OK, /* Received authentication; waiting for
+ * backend startup. */
+ CONNECTION_SETENV, /* Negotiating environment. */
+ CONNECTION_SSL_STARTUP, /* Negotiating SSL. */
+ CONNECTION_NEEDED /* Internal state: connect() needed */
+} ConnStatusType;
+
+typedef enum
+{
+ PGRES_POLLING_FAILED = 0,
+ PGRES_POLLING_READING, /* These two indicate that one may */
+ PGRES_POLLING_WRITING, /* use select before polling again. */
+ PGRES_POLLING_OK,
+ PGRES_POLLING_ACTIVE /* unused; keep for awhile for backwards
+ * compatibility */
+} GTMClientPollingStatusType;
+
+/* ----------------
+ * Structure for the conninfo parameter definitions returned by PQconndefaults
+ * or GTMPQconninfoParse.
+ *
+ * All fields except "val" point at static strings which must not be altered.
+ * "val" is either NULL or a malloc'd current-value string. GTMPQconninfoFree()
+ * will release both the val strings and the GTMPQconninfoOption array itself.
+ * ----------------
+ */
+typedef struct _GTMPQconninfoOption
+{
+ char *keyword; /* The keyword of the option */
+ char *val; /* Option's current value, or NULL */
+} GTMPQconninfoOption;
+
+typedef struct gtm_conn GTM_Conn;
+
+/* ----------------
+ * Exported functions of libpq
+ * ----------------
+ */
+
+/* === in fe-connect.c === */
+
+/* make a new client connection to the backend */
+/* Asynchronous (non-blocking) */
+extern GTM_Conn *PQconnectGTMStart(const char *conninfo);
+extern GTMClientPollingStatusType GTMPQconnectPoll(GTM_Conn *conn);
+
+/* Synchronous (blocking) */
+extern GTM_Conn *PQconnectGTM(const char *conninfo);
+
+/* close the current connection and free the GTM_Conn data structure */
+extern void GTMPQfinish(GTM_Conn *conn);
+
+/* parse connection options in same way as PQconnectGTM */
+extern GTMPQconninfoOption *GTMPQconninfoParse(const char *conninfo, char **errmsg);
+
+/* free the data structure returned by PQconndefaults() or GTMPQconninfoParse() */
+extern void GTMPQconninfoFree(GTMPQconninfoOption *connOptions);
+
+extern char *GTMPQhost(const GTM_Conn *conn);
+extern char *GTMPQport(const GTM_Conn *conn);
+extern ConnStatusType GTMPQstatus(const GTM_Conn *conn);
+extern char *GTMPQerrorMessage(const GTM_Conn *conn);
+extern int GTMPQsocket(const GTM_Conn *conn);
+
+/* Enable/disable tracing */
+extern void GTMPQtrace(GTM_Conn *conn, FILE *debug_port);
+extern void GTMPQuntrace(GTM_Conn *conn);
+
+/* Force the write buffer to be written (or at least try) */
+extern int PQflush(GTM_Conn *conn);
+
+#define libpq_gettext(x) x
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* LIBPQ_FE_H */
diff --git a/src/include/gtm/libpq-int.h b/src/include/gtm/libpq-int.h
new file mode 100644
index 0000000000..5956de8ff2
--- /dev/null
+++ b/src/include/gtm/libpq-int.h
@@ -0,0 +1,129 @@
+/*-------------------------------------------------------------------------
+ *
+ * libpq-int.h
+ * This file contains internal definitions meant to be used only by
+ * the frontend libpq library, not by applications that call it.
+ *
+ * An application can include this file if it wants to bypass the
+ * official API defined by libpq-fe.h, but code that does so is much
+ * more likely to break across PostgreSQL releases than code that uses
+ * only the official API.
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/interfaces/libpq/libpq-int.h,v 1.139 2009/01/01 17:24:03 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef LIBPQ_INT_H
+#define LIBPQ_INT_H
+
+#include <time.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include "gtm/pqcomm.h"
+#include "gtm/pqexpbuffer.h"
+#include "gtm/gtm_client.h"
+
+/*
+ * GTM_Conn stores all the state data associated with a single connection
+ * to a backend.
+ */
+struct gtm_conn
+{
+ /* Saved values of connection options */
+ char *pghost; /* the machine on which the server is running */
+ char *pghostaddr; /* the IPv4 address of the machine on which
+ * the server is running, in IPv4
+ * numbers-and-dots notation. Takes precedence
+ * over above. */
+ char *pgport; /* the server's communication port */
+ char *connect_timeout; /* connection timeout (numeric string) */
+ char *coordinator_id; /* coordinator id */
+ int is_proxy; /* is this a connection to/from a proxy ? */
+
+ /* Optional file to write trace info to */
+ FILE *Pfdebug;
+
+ /* Status indicators */
+ ConnStatusType status;
+
+ /* Connection data */
+ int sock; /* Unix FD for socket, -1 if not connected */
+ SockAddr laddr; /* Local address */
+ SockAddr raddr; /* Remote address */
+
+ /* Transient state needed while establishing connection */
+ struct addrinfo *addrlist; /* list of possible backend addresses */
+ struct addrinfo *addr_cur; /* the one currently being tried */
+ int addrlist_family; /* needed to know how to free addrlist */
+
+ /* Buffer for data received from backend and not yet processed */
+ char *inBuffer; /* currently allocated buffer */
+ int inBufSize; /* allocated size of buffer */
+ int inStart; /* offset to first unconsumed data in buffer */
+ int inCursor; /* next byte to tentatively consume */
+ int inEnd; /* offset to first position after avail data */
+
+ /* Buffer for data not yet sent to backend */
+ char *outBuffer; /* currently allocated buffer */
+ int outBufSize; /* allocated size of buffer */
+ int outCount; /* number of chars waiting in buffer */
+
+ /* State for constructing messages in outBuffer */
+ int outMsgStart; /* offset to msg start (length word); if -1,
+ * msg has no length word */
+ int outMsgEnd; /* offset to msg end (so far) */
+
+ /* Buffer for current error message */
+ PQExpBufferData errorMessage; /* expansible string */
+
+ /* Buffer for receiving various parts of messages */
+ PQExpBufferData workBuffer; /* expansible string */
+
+ /* Pointer to the result of last operation */
+ GTM_Result *result;
+};
+
+/* === in fe-misc.c === */
+
+ /*
+ * "Get" and "Put" routines return 0 if successful, EOF if not. Note that for
+ * Get, EOF merely means the buffer is exhausted, not that there is
+ * necessarily any error.
+ */
+extern int gtmpqCheckOutBufferSpace(size_t bytes_needed, GTM_Conn *conn);
+extern int gtmpqCheckInBufferSpace(size_t bytes_needed, GTM_Conn *conn);
+extern int gtmpqGetc(char *result, GTM_Conn *conn);
+extern int gtmpqPutc(char c, GTM_Conn *conn);
+extern int gtmpqGets(PQExpBuffer buf, GTM_Conn *conn);
+extern int gtmpqGets_append(PQExpBuffer buf, GTM_Conn *conn);
+extern int gtmpqPuts(const char *s, GTM_Conn *conn);
+extern int gtmpqGetnchar(char *s, size_t len, GTM_Conn *conn);
+extern int gtmpqPutnchar(const char *s, size_t len, GTM_Conn *conn);
+extern int gtmpqGetInt(int *result, size_t bytes, GTM_Conn *conn);
+extern int gtmpqPutInt(int value, size_t bytes, GTM_Conn *conn);
+extern int gtmpqPutMsgStart(char msg_type, bool force_len, GTM_Conn *conn);
+extern int gtmpqPutMsgEnd(GTM_Conn *conn);
+extern int gtmpqReadData(GTM_Conn *conn);
+extern int gtmpqFlush(GTM_Conn *conn);
+extern int gtmpqWait(int forRead, int forWrite, GTM_Conn *conn);
+extern int gtmpqWaitTimed(int forRead, int forWrite, GTM_Conn *conn,
+ time_t finish_time);
+extern int gtmpqReadReady(GTM_Conn *conn);
+extern int gtmpqWriteReady(GTM_Conn *conn);
+
+/*
+ * In fe-protocol.c
+ */
+GTM_Result * GTMPQgetResult(GTM_Conn *conn);
+extern int gtmpqGetError(GTM_Conn *conn, GTM_Result *result);
+void gtmpqFreeResultData(GTM_Result *result, bool is_proxy);
+
+#define SOCK_ERRNO errno
+#define SOCK_ERRNO_SET(e) (errno = (e))
+
+#endif /* LIBPQ_INT_H */
diff --git a/src/include/gtm/libpq.h b/src/include/gtm/libpq.h
new file mode 100644
index 0000000000..29621a43c4
--- /dev/null
+++ b/src/include/gtm/libpq.h
@@ -0,0 +1,47 @@
+/*-------------------------------------------------------------------------
+ *
+ * libpq.h
+ * POSTGRES LIBPQ buffer structure definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/libpq/libpq.h,v 1.70 2008/11/20 09:29:36 mha Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LIBPQ_H
+#define LIBPQ_H
+
+#include <sys/types.h>
+#include <netinet/in.h>
+
+#include "gtm/stringinfo.h"
+#include "gtm/libpq-be.h"
+
+/*
+ * External functions.
+ */
+
+/*
+ * prototypes for functions in pqcomm.c
+ */
+extern int StreamServerPort(int family, char *hostName,
+ unsigned short portNumber, int ListenSocket[],
+ int MaxListen);
+extern int StreamConnection(int server_fd, Port *port);
+extern void StreamClose(int sock);
+extern void TouchSocketFile(void);
+extern void pq_comm_reset(void);
+extern int pq_getbytes(Port *myport, char *s, size_t len);
+extern int pq_getstring(Port *myport, StringInfo s);
+extern int pq_getmessage(Port *myport, StringInfo s, int maxlen);
+extern int pq_getbyte(Port *myport);
+extern int pq_peekbyte(Port *myport);
+extern int pq_putbytes(Port *myport, const char *s, size_t len);
+extern int pq_flush(Port *myport);
+extern int pq_putmessage(Port *myport, char msgtype, const char *s, size_t len);
+
+#endif /* LIBPQ_H */
diff --git a/src/include/gtm/memnodes.h b/src/include/gtm/memnodes.h
new file mode 100644
index 0000000000..dea51b2bbd
--- /dev/null
+++ b/src/include/gtm/memnodes.h
@@ -0,0 +1,79 @@
+/*-------------------------------------------------------------------------
+ *
+ * memnodes.h
+ * POSTGRES memory context node definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/nodes/memnodes.h,v 1.34 2008/01/01 19:45:58 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef MEMNODES_H
+#define MEMNODES_H
+
+#include "gtm/gtm_lock.h"
+
+/*
+ * MemoryContext
+ * A logical context in which memory allocations occur.
+ *
+ * MemoryContext itself is an abstract type that can have multiple
+ * implementations, though for now we have only AllocSetContext.
+ * The function pointers in MemoryContextMethods define one specific
+ * implementation of MemoryContext --- they are a virtual function table
+ * in C++ terms.
+ *
+ * Note: for largely historical reasons, typedef MemoryContext is a pointer
+ * to the context struct rather than the struct type itself.
+ */
+
+typedef struct MemoryContextMethods
+{
+ void *(*alloc) (MemoryContext context, Size size);
+ /* call this free_p in case someone #define's free() */
+ void (*free_p) (MemoryContext context, void *pointer);
+ void *(*realloc) (MemoryContext context, void *pointer, Size size);
+ void (*init) (MemoryContext context);
+ void (*reset) (MemoryContext context);
+ void (*delete) (MemoryContext context);
+ Size (*get_chunk_space) (MemoryContext context, void *pointer);
+ bool (*is_empty) (MemoryContext context);
+ void (*stats) (MemoryContext context, int level);
+#ifdef MEMORY_CONTEXT_CHECKING
+ void (*check) (MemoryContext context);
+#endif
+} MemoryContextMethods;
+
+
+typedef struct MemoryContextData
+{
+ MemoryContextMethods *methods; /* virtual function table */
+ MemoryContext parent; /* NULL if no parent (toplevel context) */
+ MemoryContext firstchild; /* head of linked list of children */
+ MemoryContext nextchild; /* next child of same parent */
+ char *name; /* context name (just for debugging) */
+ bool is_shared; /* context is shared by threads */
+ GTM_RWLock lock; /* lock to protect members if the context is shared */
+} MemoryContextData;
+
+#define MemoryContextIsShared(context) \
+ (((MemoryContextData *)(context))->is_shared)
+
+#define MemoryContextLock(context) \
+ (GTM_RWLockAcquire(&((MemoryContextData *)(context))->lock, GTM_LOCKMODE_WRITE))
+#define MemoryContextUnlock(context) \
+ (GTM_RWLockRelease(&((MemoryContextData *)(context))->lock))
+/*
+ * MemoryContextIsValid
+ * True iff memory context is valid.
+ *
+ * Add new context types to the set accepted by this macro.
+ */
+#define MemoryContextIsValid(context) \
+ ((context) != NULL)
+
+#endif /* MEMNODES_H */
diff --git a/src/include/gtm/memutils.h b/src/include/gtm/memutils.h
new file mode 100644
index 0000000000..5d89995d4d
--- /dev/null
+++ b/src/include/gtm/memutils.h
@@ -0,0 +1,123 @@
+/*-------------------------------------------------------------------------
+ *
+ * memutils.h
+ * This file contains declarations for memory allocation utility
+ * functions. These are functions that are not quite widely used
+ * enough to justify going in gtm/palloc.h, but are still part
+ * of the API of the memory management subsystem.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/utils/memutils.h,v 1.64 2008/01/01 19:45:59 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef MEMUTILS_H
+#define MEMUTILS_H
+
+#include "gtm/gtm_c.h"
+#include "gtm/palloc.h"
+#include "gtm/memnodes.h"
+
+/*
+ * MaxAllocSize
+ * Quasi-arbitrary limit on size of allocations.
+ *
+ * Note:
+ * There is no guarantee that allocations smaller than MaxAllocSize
+ * will succeed. Allocation requests larger than MaxAllocSize will
+ * be summarily denied.
+ *
+ * XXX This is deliberately chosen to correspond to the limiting size
+ * of varlena objects under TOAST. See VARATT_MASK_SIZE in postgres.h.
+ *
+ * XXX Also, various places in aset.c assume they can compute twice an
+ * allocation's size without overflow, so beware of raising this.
+ */
+#define MaxAllocSize ((Size) 0x3fffffff) /* 1 gigabyte - 1 */
+
+#define AllocSizeIsValid(size) ((Size) (size) <= MaxAllocSize)
+
+/*
+ * All chunks allocated by any memory context manager are required to be
+ * preceded by a StandardChunkHeader at a spacing of STANDARDCHUNKHEADERSIZE.
+ * A currently-allocated chunk must contain a backpointer to its owning
+ * context as well as the allocated size of the chunk. The backpointer is
+ * used by pfree() and repalloc() to find the context to call. The allocated
+ * size is not absolutely essential, but it's expected to be needed by any
+ * reasonable implementation.
+ */
+typedef struct StandardChunkHeader
+{
+ MemoryContext context; /* owning context */
+ Size size; /* size of data space allocated in chunk */
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* when debugging memory usage, also store actual requested size */
+ Size requested_size;
+#endif
+} StandardChunkHeader;
+
+#define STANDARDCHUNKHEADERSIZE MAXALIGN(sizeof(StandardChunkHeader))
+
+/*
+ * Memory-context-type-independent functions in mcxt.c
+ */
+extern void MemoryContextInit(void);
+extern void MemoryContextReset(MemoryContext context);
+extern void MemoryContextDelete(MemoryContext context);
+extern void MemoryContextResetChildren(MemoryContext context);
+extern void MemoryContextDeleteChildren(MemoryContext context);
+extern void MemoryContextResetAndDeleteChildren(MemoryContext context);
+extern Size GetMemoryChunkSpace(void *pointer);
+extern MemoryContext GetMemoryChunkContext(void *pointer);
+extern bool MemoryContextIsEmpty(MemoryContext context);
+extern void MemoryContextStats(MemoryContext context);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+extern void MemoryContextCheck(MemoryContext context);
+#endif
+extern bool MemoryContextContains(MemoryContext context, void *pointer);
+
+/*
+ * This routine handles the context-type-independent part of memory
+ * context creation. It's intended to be called from context-type-
+ * specific creation routines, and noplace else.
+ */
+extern MemoryContext MemoryContextCreate(Size size,
+ MemoryContextMethods *methods,
+ MemoryContext parent,
+ const char *name);
+
+
+/*
+ * Memory-context-type-specific functions
+ */
+
+/* aset.c */
+extern MemoryContext AllocSetContextCreate(MemoryContext parent,
+ const char *name,
+ Size minContextSize,
+ Size initBlockSize,
+ Size maxBlockSize,
+ bool isShared);
+
+/*
+ * Recommended default alloc parameters, suitable for "ordinary" contexts
+ * that might hold quite a lot of data.
+ */
+#define ALLOCSET_DEFAULT_MINSIZE 0
+#define ALLOCSET_DEFAULT_INITSIZE (8 * 1024)
+#define ALLOCSET_DEFAULT_MAXSIZE (8 * 1024 * 1024)
+
+/*
+ * Recommended alloc parameters for "small" contexts that are not expected
+ * to contain much data (for example, a context to contain a query plan).
+ */
+#define ALLOCSET_SMALL_MINSIZE 0
+#define ALLOCSET_SMALL_INITSIZE (1 * 1024)
+#define ALLOCSET_SMALL_MAXSIZE (8 * 1024)
+
+#endif /* MEMUTILS_H */
diff --git a/src/include/gtm/palloc.h b/src/include/gtm/palloc.h
new file mode 100644
index 0000000000..380e280694
--- /dev/null
+++ b/src/include/gtm/palloc.h
@@ -0,0 +1,90 @@
+/*-------------------------------------------------------------------------
+ *
+ * palloc.h
+ * POSTGRES memory allocator definitions.
+ *
+ * This file contains the basic memory allocation interface that is
+ * needed by almost every backend module. It is included directly by
+ * postgres.h, so the definitions here are automatically available
+ * everywhere. Keep it lean!
+ *
+ * Memory allocation occurs within "contexts". Every chunk obtained from
+ * palloc()/MemoryContextAlloc() is allocated within a specific context.
+ * The entire contents of a context can be freed easily and quickly by
+ * resetting or deleting the context --- this is both faster and less
+ * prone to memory-leakage bugs than releasing chunks individually.
+ * We organize contexts into context trees to allow fine-grain control
+ * over chunk lifetime while preserving the certainty that we will free
+ * everything that should be freed. See utils/mmgr/README for more info.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/utils/palloc.h,v 1.40 2008/06/28 16:45:22 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PALLOC_H
+#define PALLOC_H
+
+/*
+ * Type MemoryContextData is declared in gtm/memnodes.h. Most users
+ * of memory allocation should just treat it as an abstract type, so we
+ * do not provide the struct contents here.
+ */
+typedef struct MemoryContextData *MemoryContext;
+
+/*
+ * Fundamental memory-allocation operations (more are in gtm/memutils.h)
+ */
+extern void *MemoryContextAlloc(MemoryContext context, Size size);
+extern void *MemoryContextAllocZero(MemoryContext context, Size size);
+extern void *MemoryContextAllocZeroAligned(MemoryContext context, Size size);
+
+#define palloc(sz) MemoryContextAlloc(CurrentMemoryContext, (sz))
+
+#define palloc0(sz) MemoryContextAllocZero(CurrentMemoryContext, (sz))
+
+/*
+ * The result of palloc() is always word-aligned, so we can skip testing
+ * alignment of the pointer when deciding which MemSet variant to use.
+ * Note that this variant does not offer any advantage, and should not be
+ * used, unless its "sz" argument is a compile-time constant; therefore, the
+ * issue that it evaluates the argument multiple times isn't a problem in
+ * practice.
+ */
+#define palloc0fast(sz) \
+ ( MemSetTest(0, sz) ? \
+ MemoryContextAllocZeroAligned(CurrentMemoryContext, sz) : \
+ MemoryContextAllocZero(CurrentMemoryContext, sz) )
+
+extern void pfree(void *pointer);
+
+extern void *repalloc(void *pointer, Size size);
+
+/*
+ * MemoryContextSwitchTo can't be a macro in standard C compilers, so this
+ * header declares it as an ordinary external function.
+ */
+
+extern MemoryContext MemoryContextSwitchTo(MemoryContext context);
+
+/*
+ * These are like standard strdup() except the copied string is
+ * allocated in a context, not with malloc().
+ */
+extern char *MemoryContextStrdup(MemoryContext context, const char *string);
+
+#define pstrdup(str) MemoryContextStrdup(CurrentMemoryContext, (str))
+
+extern char *pnstrdup(const char *in, Size len);
+
+#if defined(WIN32) || defined(__CYGWIN__)
+extern void *pgport_palloc(Size sz);
+extern char *pgport_pstrdup(const char *str);
+extern void pgport_pfree(void *pointer);
+#endif
+
+#endif /* PALLOC_H */
diff --git a/src/include/gtm/path.h b/src/include/gtm/path.h
new file mode 100644
index 0000000000..624fd183c9
--- /dev/null
+++ b/src/include/gtm/path.h
@@ -0,0 +1,16 @@
+/*-------------------------------------------------------------------------
+ *
+ * path.h
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "gtm/gtm_c.h"
+
+extern void canonicalize_path(char *path);
diff --git a/src/include/gtm/pqcomm.h b/src/include/gtm/pqcomm.h
new file mode 100644
index 0000000000..cdae6ca284
--- /dev/null
+++ b/src/include/gtm/pqcomm.h
@@ -0,0 +1,57 @@
+/*-------------------------------------------------------------------------
+ *
+ * pqcomm.h
+ * Definitions common to frontends and backends.
+ *
+ * NOTE: for historical reasons, this does not correspond to pqcomm.c.
+ * pqcomm.c's routines are declared in libpq.h.
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/libpq/pqcomm.h,v 1.109 2008/10/28 12:10:44 mha Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PQCOMM_H
+#define PQCOMM_H
+
+#include <sys/socket.h>
+#include <netdb.h>
+#ifdef HAVE_SYS_UN_H
+#include <sys/un.h>
+#endif
+#include <netinet/in.h>
+
+typedef struct
+{
+ struct sockaddr_storage addr;
+ size_t salen;
+} SockAddr;
+
+/* Configure the UNIX socket location for the well known port. */
+
+#define UNIXSOCK_PATH(path, port, sockdir) \
+ snprintf(path, sizeof(path), "%s/.s.PGSQL.%d", \
+ ((sockdir) && *(sockdir) != '\0') ? (sockdir) : \
+ DEFAULT_PGSOCKET_DIR, \
+ (port))
+
+/*
+ * Packet lengths are 4 bytes in network byte order.
+ *
+ * The initial length is omitted from the packet layouts appearing below.
+ */
+
+typedef uint32 PacketLen;
+
+/*
+ * In protocol 3.0 and later, the startup packet length is not fixed, but
+ * we set an arbitrary limit on it anyway. This is just to prevent simple
+ * denial-of-service attacks via sending enough data to run the server
+ * out of memory.
+ */
+#define MAX_STARTUP_PACKET_LENGTH 10000
+
+#endif /* PQCOMM_H */
diff --git a/src/include/gtm/pqexpbuffer.h b/src/include/gtm/pqexpbuffer.h
new file mode 100644
index 0000000000..7ae0411423
--- /dev/null
+++ b/src/include/gtm/pqexpbuffer.h
@@ -0,0 +1,181 @@
+/*-------------------------------------------------------------------------
+ *
+ * pqexpbuffer.h
+ * Declarations/definitions for "PQExpBuffer" functions.
+ *
+ * PQExpBuffer provides an indefinitely-extensible string data type.
+ * It can be used to buffer either ordinary C strings (null-terminated text)
+ * or arbitrary binary data. All storage is allocated with malloc().
+ *
+ * This module is essentially the same as the backend's StringInfo data type,
+ * but it is intended for use in frontend libpq and client applications.
+ * Thus, it does not rely on palloc() nor elog().
+ *
+ * It does rely on vsnprintf(); if configure finds that libc doesn't provide
+ * a usable vsnprintf(), then a copy of our own implementation of it will
+ * be linked into libpq.
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/interfaces/libpq/pqexpbuffer.h,v 1.21 2008/11/26 16:23:11 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PQEXPBUFFER_H
+#define PQEXPBUFFER_H
+
+/*-------------------------
+ * PQExpBufferData holds information about an extensible string.
+ * data is the current buffer for the string (allocated with malloc).
+ * len is the current string length. There is guaranteed to be
+ * a terminating '\0' at data[len], although this is not very
+ * useful when the string holds binary data rather than text.
+ * maxlen is the allocated size in bytes of 'data', i.e. the maximum
+ * string size (including the terminating '\0' char) that we can
+ * currently store in 'data' without having to reallocate
+ * more space. We must always have maxlen > len.
+ *
+ * An exception occurs if we failed to allocate enough memory for the string
+ * buffer. In that case data points to a statically allocated empty string,
+ * and len = maxlen = 0.
+ *-------------------------
+ */
+typedef struct PQExpBufferData
+{
+ char *data;
+ size_t len;
+ size_t maxlen;
+} PQExpBufferData;
+
+typedef PQExpBufferData *PQExpBuffer;
+
+/*------------------------
+ * Test for a broken (out of memory) PQExpBuffer.
+ * When a buffer is "broken", all operations except resetting or deleting it
+ * are no-ops.
+ *------------------------
+ */
+#define PQExpBufferBroken(str) \
+ ((str) == NULL || (str)->maxlen == 0)
+
+/*------------------------
+ * Initial size of the data buffer in a PQExpBuffer.
+ * NB: this must be large enough to hold error messages that might
+ * be returned by PQrequestCancel().
+ *------------------------
+ */
+#define INITIAL_EXPBUFFER_SIZE 256
+
+/*------------------------
+ * There are two ways to create a PQExpBuffer object initially:
+ *
+ * PQExpBuffer stringptr = createGTMPQExpBuffer();
+ * Both the PQExpBufferData and the data buffer are malloc'd.
+ *
+ * PQExpBufferData string;
+ * initGTMPQExpBuffer(&string);
+ * The data buffer is malloc'd but the PQExpBufferData is presupplied.
+ * This is appropriate if the PQExpBufferData is a field of another
+ * struct.
+ *-------------------------
+ */
+
+/*------------------------
+ * createGTMPQExpBuffer
+ * Create an empty 'PQExpBufferData' & return a pointer to it.
+ */
+extern PQExpBuffer createGTMPQExpBuffer(void);
+
+/*------------------------
+ * initGTMPQExpBuffer
+ * Initialize a PQExpBufferData struct (with previously undefined contents)
+ * to describe an empty string.
+ */
+extern void initGTMPQExpBuffer(PQExpBuffer str);
+
+/*------------------------
+ * To destroy a PQExpBuffer, use either:
+ *
+ * destroyGTMPQExpBuffer(str);
+ * free()s both the data buffer and the PQExpBufferData.
+ * This is the inverse of createGTMPQExpBuffer().
+ *
+ * termGTMPQExpBuffer(str)
+ * free()s the data buffer but not the PQExpBufferData itself.
+ * This is the inverse of initGTMPQExpBuffer().
+ *
+ * NOTE: some routines build up a string using PQExpBuffer, and then
+ * release the PQExpBufferData but return the data string itself to their
+ * caller. At that point the data string looks like a plain malloc'd
+ * string.
+ */
+extern void destroyGTMPQExpBuffer(PQExpBuffer str);
+extern void termGTMPQExpBuffer(PQExpBuffer str);
+
+/*------------------------
+ * resetGTMPQExpBuffer
+ * Reset a PQExpBuffer to empty
+ *
+ * Note: if possible, a "broken" PQExpBuffer is returned to normal.
+ */
+extern void resetGTMPQExpBuffer(PQExpBuffer str);
+
+/*------------------------
+ * enlargeGTMPQExpBuffer
+ * Make sure there is enough space for 'needed' more bytes in the buffer
+ * ('needed' does not include the terminating null).
+ *
+ * Returns 1 if OK, 0 if failed to enlarge buffer. (In the latter case
+ * the buffer is left in "broken" state.)
+ */
+extern int enlargeGTMPQExpBuffer(PQExpBuffer str, size_t needed);
+
+/*------------------------
+ * printfGTMPQExpBuffer
+ * Format text data under the control of fmt (an sprintf-like format string)
+ * and insert it into str. More space is allocated to str if necessary.
+ * This is a convenience routine that does the same thing as
+ * resetGTMPQExpBuffer() followed by appendGTMPQExpBuffer().
+ */
+extern void
+printfGTMPQExpBuffer(PQExpBuffer str, const char *fmt,...)
+/* This extension allows gcc to check the format string */
+__attribute__((format(printf, 2, 3)));
+
+/*------------------------
+ * appendGTMPQExpBuffer
+ * Format text data under the control of fmt (an sprintf-like format string)
+ * and append it to whatever is already in str. More space is allocated
+ * to str if necessary. This is sort of like a combination of sprintf and
+ * strcat.
+ */
+extern void
+appendGTMPQExpBuffer(PQExpBuffer str, const char *fmt,...)
+/* This extension allows gcc to check the format string */
+__attribute__((format(printf, 2, 3)));
+
+/*------------------------
+ * appendGTMPQExpBufferStr
+ * Append the given string to a PQExpBuffer, allocating more space
+ * if necessary.
+ */
+extern void appendGTMPQExpBufferStr(PQExpBuffer str, const char *data);
+
+/*------------------------
+ * appendGTMPQExpBufferChar
+ * Append a single byte to str.
+ * Like appendGTMPQExpBuffer(str, "%c", ch) but much faster.
+ */
+extern void appendGTMPQExpBufferChar(PQExpBuffer str, char ch);
+
+/*------------------------
+ * appendBinaryGTMPQExpBuffer
+ * Append arbitrary binary data to a PQExpBuffer, allocating more space
+ * if necessary.
+ */
+extern void appendBinaryGTMPQExpBuffer(PQExpBuffer str,
+ const char *data, size_t datalen);
+
+#endif /* PQEXPBUFFER_H */
diff --git a/src/include/gtm/pqformat.h b/src/include/gtm/pqformat.h
new file mode 100644
index 0000000000..3febf2cf2e
--- /dev/null
+++ b/src/include/gtm/pqformat.h
@@ -0,0 +1,48 @@
+/*-------------------------------------------------------------------------
+ *
+ * pqformat.h
+ * Definitions for formatting and parsing frontend/backend messages
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/libpq/pqformat.h,v 1.27 2009/01/01 17:23:59 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PQFORMAT_H
+#define PQFORMAT_H
+
+#include "gtm/stringinfo.h"
+
+extern void pq_beginmessage(StringInfo buf, char msgtype);
+extern void pq_sendbyte(StringInfo buf, int byt);
+extern void pq_sendbytes(StringInfo buf, const char *data, int datalen);
+extern void pq_sendcountedtext(StringInfo buf, const char *str, int slen,
+ bool countincludesself);
+extern void pq_sendtext(StringInfo buf, const char *str, int slen);
+extern void pq_sendstring(StringInfo buf, const char *str);
+extern void pq_send_ascii_string(StringInfo buf, const char *str);
+extern void pq_sendint(StringInfo buf, int i, int b);
+extern void pq_sendint64(StringInfo buf, int64 i);
+extern void pq_sendfloat4(StringInfo buf, float4 f);
+extern void pq_sendfloat8(StringInfo buf, float8 f);
+extern void pq_endmessage(Port *myport, StringInfo buf);
+
+extern void pq_puttextmessage(Port *myport, char msgtype, const char *str);
+extern void pq_putemptymessage(Port *myport, char msgtype);
+
+extern int pq_getmsgbyte(StringInfo msg);
+extern unsigned int pq_getmsgint(StringInfo msg, int b);
+extern int64 pq_getmsgint64(StringInfo msg);
+extern float4 pq_getmsgfloat4(StringInfo msg);
+extern float8 pq_getmsgfloat8(StringInfo msg);
+extern const char *pq_getmsgbytes(StringInfo msg, int datalen);
+extern void pq_copymsgbytes(StringInfo msg, char *buf, int datalen);
+extern char *pq_getmsgtext(StringInfo msg, int rawbytes, int *nbytes);
+extern const char *pq_getmsgstring(StringInfo msg);
+extern void pq_getmsgend(StringInfo msg);
+extern int pq_getmsgunreadlen(StringInfo msg);
+
+#endif /* PQFORMAT_H */
diff --git a/src/include/gtm/pqsignal.h b/src/include/gtm/pqsignal.h
new file mode 100644
index 0000000000..e3a53dc3ed
--- /dev/null
+++ b/src/include/gtm/pqsignal.h
@@ -0,0 +1,49 @@
+/*-------------------------------------------------------------------------
+ *
+ * pqsignal.h
+ * prototypes for the reliable BSD-style signal(2) routine.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/libpq/pqsignal.h,v 1.32 2008/01/01 19:45:58 momjian Exp $
+ *
+ * NOTES
+ * This shouldn't be in libpq, but the monitor and some other
+ * things need it...
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PQSIGNAL_H
+#define PQSIGNAL_H
+
+#include <signal.h>
+
+#ifdef HAVE_SIGPROCMASK
+extern sigset_t UnBlockSig,
+ BlockSig,
+ AuthBlockSig;
+
+#define PG_SETMASK(mask) sigprocmask(SIG_SETMASK, mask, NULL)
+#else
+extern int UnBlockSig,
+ BlockSig,
+ AuthBlockSig;
+
+#ifndef WIN32
+#define PG_SETMASK(mask) sigsetmask(*((int*)(mask)))
+#else
+#define PG_SETMASK(mask) pqsigsetmask(*((int*)(mask)))
+int pqsigsetmask(int mask);
+#endif
+#endif
+
+typedef void (*pqsigfunc) (int);
+
+extern void pqinitmask(void);
+
+extern pqsigfunc pqsignal(int signo, pqsigfunc func);
+
+#endif /* PQSIGNAL_H */
diff --git a/src/include/gtm/stringinfo.h b/src/include/gtm/stringinfo.h
new file mode 100644
index 0000000000..197aa877a1
--- /dev/null
+++ b/src/include/gtm/stringinfo.h
@@ -0,0 +1,149 @@
+/*-------------------------------------------------------------------------
+ *
+ * stringinfo.h
+ * Declarations/definitions for "StringInfo" functions.
+ *
+ * StringInfo provides an indefinitely-extensible string data type.
+ * It can be used to buffer either ordinary C strings (null-terminated text)
+ * or arbitrary binary data. All storage is allocated with palloc().
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/lib/stringinfo.h,v 1.35 2008/01/01 19:45:57 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef STRINGINFO_H
+#define STRINGINFO_H
+
+/*-------------------------
+ * StringInfoData holds information about an extensible string.
+ * data is the current buffer for the string (allocated with palloc).
+ * len is the current string length. There is guaranteed to be
+ * a terminating '\0' at data[len], although this is not very
+ * useful when the string holds binary data rather than text.
+ * maxlen is the allocated size in bytes of 'data', i.e. the maximum
+ * string size (including the terminating '\0' char) that we can
+ * currently store in 'data' without having to reallocate
+ * more space. We must always have maxlen > len.
+ * cursor is initialized to zero by makeStringInfo or initStringInfo,
+ * but is not otherwise touched by the stringinfo.c routines.
+ * Some routines use it to scan through a StringInfo.
+ *-------------------------
+ */
+typedef struct StringInfoData
+{
+ char *data;
+ int len;
+ int maxlen;
+ int cursor;
+} StringInfoData;
+
+typedef StringInfoData *StringInfo;
+
+
+/*------------------------
+ * There are two ways to create a StringInfo object initially:
+ *
+ * StringInfo stringptr = makeStringInfo();
+ * Both the StringInfoData and the data buffer are palloc'd.
+ *
+ * StringInfoData string;
+ * initStringInfo(&string);
+ * The data buffer is palloc'd but the StringInfoData is just local.
+ * This is the easiest approach for a StringInfo object that will
+ * only live as long as the current routine.
+ *
+ * To destroy a StringInfo, pfree() the data buffer, and then pfree() the
+ * StringInfoData if it was palloc'd. There's no special support for this.
+ *
+ * NOTE: some routines build up a string using StringInfo, and then
+ * release the StringInfoData but return the data string itself to their
+ * caller. At that point the data string looks like a plain palloc'd
+ * string.
+ *-------------------------
+ */
+
+/*------------------------
+ * makeStringInfo
+ * Create an empty 'StringInfoData' & return a pointer to it.
+ */
+extern StringInfo makeStringInfo(void);
+
+/*------------------------
+ * initStringInfo
+ * Initialize a StringInfoData struct (with previously undefined contents)
+ * to describe an empty string.
+ */
+extern void initStringInfo(StringInfo str);
+
+/*------------------------
+ * resetStringInfo
+ * Clears the current content of the StringInfo, if any. The
+ * StringInfo remains valid.
+ */
+extern void resetStringInfo(StringInfo str);
+
+/*------------------------
+ * appendStringInfo
+ * Format text data under the control of fmt (an sprintf-style format string)
+ * and append it to whatever is already in str. More space is allocated
+ * to str if necessary. This is sort of like a combination of sprintf and
+ * strcat.
+ */
+extern void
+appendStringInfo(StringInfo str, const char *fmt,...)
+/* This extension allows gcc to check the format string */
+__attribute__((format(printf, 2, 3)));
+
+/*------------------------
+ * appendStringInfoVA
+ * Attempt to format text data under the control of fmt (an sprintf-style
+ * format string) and append it to whatever is already in str. If successful
+ * return true; if not (because there's not enough space), return false
+ * without modifying str. Typically the caller would enlarge str and retry
+ * on false return --- see appendStringInfo for standard usage pattern.
+ */
+extern bool appendStringInfoVA(StringInfo str, const char *fmt, va_list args);
+
+/*------------------------
+ * appendStringInfoString
+ * Append a null-terminated string to str.
+ * Like appendStringInfo(str, "%s", s) but faster.
+ */
+extern void appendStringInfoString(StringInfo str, const char *s);
+
+/*------------------------
+ * appendStringInfoChar
+ * Append a single byte to str.
+ * Like appendStringInfo(str, "%c", ch) but much faster.
+ */
+extern void appendStringInfoChar(StringInfo str, char ch);
+
+/*------------------------
+ * appendStringInfoCharMacro
+ * As above, but a macro for even more speed where it matters.
+ * Caution: str argument will be evaluated multiple times.
+ */
+#define appendStringInfoCharMacro(str,ch) \
+ (((str)->len + 1 >= (str)->maxlen) ? \
+ appendStringInfoChar(str, ch) : \
+ (void)((str)->data[(str)->len] = (ch), (str)->data[++(str)->len] = '\0'))
+
+/*------------------------
+ * appendBinaryStringInfo
+ * Append arbitrary binary data to a StringInfo, allocating more space
+ * if necessary.
+ */
+extern void appendBinaryStringInfo(StringInfo str,
+ const char *data, int datalen);
+
+/*------------------------
+ * enlargeStringInfo
+ * Make sure a StringInfo's buffer can hold at least 'needed' more bytes.
+ */
+extern void enlargeStringInfo(StringInfo str, int needed);
+
+#endif /* STRINGINFO_H */
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index f255c44d1c..078b6733e7 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -6,6 +6,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* $PostgreSQL: pgsql/src/include/nodes/nodes.h,v 1.223 2009/06/11 14:49:11 momjian Exp $
*
@@ -157,6 +158,9 @@ typedef enum NodeTag
T_JoinExpr,
T_FromExpr,
T_IntoClause,
+#ifdef PGXC
+ T_DistributeBy,
+#endif
/*
* TAGS FOR EXPRESSION STATE NODES (execnodes.h)
@@ -337,6 +341,7 @@ typedef enum NodeTag
T_CreateUserMappingStmt,
T_AlterUserMappingStmt,
T_DropUserMappingStmt,
+ T_ExecDirectStmt,
/*
* TAGS FOR PARSE TREE NODES (parsenodes.h)
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index 7793f66f20..e0515ba95d 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -12,6 +12,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* $PostgreSQL: pgsql/src/include/nodes/parsenodes.h,v 1.395 2009/06/18 01:27:02 tgl Exp $
*
@@ -1335,6 +1336,9 @@ typedef struct CreateStmt
List *options; /* options from WITH clause */
OnCommitAction oncommit; /* what do we do at COMMIT? */
char *tablespacename; /* table space to use, or NULL */
+#ifdef PGXC
+ DistributeBy *distributeby; /* distribution to use, or NULL */
+#endif
} CreateStmt;
/* ----------
@@ -2389,4 +2393,17 @@ typedef struct AlterTSConfigurationStmt
bool missing_ok; /* for DROP - skip error if missing? */
} AlterTSConfigurationStmt;
+/* PGXC_BEGIN */
+/*
+ * EXECUTE DIRECT statement
+ */
+typedef struct ExecDirectStmt
+{
+ NodeTag type;
+ bool coordinator; /* presumably true when the coordinator is the target -- TODO confirm */
+ List *nodes; /* presumably the list of target nodes -- TODO confirm */
+ char *query; /* presumably the statement text to run directly -- TODO confirm */
+} ExecDirectStmt;
+/* PGXC_END */
+
#endif /* PARSENODES_H */
diff --git a/src/include/nodes/primnodes.h b/src/include/nodes/primnodes.h
index a41b0e2f7d..36c5e6e633 100644
--- a/src/include/nodes/primnodes.h
+++ b/src/include/nodes/primnodes.h
@@ -9,6 +9,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* $PostgreSQL: pgsql/src/include/nodes/primnodes.h,v 1.149 2009/06/11 14:49:11 momjian Exp $
*
@@ -1174,4 +1175,30 @@ typedef struct FromExpr
Node *quals; /* qualifiers on join, if any */
} FromExpr;
+#ifdef PGXC
+/*----------
+ * DistributionType - how to distribute the data
+ *
+ *----------
+ */
+typedef enum DistributionType
+{
+ DISTTYPE_REPLICATION, /* Replicated */
+ DISTTYPE_HASH, /* Hash partitioned */
+ DISTTYPE_ROUNDROBIN /* Round Robin */
+} DistributionType;
+
+/*----------
+ * DistributeBy - represents a DISTRIBUTE BY clause in a CREATE TABLE statement
+ *
+ *----------
+ */
+typedef struct DistributeBy
+{
+ NodeTag type;
+ DistributionType disttype; /* Distribution type */
+ char *colname; /* Distribution column name */
+} DistributeBy;
+#endif
+
#endif /* PRIMNODES_H */
diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h
index 23f5d87a7a..aec7b6b3d9 100644
--- a/src/include/parser/kwlist.h
+++ b/src/include/parser/kwlist.h
@@ -9,6 +9,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/include/parser/kwlist.h,v 1.2 2009/04/06 08:42:53 heikki Exp $
@@ -90,6 +91,7 @@ PG_KEYWORD("constraints", CONSTRAINTS, UNRESERVED_KEYWORD)
PG_KEYWORD("content", CONTENT_P, UNRESERVED_KEYWORD)
PG_KEYWORD("continue", CONTINUE_P, UNRESERVED_KEYWORD)
PG_KEYWORD("conversion", CONVERSION_P, UNRESERVED_KEYWORD)
+PG_KEYWORD("coordinator", COORDINATOR, UNRESERVED_KEYWORD)
PG_KEYWORD("copy", COPY, UNRESERVED_KEYWORD)
PG_KEYWORD("cost", COST, UNRESERVED_KEYWORD)
PG_KEYWORD("create", CREATE, RESERVED_KEYWORD)
@@ -125,9 +127,13 @@ PG_KEYWORD("delimiter", DELIMITER, UNRESERVED_KEYWORD)
PG_KEYWORD("delimiters", DELIMITERS, UNRESERVED_KEYWORD)
PG_KEYWORD("desc", DESC, RESERVED_KEYWORD)
PG_KEYWORD("dictionary", DICTIONARY, UNRESERVED_KEYWORD)
+PG_KEYWORD("direct", DIRECT, UNRESERVED_KEYWORD)
PG_KEYWORD("disable", DISABLE_P, UNRESERVED_KEYWORD)
PG_KEYWORD("discard", DISCARD, UNRESERVED_KEYWORD)
PG_KEYWORD("distinct", DISTINCT, RESERVED_KEYWORD)
+#ifdef PGXC
+PG_KEYWORD("distribute", DISTRIBUTE, UNRESERVED_KEYWORD)
+#endif
PG_KEYWORD("do", DO, RESERVED_KEYWORD)
PG_KEYWORD("document", DOCUMENT_P, UNRESERVED_KEYWORD)
PG_KEYWORD("domain", DOMAIN_P, UNRESERVED_KEYWORD)
@@ -169,6 +175,9 @@ PG_KEYWORD("granted", GRANTED, UNRESERVED_KEYWORD)
PG_KEYWORD("greatest", GREATEST, COL_NAME_KEYWORD)
PG_KEYWORD("group", GROUP_P, RESERVED_KEYWORD)
PG_KEYWORD("handler", HANDLER, UNRESERVED_KEYWORD)
+#ifdef PGXC
+PG_KEYWORD("hash", HASH, UNRESERVED_KEYWORD)
+#endif
PG_KEYWORD("having", HAVING, RESERVED_KEYWORD)
PG_KEYWORD("header", HEADER_P, UNRESERVED_KEYWORD)
PG_KEYWORD("hold", HOLD, UNRESERVED_KEYWORD)
@@ -243,6 +252,7 @@ PG_KEYWORD("no", NO, UNRESERVED_KEYWORD)
PG_KEYWORD("nocreatedb", NOCREATEDB, UNRESERVED_KEYWORD)
PG_KEYWORD("nocreaterole", NOCREATEROLE, UNRESERVED_KEYWORD)
PG_KEYWORD("nocreateuser", NOCREATEUSER, UNRESERVED_KEYWORD)
+PG_KEYWORD("node", NODE, UNRESERVED_KEYWORD)
PG_KEYWORD("noinherit", NOINHERIT, UNRESERVED_KEYWORD)
PG_KEYWORD("nologin", NOLOGIN_P, UNRESERVED_KEYWORD)
PG_KEYWORD("none", NONE, COL_NAME_KEYWORD)
@@ -308,6 +318,9 @@ PG_KEYWORD("rename", RENAME, UNRESERVED_KEYWORD)
PG_KEYWORD("repeatable", REPEATABLE, UNRESERVED_KEYWORD)
PG_KEYWORD("replace", REPLACE, UNRESERVED_KEYWORD)
PG_KEYWORD("replica", REPLICA, UNRESERVED_KEYWORD)
+#ifdef PGXC
+PG_KEYWORD("replication", REPLICATION, UNRESERVED_KEYWORD)
+#endif
PG_KEYWORD("reset", RESET, UNRESERVED_KEYWORD)
PG_KEYWORD("restart", RESTART, UNRESERVED_KEYWORD)
PG_KEYWORD("restrict", RESTRICT, UNRESERVED_KEYWORD)
@@ -315,8 +328,14 @@ PG_KEYWORD("returning", RETURNING, RESERVED_KEYWORD)
PG_KEYWORD("returns", RETURNS, UNRESERVED_KEYWORD)
PG_KEYWORD("revoke", REVOKE, UNRESERVED_KEYWORD)
PG_KEYWORD("right", RIGHT, TYPE_FUNC_NAME_KEYWORD)
+#ifdef PGXC
+PG_KEYWORD("robin", ROBIN, UNRESERVED_KEYWORD)
+#endif
PG_KEYWORD("role", ROLE, UNRESERVED_KEYWORD)
PG_KEYWORD("rollback", ROLLBACK, UNRESERVED_KEYWORD)
+#ifdef PGXC
+PG_KEYWORD("round", ROUND, UNRESERVED_KEYWORD)
+#endif
PG_KEYWORD("row", ROW, COL_NAME_KEYWORD)
PG_KEYWORD("rows", ROWS, UNRESERVED_KEYWORD)
PG_KEYWORD("rule", RULE, UNRESERVED_KEYWORD)
diff --git a/src/include/parser/parse_utilcmd.h b/src/include/parser/parse_utilcmd.h
index 089c907c0e..319699381d 100644
--- a/src/include/parser/parse_utilcmd.h
+++ b/src/include/parser/parse_utilcmd.h
@@ -6,6 +6,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* $PostgreSQL: pgsql/src/include/parser/parse_utilcmd.h,v 1.4 2009/01/01 17:24:00 momjian Exp $
*
@@ -24,5 +25,8 @@ extern IndexStmt *transformIndexStmt(IndexStmt *stmt, const char *queryString);
extern void transformRuleStmt(RuleStmt *stmt, const char *queryString,
List **actions, Node **whereClause);
extern List *transformCreateSchemaStmt(CreateSchemaStmt *stmt);
+#ifdef PGXC
+extern bool CheckLocalIndexColumn (char loctype, char *partcolname, char *indexcolname);
+#endif
#endif /* PARSE_UTILCMD_H */
diff --git a/src/include/pgxc/combiner.h b/src/include/pgxc/combiner.h
new file mode 100644
index 0000000000..8c02627b57
--- /dev/null
+++ b/src/include/pgxc/combiner.h
@@ -0,0 +1,63 @@
+/*-------------------------------------------------------------------------
+ *
+ * combiner.h
+ *
+ * Combine responses from multiple Data Nodes
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group ?
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef COMBINER_H
+#define COMBINER_H
+
+#include "postgres.h"
+#include "tcop/dest.h"
+
+typedef enum
+{
+ COMBINE_TYPE_NONE, /* no row count is expected in the response; do not parse one */
+ COMBINE_TYPE_SUM, /* sum row counts (partitioned, round robin) */
+ COMBINE_TYPE_AVG /* calculate average (replicated) */
+} CombineType;
+
+typedef enum
+{
+ REQUEST_TYPE_NOT_DEFINED, /* not determined yet */
+ REQUEST_TYPE_COMMAND, /* OK or row count response */
+ REQUEST_TYPE_QUERY, /* Row description response */
+ REQUEST_TYPE_COPY_IN, /* Copy In response */
+ REQUEST_TYPE_COPY_OUT /* Copy Out response */
+} RequestType;
+
+
+typedef struct
+{
+ int node_count; /* cf. node_count argument of CreateResponseCombiner() */
+ CombineType combine_type; /* how row counts are combined; see CombineType */
+ CommandDest dest; /* cf. dest argument of CreateResponseCombiner() */
+ int command_complete_count; /* presumably # of command-complete responses seen -- TODO confirm */
+ int row_count; /* presumably the combined row count so far -- TODO confirm */
+ RequestType request_type; /* kind of response being handled; see RequestType */
+ int description_count; /* presumably # of row descriptions seen -- TODO confirm */
+ List *simple_aggregates; /* cf. AssignCombinerAggregates() */
+} ResponseCombinerData;
+
+
+typedef ResponseCombinerData *ResponseCombiner;
+
+extern ResponseCombiner CreateResponseCombiner(int node_count,
+ CombineType combine_type, CommandDest dest);
+extern int CombineResponse(ResponseCombiner combiner, char msg_type,
+ char *msg_body, size_t len);
+extern bool ValidateAndCloseCombiner(ResponseCombiner combiner);
+extern bool ValidateAndResetCombiner(ResponseCombiner combiner);
+extern void AssignCombinerAggregates(ResponseCombiner combiner, List *simple_aggregates);
+
+#endif /* COMBINER_H */
diff --git a/src/include/pgxc/datanode.h b/src/include/pgxc/datanode.h
new file mode 100644
index 0000000000..e140445a28
--- /dev/null
+++ b/src/include/pgxc/datanode.h
@@ -0,0 +1,76 @@
+/*-------------------------------------------------------------------------
+ *
+ * datanode.h
+ *
+ * Utility functions to communicate to Data Node
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group ?
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef DATANODE_H
+#define DATANODE_H
+#include "combiner.h"
+#include "nodes/pg_list.h"
+#include "utils/snapshot.h"
+#include <unistd.h>
+
+/* Connection to data node maintained by Pool Manager */
+typedef struct PGconn NODE_CONNECTION;
+
+/* Helper structure to access data node from Session */
+typedef enum
+{
+ DN_CONNECTION_STATE_IDLE,
+ DN_CONNECTION_STATE_BUSY,
+ DN_CONNECTION_STATE_COMPLETED,
+ DN_CONNECTION_STATE_ERROR
+
+} DNConnectionState;
+
+struct data_node_handle
+{
+ /* fd of the connection */
+ int sock;
+ /* Connection state */
+ char transaction_status;
+ DNConnectionState state;
+ char *error;
+ /* Output buffer */
+ char *outBuffer;
+ size_t outSize;
+ size_t outEnd;
+ /* Input buffer */
+ char *inBuffer;
+ size_t inSize;
+ size_t inStart;
+ size_t inEnd;
+ size_t inCursor;
+};
+typedef struct data_node_handle DataNodeHandle;
+
+extern void InitMultinodeExecutor(void);
+
+/* Open/close connection routines (invoked from Pool Manager) */
+extern char *DataNodeConnStr(char *host, char *port, char *dbname, char *user,
+ char *password);
+extern NODE_CONNECTION *DataNodeConnect(char *connstr);
+extern void DataNodeClose(NODE_CONNECTION * conn);
+extern int DataNodeConnected(NODE_CONNECTION * conn);
+extern int DataNodeConnClean(NODE_CONNECTION * conn);
+extern void DataNodeCleanAndRelease(int code, Datum arg);
+
+/* Multinode Executor */
+extern void DataNodeBegin(void);
+extern int DataNodeCommit(CommandDest dest);
+extern int DataNodeRollback(CommandDest dest);
+
+extern int DataNodeExec(const char *query, List *nodelist, CommandDest dest, Snapshot snapshot, bool force_autocommit, List *simple_aggregates, bool is_read_only);
+
+#endif
diff --git a/src/include/pgxc/locator.h b/src/include/pgxc/locator.h
new file mode 100644
index 0000000000..1320b3c6f6
--- /dev/null
+++ b/src/include/pgxc/locator.h
@@ -0,0 +1,66 @@
+/*-------------------------------------------------------------------------
+ *
+ * locator.h
+ * Externally declared locator functions
+ *
+ *
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LOCATOR_H
+#define LOCATOR_H
+
+#define LOCATOR_TYPE_REPLICATED 'R'
+#define LOCATOR_TYPE_HASH 'H'
+#define LOCATOR_TYPE_RANGE 'G'
+#define LOCATOR_TYPE_SINGLE 'S'
+#define LOCATOR_TYPE_RROBIN 'N'
+#define LOCATOR_TYPE_CUSTOM 'C'
+
+#define HASH_SIZE 4096
+#define HASH_MASK 0x00000FFF /* == HASH_SIZE - 1; no trailing ';' so it is usable inside expressions */
+
+#include "utils/relcache.h"
+
+
+typedef int PartAttrNumber;
+
+typedef struct
+{
+ Oid relid; /* presumably the relation this entry describes -- TODO confirm */
+ char locatorType; /* presumably one of the LOCATOR_TYPE_* codes -- TODO confirm */
+ PartAttrNumber partAttrNum; /* if partitioned */
+ char *partAttrName; /* if partitioned */
+ int nodeCount; /* presumably the number of entries in nodeList -- TODO confirm */
+ List *nodeList;
+ ListCell *roundRobinNode; /* points to next one to use */
+} RelationLocInfo;
+
+
+extern char *PreferredDataNodes;
+
+extern void InitRelationLocInfo(void); /* (void): a real prototype, so stray arguments are diagnosed */
+extern char GetLocatorType(Oid relid);
+extern char ConvertToLocatorType(int disttype);
+
+extern char *GetRelationHashColumn(RelationLocInfo * rel_loc_info);
+extern RelationLocInfo *GetRelationLocInfo(Oid relid);
+extern RelationLocInfo *CopyRelationLocInfo(RelationLocInfo * src_info);
+extern List *GetRelationNodes(RelationLocInfo * rel_loc_info, long *partValue,
+ int isRead);
+extern bool IsHashColumn(RelationLocInfo * rel_loc_info, char *part_col_name);
+extern bool IsHashColumnForRelId(Oid relid, char *part_col_name);
+extern int GetRoundRobinNode(Oid relid);
+
+extern bool IsHashDistributable(Oid col_type);
+extern List *GetAllNodes(void);
+extern int GetAnyDataNode(void);
+extern void RelationBuildLocator(Relation rel);
+extern void FreeRelationLocInfo(RelationLocInfo * relationLocInfo);
+
+#endif /* LOCATOR_H */
diff --git a/src/include/pgxc/pgxc.h b/src/include/pgxc/pgxc.h
new file mode 100644
index 0000000000..09ff2c0ada
--- /dev/null
+++ b/src/include/pgxc/pgxc.h
@@ -0,0 +1,23 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgxc.h
+ * PG-XC
+ *
+ *
+ * Portions Copyright (c) 1996-2010 PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifdef PGXC
+
+extern bool isPGXCCoordinator;
+extern bool isPGXCDataNode;
+
+#define IS_PGXC_COORDINATOR isPGXCCoordinator
+#define IS_PGXC_DATANODE isPGXCDataNode
+
+#endif /* PGXC */
diff --git a/src/include/pgxc/planner.h b/src/include/pgxc/planner.h
new file mode 100644
index 0000000000..eda25a72bb
--- /dev/null
+++ b/src/include/pgxc/planner.h
@@ -0,0 +1,86 @@
+/*-------------------------------------------------------------------------
+ *
+ * planner.h
+ * Externally declared planner functions
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PGXCPLANNER_H
+#define PGXCPLANNER_H
+
+/* Flags for Query_Plan.exec_loc_type; these may be OR'ed together */
+#define EXEC_ON_COORD 0x1
+#define EXEC_ON_DATA_NODES 0x2
+
+/* Contains instructions on processing a step of a query.
+ * In the prototype this will be simple, but it will eventually
+ * evolve into a GridSQL-style QueryStep.
+ */
+typedef struct
+{
+ char *sql_statement;
+ List *nodelist;
+ List *simple_aggregates; /* simple aggregate to combine on this
+ * step */
+} Query_Step;
+
+
+/*
+ * The PGXC plan to execute.
+ * In the prototype this will be simple, and query_step_list will
+ * contain just one step.
+ */
+typedef struct
+{
+ int exec_loc_type; /* EXEC_ON_COORD and/or EXEC_ON_DATA_NODES, OR'ed */
+ bool force_autocommit; /* For CREATE DATABASE */
+ List *query_step_list; /* List of Query_Step */
+} Query_Plan;
+
+
+/* For handling simple aggregates (no group by present)
+ * For now, only MAX will be supported.
+ */
+typedef enum
+{
+ AGG_TYPE_MAX,
+ AGG_TYPE_MIN,
+ AGG_TYPE_COUNT,
+ AGG_TYPE_SUM,
+ AGG_TYPE_AVG
+} SimpleAggType;
+
+
+/* For handling simple aggregates */
+/* For now, only support int/long types */
+typedef struct
+{
+ int agg_type; /* SimpleAggType enum */
+ int column_pos; /* Only use 1 for now */
+ unsigned long ulong_value;
+ /* Datum agg_value; PGXCTODO - use Datum, support more types */
+ int data_len;
+ int agg_data_type;
+ int response_count;
+} SimpleAgg;
+
+/* forbid SQL if unsafe, useful to turn off for development */
+extern bool StrictStatementChecking;
+
+/* forbid SELECT even multi-node ORDER BY */
+extern bool StrictSelectChecking;
+
+extern Query_Plan *
+ GetQueryPlan(Node *parsetree, const char *sql_statement, List *querytree_list);
+extern void
+ FreeQueryPlan(Query_Plan * query_plan);
+extern bool IsHashDistributable(Oid col_type);
+
+#endif /* PGXCPLANNER_H */
diff --git a/src/include/pgxc/poolcomm.h b/src/include/pgxc/poolcomm.h
new file mode 100644
index 0000000000..3c62f0662e
--- /dev/null
+++ b/src/include/pgxc/poolcomm.h
@@ -0,0 +1,49 @@
+/*-------------------------------------------------------------------------
+ *
+ * poolcomm.h
+ *
+ * Definitions for the Pooler-Session communications.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef POOLCOMM_H
+#define POOLCOMM_H
+
+#include "lib/stringinfo.h"
+
+#define POOL_BUFFER_SIZE 1024
+#define Socket(port) (port).fdsock
+
+typedef struct
+{
+ /* file descriptors */
+ int fdsock;
+ /* receive buffer */
+ int RecvLength;
+ int RecvPointer;
+ char RecvBuffer[POOL_BUFFER_SIZE];
+ /* send buffer */
+ int SendPointer;
+ char SendBuffer[POOL_BUFFER_SIZE];
+} PoolPort;
+
+extern int pool_listen(unsigned short port, const char *unixSocketName);
+extern int pool_connect(unsigned short port, const char *unixSocketName);
+extern int pool_getbyte(PoolPort * port);
+extern int pool_pollbyte(PoolPort * port);
+extern int pool_getmessage(PoolPort * port, StringInfo s, int maxlen);
+extern int pool_getbytes(PoolPort * port, char *s, size_t len);
+extern int pool_putmessage(PoolPort * port, char msgtype, const char *s, size_t len);
+extern int pool_putbytes(PoolPort * port, const char *s, size_t len);
+extern int pool_flush(PoolPort * port);
+extern int pool_sendfds(PoolPort * port, int *fds, int count);
+extern int pool_recvfds(PoolPort * port, int *fds, int count);
+
+#endif /* POOLCOMM_H */
diff --git a/src/include/pgxc/poolmgr.h b/src/include/pgxc/poolmgr.h
new file mode 100644
index 0000000000..6e88fca3bc
--- /dev/null
+++ b/src/include/pgxc/poolmgr.h
@@ -0,0 +1,130 @@
+/*-------------------------------------------------------------------------
+ *
+ * poolmgr.h
+ *
+ * Definitions for the data nodes connection pool.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef POOLMGR_H
+#define POOLMGR_H
+#include <sys/time.h>
+#include "datanode.h"
+#include "poolcomm.h"
+#include "storage/pmsignal.h"
+
+#define MAX_IDLE_TIME 60
+
+/* TODO move? */
+typedef struct
+{
+ char *host;
+ char *port;
+ char *uname;
+ char *password;
+} DataNodeConnectionInfo;
+
+/* Connection pool entry */
+typedef struct
+{
+ struct timeval released;
+ NODE_CONNECTION *conn;
+} DataNodePoolSlot;
+
+/* Pool of connections to specified data nodes */
+typedef struct
+{
+ char *connstr;
+ int freeSize; /* available connections */
+ int size; /* total pool size */
+ DataNodePoolSlot **slot;
+} DataNodePool;
+
+/* All pools for specified database */
+typedef struct databasepool
+{
+ Oid databaseId;
+ char *database;
+ DataNodePool **nodePools; /* one for each data node */
+ struct databasepool *next;
+} DatabasePool;
+
+/* Agent of client session (Pool Manager side)
+ * Acts as a session manager, grouping connections together
+ */
+typedef struct
+{
+ /* communication channel */
+ PoolPort port;
+ DatabasePool *pool;
+ DataNodePoolSlot **connections; /* one for each data node */
+} PoolAgent;
+
+/* Handle to the pool manager (Session's side) */
+typedef struct
+{
+ /* communication channel */
+ PoolPort port;
+} PoolHandle;
+
+extern int NumDataNodes;
+extern int MinPoolSize;
+extern int MaxPoolSize;
+extern int PoolerPort;
+
+extern bool PersistentConnections;
+
+extern char *DataNodeHosts;
+extern char *DataNodePorts;
+extern char *DataNodeUsers;
+extern char *DataNodePwds;
+
+/* Initialize internal structures */
+extern int PoolManagerInit(void);
+
+/* Destroy internal structures */
+extern int PoolManagerDestroy(void);
+
+/*
+ * Get handle to pool manager. This function should be called just before
+ * forking off new session. It creates PoolHandle, PoolAgent and a pipe between
+ * them. PoolAgent is stored within Postmaster's memory context and Session
+ * closes it later. PoolHandle is returned and should be stored in a local
+ * variable. After forking off it can be stored in global memory, so it will
+ * only be accessible by the process running the session.
+ */
+extern PoolHandle *GetPoolManagerHandle(void);
+
+/*
+ * Called from Postmaster(Coordinator) after fork. Close one end of the pipe and
+ * free memory occupied by PoolHandle
+ */
+extern void PoolManagerCloseHandle(PoolHandle * handle);
+
+/*
+ * Gracefully close connection to the PoolManager
+ */
+extern void PoolManagerDisconnect(PoolHandle * handle);
+
+/*
+ * Called from Session process after fork(). Associate handle with session
+ * for subsequent calls. Associate session with specified database and
+ * initialize respective connection pool
+ */
+extern void PoolManagerConnect(PoolHandle * handle, const char *database, List *nodes);
+
+/* Get pooled connections */
+extern int *PoolManagerGetConnections(List *nodelist);
+
+/* Return connections back to the pool */
+extern void PoolManagerReleaseConnections(void);
+
+#endif
diff --git a/src/include/postgres.h b/src/include/postgres.h
index c1e4f77386..e8bfd5a391 100644
--- a/src/include/postgres.h
+++ b/src/include/postgres.h
@@ -9,6 +9,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1995, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* $PostgreSQL: pgsql/src/include/postgres.h,v 1.92 2009/01/01 17:23:55 momjian Exp $
*
@@ -693,4 +694,7 @@ extern int ExceptionalCondition(const char *conditionName,
const char *errorType,
const char *fileName, int lineNumber);
+//#define PGXC_COORD // for PGXC coordinator compiling
+//#define PGXC_DATANODE // for PGXC data node compiling
+
#endif /* POSTGRES_H */
diff --git a/src/include/postmaster/autovacuum.h b/src/include/postmaster/autovacuum.h
index 3175487af3..952291bcb0 100644
--- a/src/include/postmaster/autovacuum.h
+++ b/src/include/postmaster/autovacuum.h
@@ -6,6 +6,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* $PostgreSQL: pgsql/src/include/postmaster/autovacuum.h,v 1.15 2009/01/01 17:24:01 momjian Exp $
*
@@ -60,4 +61,8 @@ extern void AutovacuumLauncherIAm(void);
extern Size AutoVacuumShmemSize(void);
extern void AutoVacuumShmemInit(void);
+#ifdef PGXC /* PGXC_DATANODE */
+bool IsAutoVacuumWorkerProcess(void);
+#endif
+
#endif /* AUTOVACUUM_H */
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index b250d3f0f2..66a920ded0 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -143,8 +143,9 @@ typedef struct PROC_HDR
* normal operation. Startup process also consumes one slot, but WAL
* writer and autovacuum launcher are launched only after it has
* exited.
+ * The pool manager process also consumes one slot.
*/
-#define NUM_AUXILIARY_PROCS 3
+#define NUM_AUXILIARY_PROCS 4
/* configurable options */
diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h
index fab84ee1a0..4431e1bc54 100644
--- a/src/include/storage/procarray.h
+++ b/src/include/storage/procarray.h
@@ -6,6 +6,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* $PostgreSQL: pgsql/src/include/storage/procarray.h,v 1.26 2009/06/11 14:49:12 momjian Exp $
*
@@ -26,6 +27,10 @@ extern void ProcArrayRemove(PGPROC *proc, TransactionId latestXid);
extern void ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid);
extern void ProcArrayClearTransaction(PGPROC *proc);
+#ifdef PGXC /* PGXC_DATANODE */
+extern void SetGlobalSnapshotData(int xmin, int xmax, int xcnt, int *xip);
+extern void UnsetGlobalSnapshotData(void);
+#endif /* PGXC */
extern Snapshot GetSnapshotData(Snapshot snapshot);
extern bool TransactionIdIsInProgress(TransactionId xid);
diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h
index b50944a547..9c87386288 100644
--- a/src/include/utils/guc_tables.h
+++ b/src/include/utils/guc_tables.h
@@ -76,7 +76,9 @@ enum config_group
COMPAT_OPTIONS_CLIENT,
PRESET_OPTIONS,
CUSTOM_OPTIONS,
- DEVELOPER_OPTIONS
+ DEVELOPER_OPTIONS,
+ DATA_NODES,
+ GTM
};
/*
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index ca9913bda3..5f3a482877 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -6,6 +6,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.114 2009/06/11 14:49:13 momjian Exp $
*
@@ -20,6 +21,9 @@
#include "catalog/pg_index.h"
#include "fmgr.h"
#include "nodes/bitmapset.h"
+#ifdef PGXC
+#include "pgxc/locator.h"
+#endif
#include "rewrite/prs2lock.h"
#include "storage/block.h"
#include "storage/relfilenode.h"
@@ -205,6 +209,9 @@ typedef struct RelationData
/* use "struct" here to avoid needing to include pgstat.h: */
struct PgStat_TableStatus *pgstat_info; /* statistics collection area */
+#ifdef PGXC
+ RelationLocInfo *rd_locator_info;
+#endif
} RelationData;
/*
diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h
index e5003b669a..835ba95291 100644
--- a/src/include/utils/snapshot.h
+++ b/src/include/utils/snapshot.h
@@ -5,6 +5,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* $PostgreSQL: pgsql/src/include/utils/snapshot.h,v 1.5 2009/06/11 14:49:13 momjian Exp $
*
@@ -46,7 +47,11 @@ typedef struct SnapshotData
*/
TransactionId xmin; /* all XID < xmin are visible to me */
TransactionId xmax; /* all XID >= xmax are invisible to me */
+ TransactionId recent_global_xmin;
uint32 xcnt; /* # of xact ids in xip[] */
+#ifdef PGXC /* PGXC_COORD */
+ uint32 max_xcnt; /* Max # of xact in xip[] */
+#endif
TransactionId *xip; /* array of xact IDs in progress */
/* note: all ids in xip[] satisfy xmin <= xip[i] < xmax */
int32 subxcnt; /* # of xact ids in subxip[], -1 if overflow */
diff --git a/src/include/utils/syscache.h b/src/include/utils/syscache.h
index 1428b28d15..e038041519 100644
--- a/src/include/utils/syscache.h
+++ b/src/include/utils/syscache.h
@@ -8,6 +8,7 @@
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
*
* $PostgreSQL: pgsql/src/include/utils/syscache.h,v 1.74 2009/01/01 17:24:02 momjian Exp $
*
@@ -64,6 +65,9 @@ enum SysCacheIdentifier
OPEROID,
OPFAMILYAMNAMENSP,
OPFAMILYOID,
+#ifdef PGXC
+ PGXCCLASSRELID,
+#endif
PROCNAMEARGSNSP,
PROCOID,
RELNAMENSP,