Postgres-XC version 0.9

Application of patch PGXC-PG_REL8_4_3.patch.gz on PostgreSQL version 8.4.3
author: Michael P 2010-04-01 01:09:52 +0000
committer: Pavan Deolasee 2011-05-19 16:38:44 +0000
commit: 9b1cd1ef2e746b9d68085ecd37eabaa38e2a82f1 (patch)
tree: f220dc274f1d69eb685e822b9079e829525f5d4a
parent: 4d53a2f9699547bdc12831d2860c9d44c465e805 (diff)
150 files changed, 33460 insertions, 55 deletions
diff --git a/contrib/Makefile b/contrib/Makefile
index e840c8ce6a..f3777962c5 100644
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -39,7 +39,7 @@ WANTED_DIRS = \
 		tablefunc	\
 		test_parser	\
 		tsearch2	\
-		vacuumlo
+		vacuumlo	
 
 ifeq ($(with_openssl),yes)
 WANTED_DIRS += sslinfo
diff --git a/src/Makefile b/src/Makefile
index 7b00776c4b..02ba3b3926 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -16,6 +16,8 @@ include Makefile.global
 all install installdirs uninstall distprep:
 	$(MAKE) -C port $@
 	$(MAKE) -C timezone $@
+	# GTM must be built before the backend, which links against the GTM libraries
+	$(MAKE) -C gtm $@
 	$(MAKE) -C backend $@
 	$(MAKE) -C backend/utils/mb/conversion_procs $@
 	$(MAKE) -C backend/snowball $@
@@ -47,6 +49,7 @@ uninstall-local:
 clean:
 	$(MAKE) -C port $@
 	$(MAKE) -C timezone $@
+	$(MAKE) -C gtm $@
 	$(MAKE) -C backend $@
 	$(MAKE) -C backend/snowball $@
 	$(MAKE) -C include $@
@@ -61,6 +64,7 @@ clean:
 distclean maintainer-clean:
 	$(MAKE) -C port $@
 	$(MAKE) -C timezone $@
+	$(MAKE) -C gtm $@
 	$(MAKE) -C backend $@
 	$(MAKE) -C backend/snowball $@
 	$(MAKE) -C include $@
diff --git a/src/backend/Makefile b/src/backend/Makefile
index 86526d5f1a..4ae230dbd5 100644
--- a/src/backend/Makefile
+++ b/src/backend/Makefile
@@ -15,8 +15,8 @@ top_builddir = ../..
 include $(top_builddir)/src/Makefile.global
 
 SUBDIRS = access bootstrap catalog parser commands executor foreign lib libpq \
-	main nodes optimizer port postmaster regex rewrite \
-	storage tcop tsearch utils $(top_builddir)/src/timezone
+	pgxc main nodes optimizer port postmaster regex rewrite \
+	storage tcop tsearch utils $(top_builddir)/src/timezone $(top_builddir)/src/interfaces/libpq
 
 include $(srcdir)/common.mk
 
@@ -26,7 +26,19 @@ LOCALOBJS += utils/probes.o
 endif
 endif
 
-OBJS = $(SUBDIROBJS) $(LOCALOBJS) $(top_builddir)/src/port/libpgport_srv.a
+OBJS = $(SUBDIROBJS) $(LOCALOBJS) \
+	$(top_builddir)/src/interfaces/libpq/fe-connect.o \
+	$(top_builddir)/src/interfaces/libpq/fe-secure.o \
+	$(top_builddir)/src/interfaces/libpq/fe-misc.o \
+	$(top_builddir)/src/interfaces/libpq/fe-protocol3.o \
+	$(top_builddir)/src/interfaces/libpq/fe-protocol2.o \
+	$(top_builddir)/src/interfaces/libpq/fe-exec.o \
+	$(top_builddir)/src/interfaces/libpq/fe-auth.o \
+	$(top_builddir)/src/interfaces/libpq/pqexpbuffer.o \
+	$(top_builddir)/src/port/libpgport_srv.a \
+	$(top_builddir)/src/gtm/client/libgtmclient.a \
+	$(top_builddir)/src/gtm/common/libgtm.a \
+	$(top_builddir)/src/gtm/libpq/libpqcomm.a	
 
 # We put libpgport into OBJS, so remove it from LIBS; also add libldap
 LIBS := $(filter-out -lpgport, $(LIBS)) $(LDAP_LIBS_BE)
@@ -34,6 +46,8 @@ LIBS := $(filter-out -lpgport, $(LIBS)) $(LDAP_LIBS_BE)
 # The backend doesn't need everything that's in LIBS, however
 LIBS := $(filter-out -lz -lreadline -ledit -ltermcap -lncurses -lcurses, $(LIBS))
 
+# LIBS := $(LIBS) -lpqcomm
+# LDFLAGS += -L$(top_builddir)/src/gtm/libpq
 ##########################################################################
 
 all: submake-libpgport postgres $(POSTGRES_IMP)
@@ -43,7 +57,8 @@ ifneq ($(PORTNAME), win32)
 ifneq ($(PORTNAME), aix)
 
+# Add GTM's libpq directory (home of libpqcomm.a, see OBJS) to the link search path.
 postgres: $(OBJS)
-	$(CC) $(CFLAGS) $(LDFLAGS) $(export_dynamic) $(call expand_subsys,$^) $(LIBS) -o $@
+	$(CC) $(CFLAGS) $(LDFLAGS) -L$(top_builddir)/src/gtm/libpq $(export_dynamic) $(call expand_subsys,$^) $(LIBS) -o $@
 
 endif
 endif
diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile
index 38cfe1a277..fe34e4eaaa 100644
--- a/src/backend/access/transam/Makefile
+++ b/src/backend/access/transam/Makefile
@@ -12,9 +12,12 @@ subdir = src/backend/access/transam
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = clog.o transam.o varsup.o xact.o xlog.o xlogutils.o rmgr.o slru.o subtrans.o multixact.o twophase.o twophase_rmgr.o
+OBJS = clog.o transam.o varsup.o xact.o xlog.o xlogutils.o rmgr.o slru.o subtrans.o multixact.o twophase.o twophase_rmgr.o gtm.o
 
 include $(top_srcdir)/src/backend/common.mk
 
 # ensure that version checks in xlog.c get recompiled when catversion.h changes
 xlog.o: xlog.c $(top_srcdir)/src/include/catalog/catversion.h
+# NOTE(review): recipe links into $(top_srcdir)/src/include/, not to a file named like the target; nothing shown here depends on it - confirm it is needed
+libpg-fe.h:
+	$(LN_S) $(top_builddir)/contrib/gtm/client/libpg-fe.h $(top_srcdir)/src/include/
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 8544725abb..8dc23f7039 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -25,6 +25,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.53 2009/06/11 14:48:54 momjian Exp $
  *
@@ -67,6 +68,11 @@
 #define GetLSNIndex(slotno, xid)	((slotno) * CLOG_LSNS_PER_PAGE + \
 	((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP)
 
+#ifdef PGXC
+/* Page distance covering about 1 billion (2^30) XIDs, for wraparound checks; 1 << 30 because ^ is XOR in C */
+#define CLOG_WRAP_CHECK_DELTA ((1 << 30) / CLOG_XACTS_PER_PAGE)
+#endif
+
 
 /*
  * Link to shared-memory data structures for CLOG control
@@ -150,6 +156,11 @@ TransactionIdSetTreeStatus(TransactionId xid, int nsubxids,
 	Assert(status == TRANSACTION_STATUS_COMMITTED ||
 		   status == TRANSACTION_STATUS_ABORTED);
 
+	if (status == TRANSACTION_STATUS_COMMITTED)
+		elog(DEBUG1, "Record transaction commit %u", xid);
+	else
+		elog(DEBUG1, "Record transaction abort %u", xid);
+
 	/*
 	 * See how many subxids, if any, are on the same page as the parent, if
 	 * any.
@@ -565,11 +576,31 @@ ExtendCLOG(TransactionId newestXact)
 	 * No work except at first XID of a page.  But beware: just after
 	 * wraparound, the first XID of page zero is FirstNormalTransactionId.
 	 */
+#ifdef PGXC  /* PGXC_COORD || PGXC_DATANODE */
+	/* 
+	 * In PGXC, it may be that a node is not involved in a transaction,
+	 * and therefore will be skipped, so we need to detect this by using
+	 * the latest_page_number instead of the pg index.
+	 *
+	 * Also, there is a special case of when transactions wrap-around that
+	 * we need to detect.
+	 */
+	pageno = TransactionIdToPage(newestXact);
+
+	/* 
+	 * The first condition makes sure we did not wrap around 
+	 * The second checks if we are still using the same page
+	 */
+	if (ClogCtl->shared->latest_page_number - pageno <= CLOG_WRAP_CHECK_DELTA 
+			&& pageno <= ClogCtl->shared->latest_page_number)
+		return;
+#else
 	if (TransactionIdToPgIndex(newestXact) != 0 &&
 		!TransactionIdEquals(newestXact, FirstNormalTransactionId))
 		return;
 
 	pageno = TransactionIdToPage(newestXact);
+#endif
 
 	LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
 
@@ -579,7 +610,6 @@ ExtendCLOG(TransactionId newestXact)
 	LWLockRelease(CLogControlLock);
 }
 
-
 /*
  * Remove all CLOG segments before the one holding the passed transaction ID
  *
diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c
new file mode 100644
index 0000000000..2ecc96a4ac
--- /dev/null
+++ b/src/backend/access/transam/gtm.c
@@ -0,0 +1,226 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm.c
+ * 
+ *	  Module interfacing with GTM 
+ *
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtm/libpq-fe.h"
+#include "gtm/gtm_client.h"
+#include "access/gtm.h"
+#include "access/transam.h"
+#include "utils/elog.h"
+
+/* Configuration variables; InitGTM() formats them into the GTM connection string */
+char *GtmHost = "localhost";
+int GtmPort = 6666;
+int GtmCoordinatorId = 1;
+
+extern bool FirstSnapshotSet;
+
+static GTM_Conn *conn;
+
+/* Reconnect when the GTM link is not CONNECTION_OK; do/while keeps the macro a single statement */
+#define CheckConnection() \
+	do { if (GTMPQstatus(conn) != CONNECTION_OK) InitGTM(); } while (0)
+
+bool IsGTMConnected()	/* true when a connection object exists; its status is not checked */
+{
+	return (conn == NULL) ? false : true;
+}
+
+void
+InitGTM()
+{
+	/* Connect to GTM using the Gtm* settings; on failure warn and leave conn NULL */
+	char conn_str[256];
+
+	/* GtmHost can be arbitrarily long, so bound the write instead of using sprintf */
+	snprintf(conn_str, sizeof(conn_str), "host=%s port=%d coordinator_id=%d",
+			 GtmHost, GtmPort, GtmCoordinatorId);
+	conn = PQconnectGTM(conn_str);
+	if (GTMPQstatus(conn) != CONNECTION_OK)
+	{
+		int save_errno = errno;
+
+		ereport(WARNING,
+				(errcode(ERRCODE_CONNECTION_EXCEPTION),
+				 errmsg("can not connect to GTM: %m")));
+
+		errno = save_errno;
+		CloseGTM();
+	}
+}
+
+void
+CloseGTM()
+{
+	GTMPQfinish(conn);	/* presumably tears down the GTM connection - confirm NULL is accepted */
+	conn = NULL;		/* IsGTMConnected() and the "if (conn)" guards key off this */
+}
+
+GlobalTransactionId
+BeginTranGTM()
+{
+	GlobalTransactionId gxid = InvalidGlobalTransactionId;
+
+	/* First attempt on the current connection. TODO: isolation level is fixed to GTM_ISOLATION_RC */
+	CheckConnection();
+	if (conn != NULL)
+		gxid = begin_transaction(conn, GTM_ISOLATION_RC);
+	if (TransactionIdIsValid(gxid))
+		return gxid;
+
+	/*
+	 * No valid GXID (e.g. timeout): reset the GTM connection and try once
+	 * more. Retrying is safe because we are at the beginning of a transaction.
+	 */
+	CloseGTM();
+	InitGTM();
+	if (conn != NULL)
+		gxid = begin_transaction(conn, GTM_ISOLATION_RC);
+	return gxid;
+}
+
+GlobalTransactionId
+BeginTranAutovacuumGTM()
+{
+	GlobalTransactionId gxid = InvalidGlobalTransactionId;
+
+	/* First attempt on the current connection. TODO: isolation level is fixed to GTM_ISOLATION_RC */
+	CheckConnection();
+	if (conn != NULL)
+		gxid = begin_transaction_autovacuum(conn, GTM_ISOLATION_RC);
+	if (TransactionIdIsValid(gxid))
+		return gxid;
+
+	/*
+	 * No valid GXID (e.g. timeout): reset the GTM connection and try once
+	 * more. Retrying is safe because we are at the beginning of a transaction.
+	 */
+	CloseGTM();
+	InitGTM();
+	if (conn != NULL)
+		gxid = begin_transaction_autovacuum(conn, GTM_ISOLATION_RC);
+	return gxid;
+}
+
+int
+CommitTranGTM(GlobalTransactionId gxid)
+{
+	int ret;
+
+	/* Commit gxid on GTM. Returns 0 for an invalid gxid, negative on failure. */
+	if (!GlobalTransactionIdIsValid(gxid))
+		return 0;
+	CheckConnection();
+	/* A failed (re)connect leaves conn NULL; report that as a failure */
+	ret = conn ? commit_transaction(conn, gxid) : -1;
+
+	/* On failure (e.g. timeout) reset the connection; the transaction is closed
+	 * locally anyway and dropping the connection forces GTM to close it. */
+	if (ret < 0)
+	{
+		CloseGTM();
+		InitGTM();
+	}
+	return ret;
+}
+
+int
+RollbackTranGTM(GlobalTransactionId gxid)
+{
+	int ret;
+
+	/* Abort gxid on GTM. Returns 0 for an invalid gxid, negative on failure. */
+	if (!GlobalTransactionIdIsValid(gxid))
+		return 0;
+	CheckConnection();
+	/* A failed (re)connect leaves conn NULL; report that as a failure */
+	ret = conn ? abort_transaction(conn, gxid) : -1;
+
+	/* On failure (e.g. timeout) reset the connection; the transaction is aborted
+	 * locally anyway and dropping the connection forces GTM to end it. */
+	if (ret < 0)
+	{
+		CloseGTM();
+		InitGTM();
+	}
+	return ret;
+}
+
+GTM_Snapshot
+GetSnapshotGTM(GlobalTransactionId gxid, bool canbe_grouped)
+{
+	GTM_Snapshot snap;
+
+	CheckConnection();
+	snap = (conn != NULL) ? get_snapshot(conn, gxid, canbe_grouped) : NULL;
+	if (snap != NULL)
+		return snap;
+	/* No snapshot obtained: reset the GTM connection; the caller gets NULL */
+	CloseGTM();
+	InitGTM();
+	return NULL;
+}
+
+
+/**
+ * Create a sequence on the GTM, keyed by its name.
+ * NOTE(review): returns 0 when no GTM connection is available, while
+ * DropSequenceGTM returns -1 in that case - confirm this is intended.
+ */
+int CreateSequenceGTM(char *seqname, GTM_Sequence increment, GTM_Sequence minval,
+		GTM_Sequence maxval, GTM_Sequence startval, bool cycle)
+{
+	GTM_SequenceKeyData key;
+
+	CheckConnection();
+	key.gsk_key = seqname;
+	key.gsk_keylen = strlen(seqname);
+	return (conn == NULL) ? 0 : open_sequence(conn, &key, increment, minval, maxval, startval, cycle);
+}
+
+/**
+ * Get the next value of the named sequence from GTM (-1 when not connected).
+ */
+GTM_Sequence
+GetNextValGTM(char *seqname)
+{
+	GTM_SequenceKeyData key;
+	GTM_Sequence value;
+
+	CheckConnection();
+	key.gsk_key = seqname;
+	key.gsk_keylen = strlen(seqname);
+	value = (conn != NULL) ? get_next(conn, &key) : -1;
+	if (value >= 0)
+		return value;
+
+	/* NOTE(review): any negative value is treated as failure - confirm sequences cannot be negative */
+	CloseGTM();
+	InitGTM();
+	return value;
+}
+
+/**
+ * Drop the named sequence on GTM; returns -1 when there is no GTM connection.
+ */
+int
+DropSequenceGTM(char *seqname)
+{
+	GTM_SequenceKeyData key;
+
+	CheckConnection();
+	key.gsk_key = seqname;
+	key.gsk_keylen = strlen(seqname);
+	return (conn == NULL) ? -1 : close_sequence(conn, &key);
+}
+
+
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index 9c74e995db..2695085be3 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -21,6 +21,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * $PostgreSQL: pgsql/src/backend/access/transam/subtrans.c,v 1.24 2009/01/01 17:23:36 momjian Exp $
  *
@@ -34,6 +35,10 @@
 #include "pg_trace.h"
 #include "utils/snapmgr.h"
 
+#ifdef PGXC
+/* Page distance covering about 1 billion (2^30) XIDs, for wraparound checks; 1 << 30 because ^ is XOR in C */
+#define SUBTRANS_WRAP_CHECK_DELTA ((1 << 30) / SUBTRANS_XACTS_PER_PAGE)
+#endif
 
 /*
  * Defines for SubTrans page sizes.  A page is the same BLCKSZ as is used
@@ -307,11 +312,31 @@ ExtendSUBTRANS(TransactionId newestXact)
 	 * No work except at first XID of a page.  But beware: just after
 	 * wraparound, the first XID of page zero is FirstNormalTransactionId.
 	 */
+#ifdef PGXC  /* PGXC_COORD || PGXC_DATANODE */
+	/* 
+	 * In PGXC, it may be that a node is not involved in a transaction,
+	 * and therefore will be skipped, so we need to detect this by using
+	 * the latest_page_number instead of the pg index.
+	 *
+	 * Also, there is a special case of when transactions wrap-around that
+	 * we need to detect.
+	 */
+	pageno = TransactionIdToPage(newestXact);
+
+	/* 
+	 * The first condition makes sure we did not wrap around 
+	 * The second checks if we are still using the same page
+	 */
+	if (SubTransCtl->shared->latest_page_number - pageno <= SUBTRANS_WRAP_CHECK_DELTA 
+			&& pageno <= SubTransCtl->shared->latest_page_number)
+		return;
+#else
 	if (TransactionIdToEntry(newestXact) != 0 &&
 		!TransactionIdEquals(newestXact, FirstNormalTransactionId))
 		return;
 
 	pageno = TransactionIdToPage(newestXact);
+#endif
 
 	LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);
 
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 6de9c73f6e..4b9071f947 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -5,6 +5,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * IDENTIFICATION
  *		$PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.54 2009/06/25 19:05:52 heikki Exp $
@@ -68,7 +69,11 @@
 #define TWOPHASE_DIR "pg_twophase"
 
 /* GUC variable, can't be changed after startup */
+#ifdef PGXC
+int			max_prepared_xacts = 10;  /* We require 2PC */
+#else
 int			max_prepared_xacts = 0;
+#endif
 
 /*
  * This struct describes one global transaction that is in prepared state
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
index 029b2f2deb..4de1080544 100644
--- a/src/backend/access/transam/varsup.c
+++ b/src/backend/access/transam/varsup.c
@@ -4,6 +4,7 @@
  *	  postgres OID & XID variables support routines
  *
  * Copyright (c) 2000-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * IDENTIFICATION
  *	  $PostgreSQL: pgsql/src/backend/access/transam/varsup.c,v 1.84 2009/04/23 00:23:45 tgl Exp $
@@ -21,6 +22,10 @@
 #include "storage/pmsignal.h"
 #include "storage/proc.h"
 #include "utils/builtins.h"
+#ifdef PGXC  
+#include "pgxc/pgxc.h"
+#include "access/gtm.h"
+#endif
 
 
 /* Number of OIDs to prefetch (preallocate) per XLOG write */
@@ -29,6 +34,40 @@
 /* pointer to "variable cache" in shared memory (set up by shmem.c) */
 VariableCache ShmemVariableCache = NULL;
 
+#ifdef PGXC  /* PGXC_DATANODE */
+static TransactionId next_xid = InvalidTransactionId;
+static bool force_get_xid_from_gtm = false;
+
+/*
+ * Record the XID that the next GetNewTransactionId() call should use
+ * (InvalidTransactionId clears it). XIDs are unsigned, hence %u below.
+ */
+void
+SetNextTransactionId(TransactionId xid)
+{
+	elog (DEBUG1, "[re]setting xid = %u, old_value = %u", xid, next_xid);
+	next_xid = xid;
+}
+
+/*
+ * Set the flag that makes GetNewTransactionId() on a datanode fetch its XID
+ * directly from GTM. Useful for explicit VACUUM (autovacuum already handled)
+ */
+void 
+SetForceXidFromGTM(bool value)
+{
+	force_get_xid_from_gtm = value;
+}
+
+/*
+ * Report whether SetForceXidFromGTM() asked for XIDs to come from GTM.
+ * Useful for explicit VACUUM (autovacuum already handled)
+ */
+bool 
+GetForceXidFromGTM(void)
+{
+	return force_get_xid_from_gtm;
+}
+#endif /* PGXC */
 
 /*
  * Allocate the next XID for my new transaction or subtransaction.
@@ -39,6 +78,9 @@ TransactionId
 GetNewTransactionId(bool isSubXact)
 {
 	TransactionId xid;
+#ifdef PGXC  
+	bool increment_xid = true;
+#endif
 
 	/*
 	 * During bootstrap initialization, we return the special bootstrap
@@ -51,9 +93,100 @@ GetNewTransactionId(bool isSubXact)
 		return BootstrapTransactionId;
 	}
 
+#ifdef PGXC  
+	if (IS_PGXC_COORDINATOR)
+	{
+		/* Get XID from GTM before acquiring the lock.
+		 * The rest of the code will handle it if after obtaining XIDs,
+		 * the lock is acquired in a different order.
+		 * This will help with GTM connection issues- we will not
+		 * block all other processes.
+		 */
+		xid = (TransactionId) BeginTranGTM();
+	}
+#endif
+
 	LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
 
+#ifdef PGXC  
+	if (IS_PGXC_COORDINATOR)
+	{
+		if (TransactionIdIsValid(xid)) 
+		{
+			if (!TransactionIdFollowsOrEquals(xid, ShmemVariableCache->nextXid))
+			{
+				increment_xid = false;
+				ereport(DEBUG1,
+				   (errmsg("xid (%d) was less than ShmemVariableCache->nextXid (%d)",
+					   xid, ShmemVariableCache->nextXid)));
+			}
+			else
+				ShmemVariableCache->nextXid = xid;
+		}
+		else
+		{			
+			ereport(WARNING,
+			   (errmsg("Xid is invalid.")));
+	
+			/* Problem is already reported, so just remove lock and return */
+			LWLockRelease(XidGenLock);
+			return xid;
+		}	
+	} else if(IS_PGXC_DATANODE) 
+	{
+		if (IsAutoVacuumWorkerProcess())
+		{
+			if (MyProc->vacuumFlags & PROC_IN_VACUUM)
+			{
+				elog (DEBUG1, "Getting XID for autovacuum");
+				/* Try and get gxid directly from GTM. 
+				 * We use a different function so that GTM knows to
+				 * exclude it from other snapshots.
+				 */
+				next_xid = (TransactionId) BeginTranAutovacuumGTM();
+			} else {
+				elog (DEBUG1, "Getting XID for autovacuum worker (analyze)");
+				/* try and get gxid directly from GTM */
+				next_xid = (TransactionId) BeginTranGTM();
+			}
+		} else if (GetForceXidFromGTM())
+		{
+			elog (DEBUG1, "Force get XID from GTM");
+			/* try and get gxid directly from GTM */
+			next_xid = (TransactionId) BeginTranGTM();
+		}
+	
+		if (TransactionIdIsValid(next_xid))
+		{
+			xid = next_xid;
+			elog(DEBUG1, "TransactionId = %d", next_xid);
+			next_xid = InvalidTransactionId; /* reset */
+			if (!TransactionIdFollowsOrEquals(xid, ShmemVariableCache->nextXid))
+			{ 
+				/* This should be ok, due to concurrency from multiple coords
+				 * passing down the xids.
+				 * We later do not want to bother incrementing the value 
+				 * in shared memory though.
+				 */
+				increment_xid = false;
+				elog(DEBUG1, "xid (%d) does not follow ShmemVariableCache->nextXid (%d)", 
+					xid, ShmemVariableCache->nextXid);
+			} else
+				ShmemVariableCache->nextXid = xid;
+		}
+		else
+		{
+			/* Fallback to default */
+			elog(LOG, "Falling back to local Xid. Was = %d, now is = %d", 
+					next_xid, ShmemVariableCache->nextXid);
+			xid = ShmemVariableCache->nextXid;
+	
+		}
+	}
+#else 
 	xid = ShmemVariableCache->nextXid;
+#endif /* PGXC */
+
 
 	/*----------
 	 * Check to see if it's safe to assign another XID.  This protects against
@@ -98,7 +231,6 @@ GetNewTransactionId(bool isSubXact)
 					 "You might also need to commit or roll back old prepared transactions.",
 					 NameStr(ShmemVariableCache->limit_datname))));
 	}
-
 	/*
 	 * If we are allocating the first XID of a new page of the commit log,
 	 * zero out that commit-log page before returning. We must do this while
@@ -117,7 +249,13 @@ GetNewTransactionId(bool isSubXact)
 	 * want the next incoming transaction to try it again.	We cannot assign
 	 * more XIDs until there is CLOG space for them.
 	 */
-	TransactionIdAdvance(ShmemVariableCache->nextXid);
+#ifdef PGXC  /* defined(PGXC_COORD) || defined(PGXC_DATANODE) */
+	/* We may not be at the max, which is ok. Do not bother to increment. 
+	 * We get this externally anyway, so it should not be needed in theory...
+	 */
+	if (increment_xid)
+#endif
+		TransactionIdAdvance(ShmemVariableCache->nextXid);
 
 	/*
 	 * We must store the new XID into the shared ProcArray before releasing
@@ -177,7 +315,6 @@ GetNewTransactionId(bool isSubXact)
 	}
 
 	LWLockRelease(XidGenLock);
-
 	return xid;
 }
 
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 2b6a222477..9ab3c70430 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -7,6 +7,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  *
  * IDENTIFICATION
@@ -20,6 +21,15 @@
 #include <time.h>
 #include <unistd.h>
 
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+#include "access/gtm.h"
+/* PGXC_COORD */
+#include "gtm/gtm_c.h"
+#include "pgxc/datanode.h"
+/* PGXC_DATANODE */
+#include "postmaster/autovacuum.h"
+#endif
 #include "access/multixact.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
@@ -51,7 +61,6 @@
 #include "utils/snapmgr.h"
 #include "pg_trace.h"
 
-
 /*
  *	User-tweakable parameters
  */
@@ -125,6 +134,9 @@ typedef enum TBlockState
 typedef struct TransactionStateData
 {
 	TransactionId transactionId;	/* my XID, or Invalid if none */
+#ifdef PGXC  /* PGXC_COORD */
+	GlobalTransactionId globalTransactionId; /* my GXID, or Invalid if none */ 
+#endif
 	SubTransactionId subTransactionId;	/* my subxact ID */
 	char	   *name;			/* savepoint name, if any */
 	int			savepointLevel; /* savepoint level */
@@ -152,6 +164,9 @@ typedef TransactionStateData *TransactionState;
  */
 static TransactionStateData TopTransactionStateData = {
 	0,							/* transaction id */
+#ifdef PGXC
+	0,							/* global transaction id */
+#endif
 	0,							/* subtransaction id */
 	NULL,						/* savepoint name */
 	0,							/* savepoint level */
@@ -274,6 +289,43 @@ static void ShowTransactionStateRec(TransactionState state);
 static const char *BlockStateAsString(TBlockState blockState);
 static const char *TransStateAsString(TransState state);
 
+#ifdef PGXC  /* PGXC_COORD */
+static GlobalTransactionId GetGlobalTransactionId(TransactionState s);
+
+/* ----------------------------------------------------------------
+ *	PG-XC Functions
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * GetCurrentGlobalTransactionId
+ *
+ * Return the GXID of CurrentTransactionState, obtaining one (via
+ * GetGlobalTransactionId) if it is not yet set. Be careful to call this
+ * only inside a valid xact.
+ */
+GlobalTransactionId
+GetCurrentGlobalTransactionId(void)
+{
+	return GetGlobalTransactionId(CurrentTransactionState);
+}
+
+/*
+ * GetGlobalTransactionId
+ *
+ * Return the GXID of transaction state 's', lazily obtaining one through
+ * GetNewTransactionId() the first time it is requested.
+ */
+static GlobalTransactionId
+GetGlobalTransactionId(TransactionState s)
+{
+	if (GlobalTransactionIdIsValid(s->globalTransactionId))
+		return s->globalTransactionId;
+	s->globalTransactionId = (GlobalTransactionId) GetNewTransactionId(s->parent != NULL);
+	return s->globalTransactionId;
+}
+#endif  /* PGXC */
+
 
 /* ----------------------------------------------------------------
  *	transaction state accessors
@@ -364,6 +416,7 @@ GetCurrentTransactionId(void)
 	return s->transactionId;
 }
 
+
 /*
  *	GetCurrentTransactionIdIfAny
  *
@@ -412,6 +465,15 @@ AssignTransactionId(TransactionState s)
 	 * PG_PROC, the subtrans entry is needed to ensure that other backends see
 	 * the Xid as "running".  See GetNewTransactionId.
 	 */
+#ifdef PGXC  /* PGXC_COORD */
+	if (IS_PGXC_COORDINATOR)
+	{
+		s->transactionId = (TransactionId) GetGlobalTransactionId(s);
+		elog(DEBUG1, "New transaction id assigned = %d, isSubXact = %s", 
+			s->transactionId, isSubXact ? "true" : "false");
+	}
+	else
+#endif
 	s->transactionId = GetNewTransactionId(isSubXact);
 
 	if (isSubXact)
@@ -1458,8 +1520,11 @@ StartTransaction(void)
 	 * start processing
 	 */
 	s->state = TRANS_START;
+#ifdef PGXC  /* PGXC_COORD */
+	if (IS_PGXC_COORDINATOR)
+		s->globalTransactionId = InvalidGlobalTransactionId;	/* until assigned */
+#endif
 	s->transactionId = InvalidTransactionId;	/* until assigned */
-
 	/*
 	 * Make sure we've reset xact state variables
 	 */
@@ -1629,7 +1694,24 @@ CommitTransaction(void)
 	latestXid = RecordTransactionCommit();
 
 	TRACE_POSTGRESQL_TRANSACTION_COMMIT(MyProc->lxid);
-
+#ifdef PGXC
+	if (IS_PGXC_COORDINATOR)
+	{
+		/* Make sure this committed on the DataNodes, 
+	         * if so it will just return 
+		 */
+		DataNodeCommit(DestNone);
+		CommitTranGTM(s->globalTransactionId);
+	}
+	else if (IS_PGXC_DATANODE)
+	{
+		/* If we are autovacuum, commit on GTM */
+		if ((IsAutoVacuumWorkerProcess() || GetForceXidFromGTM())
+				&& IsGTMConnected())
+			CommitTranGTM((GlobalTransactionId) latestXid);
+	}
+#endif
+	
 	/*
 	 * Let others know about no transaction in progress by me. Note that this
 	 * must be done _before_ releasing locks we hold and _after_
@@ -1725,6 +1807,13 @@ CommitTransaction(void)
 	s->nChildXids = 0;
 	s->maxChildXids = 0;
 
+#ifdef PGXC  
+	if (IS_PGXC_COORDINATOR)
+		s->globalTransactionId = InvalidGlobalTransactionId;
+	else if (IS_PGXC_DATANODE)
+		SetNextTransactionId(InvalidTransactionId);
+#endif
+
 	/*
 	 * done with commit processing, set current transaction state back to
 	 * default
@@ -1959,6 +2048,10 @@ PrepareTransaction(void)
 	s->nChildXids = 0;
 	s->maxChildXids = 0;
 
+#ifdef PGXC /* PGXC_DATANODE */
+	if (IS_PGXC_DATANODE)
+		SetNextTransactionId(InvalidTransactionId);
+#endif
 	/*
 	 * done with 1st phase commit processing, set current transaction state
 	 * back to default
@@ -2045,7 +2138,23 @@ AbortTransaction(void)
 	latestXid = RecordTransactionAbort(false);
 
 	TRACE_POSTGRESQL_TRANSACTION_ABORT(MyProc->lxid);
-
+#ifdef PGXC
+	if (IS_PGXC_COORDINATOR)
+	{
+		/* Make sure this is rolled back on the DataNodes, 
+	         * if so it will just return 
+		 */
+		DataNodeRollback(DestNone);
+		RollbackTranGTM(s->globalTransactionId);
+	}
+	else if (IS_PGXC_DATANODE)
+	{
+		/* If we are autovacuum (or forced to take our XID from GTM), roll back on GTM */
+		if ((IsAutoVacuumWorkerProcess() || GetForceXidFromGTM())
+				&& IsGTMConnected())
+			RollbackTranGTM((GlobalTransactionId) latestXid);
+	}
+#endif
 	/*
 	 * Let others know about no transaction in progress by me. Note that this
 	 * must be done _before_ releasing locks we hold and _after_
@@ -2130,6 +2239,13 @@ CleanupTransaction(void)
 	s->nChildXids = 0;
 	s->maxChildXids = 0;
 
+#ifdef PGXC  /* PGXC_DATANODE */
+	if (IS_PGXC_COORDINATOR)
+		s->globalTransactionId = InvalidGlobalTransactionId;
+	else if (IS_PGXC_DATANODE)
+		SetNextTransactionId(InvalidTransactionId);
+#endif
+
 	/*
 	 * done with abort processing, set current transaction state back to
 	 * default
@@ -4004,6 +4120,10 @@ PushTransaction(void)
 	 * We can now stack a minimally valid subtransaction without fear of
 	 * failure.
 	 */
+#ifdef PGXC  /* PGXC_COORD */
+	if (IS_PGXC_COORDINATOR)
+		s->globalTransactionId = InvalidGlobalTransactionId;
+#endif
 	s->transactionId = InvalidTransactionId;	/* until assigned */
 	s->subTransactionId = currentSubTransactionId;
 	s->parent = p;
diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index 5a0f852b6f..969d6f566c 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -6,6 +6,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * IDENTIFICATION
  *	  $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.250 2009/02/18 15:58:41 heikki Exp $
@@ -42,6 +43,10 @@
 #include "utils/ps_status.h"
 #include "utils/tqual.h"
 
+#ifdef PGXC
+#include "pgxc/poolmgr.h"
+#endif
+
 extern int	optind;
 extern char *optarg;
 
@@ -329,6 +334,11 @@ AuxiliaryProcessMain(int argc, char *argv[])
 
 		switch (auxType)
 		{
+#ifdef PGXC /* PGXC_COORD */
+			case PoolerProcess:
+				statmsg = "pooler process";
+				break;
+#endif
 			case StartupProcess:
 				statmsg = "startup process";
 				break;
@@ -402,6 +412,13 @@ AuxiliaryProcessMain(int argc, char *argv[])
 
 	switch (auxType)
 	{
+#ifdef PGXC /* PGXC_COORD */
+		case PoolerProcess:
+			/* don't set signals, pool manager has its own agenda */
+			PoolManagerInit();
+			proc_exit(1);		/* should never return */
+#endif
+
 		case CheckerProcess:
 			bootstrap_signals();
 			CheckerModeMain();
diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile
index ed06048894..2693b426b1 100644
--- a/src/backend/catalog/Makefile
+++ b/src/backend/catalog/Makefile
@@ -12,8 +12,8 @@ include $(top_builddir)/src/Makefile.global
 
 OBJS = catalog.o dependency.o heap.o index.o indexing.o namespace.o aclchk.o \
        pg_aggregate.o pg_constraint.o pg_conversion.o pg_depend.o pg_enum.o \
-       pg_inherits.o pg_largeobject.o pg_namespace.o pg_operator.o pg_proc.o \
-       pg_shdepend.o pg_type.o storage.o toasting.o
+       pg_inherits.o pg_largeobject.o pg_namespace.o pg_operator.o pg_proc.o pg_shdepend.o \
+       pg_type.o pgxc_class.o storage.o toasting.o
 
 BKIFILES = postgres.bki postgres.description postgres.shdescription
 
@@ -37,6 +37,7 @@ POSTGRES_BKI_SRCS = $(addprefix $(top_srcdir)/src/include/catalog/,\
 	pg_ts_config.h pg_ts_config_map.h pg_ts_dict.h \
 	pg_ts_parser.h pg_ts_template.h \
 	pg_foreign_data_wrapper.h pg_foreign_server.h pg_user_mapping.h \
+	pgxc_class.h \
 	toasting.h indexing.h \
     )
 
diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c
index 8181cae64a..2932bffd1d 100644
--- a/src/backend/catalog/dependency.c
+++ b/src/backend/catalog/dependency.c
@@ -6,6 +6,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * IDENTIFICATION
  *	  $PostgreSQL: pgsql/src/backend/catalog/dependency.c,v 1.89 2009/06/11 14:48:54 momjian Exp $
@@ -50,6 +51,9 @@
 #include "catalog/pg_ts_template.h"
 #include "catalog/pg_type.h"
 #include "catalog/pg_user_mapping.h"
+#ifdef PGXC
+#include "catalog/pgxc_class.h"
+#endif
 #include "commands/comment.h"
 #include "commands/dbcommands.h"
 #include "commands/defrem.h"
@@ -144,6 +148,9 @@ static const Oid object_classes[MAX_OCLASS] = {
 	AuthIdRelationId,			/* OCLASS_ROLE */
 	DatabaseRelationId,			/* OCLASS_DATABASE */
 	TableSpaceRelationId		/* OCLASS_TBLSPACE */
+#ifdef PGXC
+	,PgxcClassRelationId		/* OCLASS_PGXCCLASS */
+#endif
 };
 
 
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index f4cf829b46..4f14113c3b 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -5,6 +5,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  *
  * IDENTIFICATION
@@ -68,6 +69,11 @@
 #include "utils/syscache.h"
 #include "utils/tqual.h"
 
+#ifdef PGXC
+#include "catalog/pgxc_class.h"
+#include "pgxc/locator.h"
+#endif
+
 
 static void AddNewRelationTuple(Relation pg_class_desc,
 					Relation new_rel_desc,
@@ -775,6 +781,141 @@ AddNewRelationTuple(Relation pg_class_desc,
 	InsertPgClassTuple(pg_class_desc, new_rel_desc, new_rel_oid, reloptions);
 }
 
+#ifdef PGXC
+/* --------------------------------
+ *		AddRelationDistribution
+ *
+ *		Work out how relation relid is distributed and record the result
+ *		in the pgxc_class table through PgxcClassCreate.
+ *
+ *		distributeby	DISTRIBUTE BY clause, or NULL when none was given
+ *		parentOids		OIDs of the parent tables; at most one is supported
+ *		descriptor		tuple descriptor of the new relation
+ *
+ *		Raises an error for more than one parent, an unusable parent
+ *		distribution, or a hash column that is unknown or not hashable.
+ * --------------------------------
+ */
+void
+AddRelationDistribution (Oid relid,
+				DistributeBy *distributeby,
+				List 		 *parentOids,
+				TupleDesc	 descriptor)
+{
+	char locatortype 	= '\0';
+	int hashalgorithm 	= 0;
+	int hashbuckets 	= 0;
+	AttrNumber attnum 	= 0;
+
+	if (!distributeby)
+	{
+		/* No distribution specified: use the parent's if we are a child table */
+		if (list_length(parentOids) > 1)
+			ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("Cannot currently distribute a table with more than one parent.")));
+		else if (list_length(parentOids) == 1)
+		{
+			/* Use parent's distribution; the list holds OIDs, not ints */
+			Oid parentOid = linitial_oid(parentOids);
+			RelationLocInfo *rel_loc_info = GetRelationLocInfo(parentOid);
+
+			/* Defined elsewhere: guard against NULL rather than dereference it */
+			if (rel_loc_info == NULL)
+				ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+					 errmsg("Parent table has no distribution information")));
+
+			locatortype = rel_loc_info->locatorType;
+			switch (locatortype)
+			{
+				case LOCATOR_TYPE_HASH:
+					attnum = rel_loc_info->partAttrNum;
+					break;
+
+				case LOCATOR_TYPE_REPLICATED:
+				case LOCATOR_TYPE_RROBIN:
+					break;
+
+				default:
+					ereport(ERROR,
+						(errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+						 errmsg("Invalid parent table distribution type")));
+					break;
+			}
+		}
+		else
+		{
+			/*
+			 * If no distribution was specified, and we have not chosen
+			 * one based on primary key or foreign key, use first column with
+			 * a supported data type.
+			 */
+			int i;
+
+			locatortype = LOCATOR_TYPE_HASH;
+			for (i = 0; i < descriptor->natts; i++)
+			{
+				if (IsHashDistributable(descriptor->attrs[i]->atttypid))
+				{
+					/* distribute on this column (attnum counts from 1) */
+					attnum = i + 1;
+					break;
+				}
+			}
+			/* If we did not find a usable type, fall back to round robin */
+			if (attnum == 0)
+				locatortype = LOCATOR_TYPE_RROBIN;
+		}
+	}
+	else
+	{
+		/* User specified distribution type */
+		switch (distributeby->disttype)
+		{
+			case DISTTYPE_HASH:
+				/* User specified hash column: it must exist and be hashable */
+				attnum = get_attnum(relid, distributeby->colname);
+				if (attnum == InvalidAttrNumber)
+					ereport(ERROR,
+						(errcode(ERRCODE_UNDEFINED_COLUMN),
+						 errmsg("column \"%s\" does not exist",
+							distributeby->colname)));
+				if (!IsHashDistributable(descriptor->attrs[attnum-1]->atttypid))
+					ereport(ERROR,
+						(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+						 errmsg("Column %s is not a hash distributable data type",
+							distributeby->colname)));
+				locatortype = LOCATOR_TYPE_HASH;
+				break;
+
+			case DISTTYPE_REPLICATION:
+				locatortype = LOCATOR_TYPE_REPLICATED;
+				break;
+
+			case DISTTYPE_ROUNDROBIN:
+				locatortype = LOCATOR_TYPE_RROBIN;
+				break;
+
+			default:
+				ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+					 errmsg("Invalid distribution type")));
+		}
+	}
+
+	if (locatortype == LOCATOR_TYPE_HASH)
+	{
+		/* PGXCTODO */
+		/* Use these for now until we make allowing different algorithms more flexible */
+		hashalgorithm = 1;
+		hashbuckets = HASH_SIZE;
+	}
+
+	PgxcClassCreate (relid, locatortype, attnum, hashalgorithm, hashbuckets);
+}
+#endif
+
 
 /* --------------------------------
  *		AddNewRelationType -
diff --git a/src/backend/catalog/pgxc_class.c b/src/backend/catalog/pgxc_class.c
new file mode 100644
index 0000000000..a77f242357
--- /dev/null
+++ b/src/backend/catalog/pgxc_class.c
@@ -0,0 +1,105 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgxc_class.c
+ *	routines to support manipulation of the pgxc_class relation
+ *
+ * Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "catalog/dependency.h"
+#include "catalog/indexing.h"
+#include "catalog/namespace.h"
+#include "catalog/pgxc_class.h"
+#include "utils/builtins.h"
+#include "utils/rel.h"
+#include "utils/syscache.h"
+#include "pgxc/locator.h"
+
+/*
+ * PgxcClassCreate
+ *		Insert the pgxc_class row describing relation pcrelid.
+ *
+ * pcattnum, pchashalgorithm and pchashbuckets are stored only when
+ * pclocatortype is LOCATOR_TYPE_HASH; otherwise those columns stay zero.
+ */
+void PgxcClassCreate(Oid pcrelid,
+					char  pclocatortype,
+					int pcattnum,
+					int pchashalgorithm,
+					int pchashbuckets)
+{
+	Relation pgxcclassrel;
+	HeapTuple  htup;
+	bool	   nulls[Natts_pgxc_class];
+	Datum	  values[Natts_pgxc_class];
+
+	/* should not happen; elog(ERROR) does not return */
+	if (!OidIsValid(pcrelid))
+		elog(ERROR,"pgxc class relid invalid.");
+
+	/* Every pgxc_class attribute starts out non-null and zero */
+	memset(nulls, false, sizeof(nulls));
+	memset(values, 0, sizeof(values));
+
+	/* Datum macros follow each argument's C type; column types are in pgxc_class.h */
+	values[Anum_pgxc_class_pcrelid - 1] = ObjectIdGetDatum(pcrelid);
+	values[Anum_pgxc_class_pclocatortype - 1] = CharGetDatum(pclocatortype);
+	if (pclocatortype == LOCATOR_TYPE_HASH)
+	{
+		values[Anum_pgxc_class_pcattnum - 1] = Int32GetDatum(pcattnum);
+		values[Anum_pgxc_class_pchashalgorithm - 1] = Int32GetDatum(pchashalgorithm);
+		values[Anum_pgxc_class_pchashbuckets - 1] = Int32GetDatum(pchashbuckets);
+	}
+
+	/* Open the pgxc_class relation for insertion */
+	pgxcclassrel = heap_open(PgxcClassRelationId, RowExclusiveLock);
+	htup = heap_form_tuple(pgxcclassrel->rd_att, values, nulls);
+
+	(void) simple_heap_insert(pgxcclassrel, htup);
+	CatalogUpdateIndexes(pgxcclassrel, htup);
+
+	/* Release the tuple built by heap_form_tuple above */
+	heap_freetuple(htup);
+	heap_close(pgxcclassrel, RowExclusiveLock);
+}
+
+#ifdef PGXC
+/*
+ * RemovePgxcClass
+ *		Delete the pgxc_class row that belongs to relation pcrelid.
+ *
+ * The row is found through the PGXCCLASSRELID syscache; a failed lookup
+ * is reported with elog(ERROR).
+ */
+void RemovePgxcClass(Oid pcrelid)
+{
+	Relation  pgxcclassrel;
+	HeapTuple htup;
+
+	pgxcclassrel = heap_open(PgxcClassRelationId, RowExclusiveLock);
+
+	/* Locate the row to delete by relation OID */
+	htup = SearchSysCache(PGXCCLASSRELID, ObjectIdGetDatum(pcrelid), 0, 0, 0);
+	if (!HeapTupleIsValid(htup))
+	{
+		/* should not happen */
+		elog(ERROR, "cache lookup failed for pgxc_class %u", pcrelid);
+	}
+
+	/* Delete it using the tuple's own item pointer */
+	simple_heap_delete(pgxcclassrel, &htup->t_self);
+
+	/* Give back the syscache reference, then the relation */
+	ReleaseSysCache(htup);
+
+	heap_close(pgxcclassrel, RowExclusiveLock);
+}
+#endif  /* PGXC */
+
+
diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c
index c6a01f5b75..e0005905ba 100644
--- a/src/backend/commands/indexcmds.c
+++ b/src/backend/commands/indexcmds.c
@@ -5,6 +5,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  *
  * IDENTIFICATION
@@ -37,6 +38,10 @@
 #include "parser/parse_coerce.h"
 #include "parser/parse_func.h"
 #include "parser/parsetree.h"
+#ifdef PGXC
+#include "parser/parse_utilcmd.h"
+#include "pgxc/pgxc.h"
+#endif
 #include "storage/lmgr.h"
 #include "storage/proc.h"
 #include "storage/procarray.h"
@@ -404,6 +409,30 @@ DefineIndex(RangeVar *heapRelation,
 
 	(void) index_reloptions(amoptions, reloptions, true);
 
+#ifdef PGXC
+	/* Make sure we can locally enforce the index */
+	if (IS_PGXC_COORDINATOR && (primary || unique))
+	{
+		ListCell *elem;
+		bool isSafe = false;
+
+		foreach(elem, attributeList)
+		{
+			IndexElem  *key = (IndexElem *) lfirst(elem);
+
+			if (CheckLocalIndexColumn(rel->rd_locator_info->locatorType, 
+				rel->rd_locator_info->partAttrName, key->name))
+			{
+				isSafe = true;
+				break;
+			}
+		}
+		if (!isSafe)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+					errmsg("Unique index of partitioned table must contain the hash distribution column.")));
+	}
+#endif
 	/*
 	 * Prepare arguments for index_create, primarily an IndexInfo structure.
 	 * Note that ii_Predicate must be in implicit-AND format.
diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c
index e6c75ab014..a187afa8f2 100644
--- a/src/backend/commands/sequence.c
+++ b/src/backend/commands/sequence.c
@@ -5,6 +5,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  *
  * IDENTIFICATION
@@ -35,6 +36,12 @@
 #include "utils/resowner.h"
 #include "utils/syscache.h"
 
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+/* PGXC_COORD */
+#include "gtm/gtm_c.h"
+#include "access/gtm.h"
+#endif
 
 /*
  * We don't want to log each fetching of a value from a sequence,
@@ -117,6 +124,13 @@ DefineSequence(CreateSeqStmt *seq)
 	bool		null[SEQ_COL_LASTCOL];
 	int			i;
 	NameData	name;
+#ifdef PGXC /* PGXC_COORD */
+	GTM_Sequence	start_value = 1;
+	GTM_Sequence	min_value = 1;
+	GTM_Sequence	max_value = InvalidSequenceValue;
+	GTM_Sequence	increment = 1;
+	bool		cycle = false;
+#endif
 
 	/* Check and set all option values */
 	init_params(seq->options, true, &new, &owned_by);
@@ -155,21 +169,33 @@ DefineSequence(CreateSeqStmt *seq)
 				coldef->typename = makeTypeNameFromOid(INT8OID, -1);
 				coldef->colname = "start_value";
 				value[i - 1] = Int64GetDatumFast(new.start_value);
+#ifdef PGXC /* PGXC_COORD */
+				start_value = new.start_value;
+#endif
 				break;
 			case SEQ_COL_INCBY:
 				coldef->typename = makeTypeNameFromOid(INT8OID, -1);
 				coldef->colname = "increment_by";
 				value[i - 1] = Int64GetDatumFast(new.increment_by);
+#ifdef PGXC /* PGXC_COORD */
+				increment = new.increment_by;
+#endif
 				break;
 			case SEQ_COL_MAXVALUE:
 				coldef->typename = makeTypeNameFromOid(INT8OID, -1);
 				coldef->colname = "max_value";
 				value[i - 1] = Int64GetDatumFast(new.max_value);
+#ifdef PGXC /* PGXC_COORD */
+				max_value = new.max_value;
+#endif
 				break;
 			case SEQ_COL_MINVALUE:
 				coldef->typename = makeTypeNameFromOid(INT8OID, -1);
 				coldef->colname = "min_value";
 				value[i - 1] = Int64GetDatumFast(new.min_value);
+#ifdef PGXC /* PGXC_COORD */
+				min_value = new.min_value;
+#endif
 				break;
 			case SEQ_COL_CACHE:
 				coldef->typename = makeTypeNameFromOid(INT8OID, -1);
@@ -185,6 +211,9 @@ DefineSequence(CreateSeqStmt *seq)
 				coldef->typename = makeTypeNameFromOid(BOOLOID, -1);
 				coldef->colname = "is_cycled";
 				value[i - 1] = BoolGetDatum(new.is_cycled);
+#ifdef PGXC  /* PGXC_COORD */
+				cycle = new.is_cycled;
+#endif
 				break;
 			case SEQ_COL_CALLED:
 				coldef->typename = makeTypeNameFromOid(BOOLOID, -1);
@@ -308,6 +337,20 @@ DefineSequence(CreateSeqStmt *seq)
 		process_owned_by(rel, owned_by);
 
 	heap_close(rel, NoLock);
+
+#ifdef PGXC  /* PGXC_COORD */
+	if (IS_PGXC_COORDINATOR)
+	{
+		/* We also need to create it on the GTM */
+		if (CreateSequenceGTM(name.data, increment, min_value, max_value, 
+				start_value, cycle) < 0)
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_CONNECTION_FAILURE),
+					 errmsg("GTM error, could not create sequence")));
+		}
+	}
+#endif
 }
 
 /*
@@ -481,6 +524,20 @@ nextval_internal(Oid relid)
 	seq = read_info(elm, seqrel, &buf);
 	page = BufferGetPage(buf);
 
+#ifdef PGXC  /* PGXC_COORD */
+	if (IS_PGXC_COORDINATOR)
+	{
+		/* Above, we still use the page as a locking mechanism to handle
+	   	 * concurrency
+		 */
+		result = (int64) GetNextValGTM(RelationGetRelationName(seqrel));
+		if (result < 0)
+			ereport(ERROR,
+				(errcode(ERRCODE_CONNECTION_FAILURE),
+				 errmsg("GTM error, could not obtain sequence value")));
+	} else
+	{
+#endif
 	last = next = result = seq->last_value;
 	incby = seq->increment_by;
 	maxv = seq->max_value;
@@ -636,7 +693,9 @@ nextval_internal(Oid relid)
 	seq->log_cnt = log;			/* how much is logged */
 
 	END_CRIT_SECTION();
-
+#ifdef PGXC  /* PGXC_COORD */
+	}
+#endif
 	UnlockReleaseBuffer(buf);
 
 	relation_close(seqrel, NoLock);
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index f22e7be5d8..3372883714 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -5,6 +5,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  *
  * IDENTIFICATION
@@ -76,6 +77,10 @@
 #include "utils/syscache.h"
 #include "utils/tqual.h"
 
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+#include "access/gtm.h"
+#endif
 
 /*
  * ON COMMIT action list
@@ -523,6 +528,18 @@ DefineRelation(CreateStmt *stmt, char relkind)
 	 */
 	CommandCounterIncrement();
 
+#ifdef PGXC
+	/*
+	 * Add to pgxc_class.
+	 * we need to do this after CommandCounterIncrement 
+	 */
+	if (IS_PGXC_COORDINATOR && relkind == RELKIND_RELATION)
+	{
+		AddRelationDistribution (relationId, stmt->distributeby, inheritOids, descriptor);
+		CommandCounterIncrement();
+	}
+#endif
+
 	/*
 	 * Open the new relation and acquire exclusive lock on it.	This isn't
 	 * really necessary for locking out other backends (since they can't see
@@ -739,6 +756,16 @@ RemoveRelations(DropStmt *drop)
 
 		add_exact_object_address(&obj, objects);
 
+
+#ifdef PGXC  /* PGXC_COORD */
+		/* PGXCTODO: allow the ability to rollback dropping sequences. */
+
+		/* Drop the sequence */
+		if (IS_PGXC_COORDINATOR && classform->relkind == RELKIND_SEQUENCE)
+		{
+			DropSequenceGTM(rel->relname);	
+		}
+#endif
 		ReleaseSysCache(tuple);
 	}
 
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 732f6d09c3..aed98d98f8 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -10,6 +10,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  *
  * IDENTIFICATION
@@ -57,6 +58,9 @@
 #include "utils/syscache.h"
 #include "utils/tqual.h"
 
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+#endif
 
 /*
  * GUC parameters
@@ -899,6 +903,18 @@ vac_update_datfrozenxid(void)
 	if (dirty)
 	{
 		database_file_update_needed();
+		/*
+		 * vac_truncate_clog needs a transaction id to detect wrap-arounds. For
+		 * an autovacuum, this would require the data node to contact the GTM or
+		 * the coordinator and acquire a GXID for the vacuum operation.
+		 *
+		 * To avoid this complexity, CLOG truncation is meant to be disabled,
+		 * which is fine for the prototype because it does not handle GXID
+		 * wrap-around anyway. In future, this should be fixed either by
+		 * acquiring a GXID for the vacuum operation or by changing the
+		 * wrap-around check so that it does not need one.
+		 * NOTE(review): the call below is still unconditional here -- confirm.
+		 */
 		vac_truncate_clog(newFrozenXid);
 	}
 }
@@ -1026,7 +1042,8 @@ vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound,
 
 	if (scanned_all)
 		*scanned_all = false;
-
+#ifndef PGXC
+	/* In PG-XC, do these after setting vacuum flags */
 	/* Begin a transaction for vacuuming this relation */
 	StartTransactionCommand();
 
@@ -1035,6 +1052,7 @@ vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound,
 	 * ensures that RecentGlobalXmin is kept truly recent.
 	 */
 	PushActiveSnapshot(GetTransactionSnapshot());
+#endif
 
 	if (!vacstmt->full)
 	{
@@ -1065,6 +1083,19 @@ vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound,
 		LWLockRelease(ProcArrayLock);
 	}
 
+#ifdef PGXC
+	elog (DEBUG1, "Starting vacuum transaction");
+	/* In PG-XC, do these after setting vacuum flags */
+	/* Begin a transaction for vacuuming this relation */
+	StartTransactionCommand();
+	elog (DEBUG1, "Started vacuum transaction");
+
+	/*
+	 * Functions in indexes may want a snapshot set.  Also, setting
+	 * a snapshot ensures that RecentGlobalXmin is kept truly recent.
+	 */
+	PushActiveSnapshot(GetTransactionSnapshot());
+#endif
 	/*
 	 * Check for user-requested abort.	Note we want this to be inside a
 	 * transaction, so xact.c doesn't issue useless WARNING.
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index 72c9877ffd..895e65e121 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -13,6 +13,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * IDENTIFICATION
  *	  $PostgreSQL: pgsql/src/backend/nodes/copyfuncs.c,v 1.432 2009/06/18 01:27:02 tgl Exp $
@@ -2402,6 +2403,19 @@ _copyCopyStmt(CopyStmt *from)
 	return newnode;
 }
 
+#ifdef PGXC
+static DistributeBy *
+_copyDistributeBy(DistributeBy *from)
+{
+	DistributeBy *newnode = makeNode(DistributeBy);
+	/* Copy a DistributeBy node field by field; the COPY_* macros are not shown in this hunk */
+	COPY_SCALAR_FIELD(disttype);
+	COPY_STRING_FIELD(colname);	/* gram.y sets colname to NULL for REPLICATION and ROUND ROBIN */
+
+	return newnode;
+}
+#endif
+
 static CreateStmt *
 _copyCreateStmt(CreateStmt *from)
 {
@@ -2414,6 +2428,9 @@ _copyCreateStmt(CreateStmt *from)
 	COPY_NODE_FIELD(options);
 	COPY_SCALAR_FIELD(oncommit);
 	COPY_STRING_FIELD(tablespacename);
+#ifdef PGXC
+	COPY_NODE_FIELD(distributeby);
+#endif
 
 	return newnode;
 }
@@ -4093,7 +4110,11 @@ copyObject(void *from)
 		case T_XmlSerialize:
 			retval = _copyXmlSerialize(from);
 			break;
-
+#ifdef PGXC
+		case T_DistributeBy:
+			retval = _copyDistributeBy(from);
+			break;
+#endif
 		default:
 			elog(ERROR, "unrecognized node type: %d", (int) nodeTag(from));
 			retval = from;		/* keep compiler quiet */
diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c
index 041b96971c..fedb5102bb 100644
--- a/src/backend/nodes/equalfuncs.c
+++ b/src/backend/nodes/equalfuncs.c
@@ -20,6 +20,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * IDENTIFICATION
  *	  $PostgreSQL: pgsql/src/backend/nodes/equalfuncs.c,v 1.355 2009/06/18 01:27:02 tgl Exp $
@@ -1078,6 +1079,9 @@ _equalCreateStmt(CreateStmt *a, CreateStmt *b)
 	COMPARE_NODE_FIELD(options);
 	COMPARE_SCALAR_FIELD(oncommit);
 	COMPARE_STRING_FIELD(tablespacename);
+#ifdef PGXC
+	COMPARE_NODE_FIELD(distributeby);
+#endif
 
 	return true;
 }
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index b27cd513a5..98d3c4c9ef 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -5,6 +5,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  *
  * IDENTIFICATION
@@ -1154,6 +1155,22 @@ _readRangeTblEntry(void)
 	READ_DONE();
 }
 
+#ifdef PGXC
+/*
+ * _readDistributeBy - read the disttype and colname fields of a DistributeBy node
+ */
+static DistributeBy *
+_readDistributeBy(void)
+{
+	READ_LOCALS(DistributeBy);
+	/* NOTE(review): no parseNodeString change for this reader is visible in this patch -- confirm it is reachable */
+	READ_ENUM_FIELD(disttype, DistributionType);
+	READ_STRING_FIELD(colname);
+
+	READ_DONE();
+}
+#endif
+
 
 /*
  * parseNodeString
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index 280443074f..9ffada513a 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -8,6 +8,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  *
  * IDENTIFICATION
@@ -58,6 +59,7 @@
 #include "nodes/makefuncs.h"
 #include "nodes/nodeFuncs.h"
 #include "parser/gramparse.h"
+#include "pgxc/poolmgr.h"
 #include "storage/lmgr.h"
 #include "utils/date.h"
 #include "utils/datetime.h"
@@ -179,6 +181,9 @@ static TypeName *TableFuncTypeName(List *columns);
 
 	InsertStmt			*istmt;
 	VariableSetStmt		*vsetstmt;
+/* PGXC_BEGIN */
+	DistributeBy		*distby;
+/* PGXC_END */
 }
 
 %type <node>	stmt schema_stmt
@@ -197,7 +202,7 @@ static TypeName *TableFuncTypeName(List *columns);
 		DropGroupStmt DropOpClassStmt DropOpFamilyStmt DropPLangStmt DropStmt
 		DropAssertStmt DropTrigStmt DropRuleStmt DropCastStmt DropRoleStmt
 		DropUserStmt DropdbStmt DropTableSpaceStmt DropFdwStmt
-		DropForeignServerStmt DropUserMappingStmt ExplainStmt FetchStmt
+		DropForeignServerStmt DropUserMappingStmt ExplainStmt ExecDirectStmt FetchStmt
 		GrantStmt GrantRoleStmt IndexStmt InsertStmt ListenStmt LoadStmt
 		LockStmt NotifyStmt ExplainableStmt PreparableStmt
 		CreateFunctionStmt AlterFunctionStmt ReindexStmt RemoveAggrStmt
@@ -250,7 +255,7 @@ static TypeName *TableFuncTypeName(List *columns);
 
 %type <str>		relation_name copy_file_name
 				database_name access_method_clause access_method attr_name
-				index_name name file_name cluster_index_specification
+				index_name name file_name cluster_index_specification 
 
 %type <list>	func_name handler_name qual_Op qual_all_Op subquery_Op
 				opt_class opt_validator validator_clause
@@ -323,6 +328,9 @@ static TypeName *TableFuncTypeName(List *columns);
 %type <boolean> opt_freeze opt_default opt_recheck
 %type <defelt>	opt_binary opt_oids copy_delimiter
 
+%type <list>	node_list
+%type <str>		DirectStmt
+
 %type <boolean> copy_from
 
 %type <ival>	opt_column event cursor_options opt_hold opt_set_data
@@ -415,6 +423,9 @@ static TypeName *TableFuncTypeName(List *columns);
 %type <windef>	window_definition over_clause window_specification
 %type <str>		opt_existing_window_name
 %type <ival>	opt_frame_clause frame_extent frame_bound
+/* PGXC_BEGIN */
+%type <distby>	OptDistributeBy
+/* PGXC_END */
 
 
 /*
@@ -425,6 +436,7 @@ static TypeName *TableFuncTypeName(List *columns);
  */
 
 /* ordinary key words in alphabetical order */
+/* PGXC - added COORDINATOR, DIRECT, DISTRIBUTE, HASH, NODE, REPLICATION, ROBIN, and ROUND */
 %token <keyword> ABORT_P ABSOLUTE_P ACCESS ACTION ADD_P ADMIN AFTER
 	AGGREGATE ALL ALSO ALTER ALWAYS ANALYSE ANALYZE AND ANY ARRAY AS ASC
 	ASSERTION ASSIGNMENT ASYMMETRIC AT AUTHORIZATION
@@ -436,14 +448,17 @@ static TypeName *TableFuncTypeName(List *columns);
 	CHARACTER CHARACTERISTICS CHECK CHECKPOINT CLASS CLOSE
 	CLUSTER COALESCE COLLATE COLUMN COMMENT COMMIT
 	COMMITTED CONCURRENTLY CONFIGURATION CONNECTION CONSTRAINT CONSTRAINTS
-	CONTENT_P CONTINUE_P CONVERSION_P COPY COST CREATE CREATEDB
+	CONTENT_P CONTINUE_P CONVERSION_P COORDINATOR COPY COST CREATE CREATEDB
 	CREATEROLE CREATEUSER CROSS CSV CURRENT_P
 	CURRENT_CATALOG CURRENT_DATE CURRENT_ROLE CURRENT_SCHEMA
 	CURRENT_TIME CURRENT_TIMESTAMP CURRENT_USER CURSOR CYCLE
 
 	DATA_P DATABASE DAY_P DEALLOCATE DEC DECIMAL_P DECLARE DEFAULT DEFAULTS
 	DEFERRABLE DEFERRED DEFINER DELETE_P DELIMITER DELIMITERS DESC
-	DICTIONARY DISABLE_P DISCARD DISTINCT DO DOCUMENT_P DOMAIN_P DOUBLE_P DROP
+/* PGXC_BEGIN */
+	DICTIONARY DIRECT DISABLE_P DISCARD DISTINCT DISTRIBUTE DO DOCUMENT_P DOMAIN_P DOUBLE_P 
+/* PGXC_END */
+	DROP
 
 	EACH ELSE ENABLE_P ENCODING ENCRYPTED END_P ENUM_P ESCAPE EXCEPT
 	EXCLUDING EXCLUSIVE EXECUTE EXISTS EXPLAIN EXTERNAL EXTRACT
@@ -453,7 +468,9 @@ static TypeName *TableFuncTypeName(List *columns);
 
 	GLOBAL GRANT GRANTED GREATEST GROUP_P
 
-	HANDLER HAVING HEADER_P HOLD HOUR_P
+/* PGXC_BEGIN */
+	HANDLER HASH HAVING HEADER_P HOLD HOUR_P
+/* PGXC_END */
 
 	IDENTITY_P IF_P ILIKE IMMEDIATE IMMUTABLE IMPLICIT_P IN_P
 	INCLUDING INCREMENT INDEX INDEXES INHERIT INHERITS INITIALLY
@@ -471,7 +488,7 @@ static TypeName *TableFuncTypeName(List *columns);
 	MAPPING MATCH MAXVALUE MINUTE_P MINVALUE MODE MONTH_P MOVE
 
 	NAME_P NAMES NATIONAL NATURAL NCHAR NEW NEXT NO NOCREATEDB
-	NOCREATEROLE NOCREATEUSER NOINHERIT NOLOGIN_P NONE NOSUPERUSER
+	NOCREATEROLE NOCREATEUSER NODE NOINHERIT NOLOGIN_P NONE NOSUPERUSER
 	NOT NOTHING NOTIFY NOTNULL NOWAIT NULL_P NULLIF NULLS_P NUMERIC
 
 	OBJECT_P OF OFF OFFSET OIDS OLD ON ONLY OPERATOR OPTION OPTIONS OR
@@ -484,8 +501,10 @@ static TypeName *TableFuncTypeName(List *columns);
 	QUOTE
 
 	RANGE READ REAL REASSIGN RECHECK RECURSIVE REFERENCES REINDEX
-	RELATIVE_P RELEASE RENAME REPEATABLE REPLACE REPLICA RESET RESTART
-	RESTRICT RETURNING RETURNS REVOKE RIGHT ROLE ROLLBACK ROW ROWS RULE
+/* PGXC_BEGIN */
+	RELATIVE_P RELEASE RENAME REPEATABLE REPLACE REPLICA REPLICATION RESET RESTART
+	RESTRICT RETURNING RETURNS REVOKE RIGHT ROBIN ROLE ROLLBACK ROUND ROW ROWS RULE
+/* PGXC_END */
 
 	SAVEPOINT SCHEMA SCROLL SEARCH SECOND_P SECURITY SELECT SEQUENCE
 	SERIALIZABLE SERVER SESSION SESSION_USER SET SETOF SHARE
@@ -668,6 +687,7 @@ stmt :
 			| DropUserMappingStmt
 			| DropdbStmt
 			| ExecuteStmt
+			| ExecDirectStmt
 			| ExplainStmt
 			| FetchStmt
 			| GrantStmt
@@ -2036,7 +2056,10 @@ opt_using:
  *****************************************************************************/
 
 CreateStmt:	CREATE OptTemp TABLE qualified_name '(' OptTableElementList ')'
-			OptInherit OptWith OnCommitOption OptTableSpace
+			OptInherit OptWith OnCommitOption OptTableSpace 
+/* PGXC_BEGIN */
+			OptDistributeBy
+/* PGXC_END */
 				{
 					CreateStmt *n = makeNode(CreateStmt);
 					$4->istemp = $2;
@@ -2047,10 +2070,21 @@ CreateStmt:	CREATE OptTemp TABLE qualified_name '(' OptTableElementList ')'
 					n->options = $9;
 					n->oncommit = $10;
 					n->tablespacename = $11;
+					n->distributeby = $12;
+/* PGXC_BEGIN */
+					if (n->inhRelations != NULL && n->distributeby != NULL)
+						ereport(ERROR,
+								(errcode(ERRCODE_SYNTAX_ERROR),
+								 errmsg("CREATE TABLE cannot contains both an INHERITS and a DISTRIBUTE BY clause"),
+								 scanner_errposition(exprLocation((Node *) n->distributeby))));
+/* PGXC_END */
 					$$ = (Node *)n;
 				}
 		| CREATE OptTemp TABLE qualified_name OF qualified_name
-			'(' OptTableElementList ')' OptWith OnCommitOption OptTableSpace
+			'(' OptTableElementList ')' OptWith OnCommitOption OptTableSpace 
+/* PGXC_BEGIN */
+			OptDistributeBy
+/* PGXC_END */
 				{
 					/* SQL99 CREATE TABLE OF <UDT> (cols) seems to be satisfied
 					 * by our inheritance capabilities. Let's try it...
@@ -2064,6 +2098,14 @@ CreateStmt:	CREATE OptTemp TABLE qualified_name '(' OptTableElementList ')'
 					n->options = $10;
 					n->oncommit = $11;
 					n->tablespacename = $12;
+					n->distributeby = $13;
+/* PGXC_BEGIN */
+					if (n->inhRelations != NULL && n->distributeby != NULL)
+						ereport(ERROR,
+								(errcode(ERRCODE_SYNTAX_ERROR),
+								 errmsg("CREATE TABLE cannot contains both an INHERITS and a DISTRIBUTE BY clause"),
+								 scanner_errposition(exprLocation((Node *) n->distributeby))));
+/* PGXC_END */
 					$$ = (Node *)n;
 				}
 		;
@@ -2495,6 +2537,36 @@ OptTableSpace:   TABLESPACE name					{ $$ = $2; }
 			| /*EMPTY*/								{ $$ = NULL; }
 		;
 
+/* PGXC_BEGIN */
+DistributeByHash: DISTRIBUTE BY				/* HASH keyword is optional */
+			| DISTRIBUTE BY HASH
+		;
+/* OptDistributeBy: yields a DistributeBy node, or NULL when the clause is absent */
+OptDistributeBy: DistributeByHash '(' name ')'
+				{
+					DistributeBy *n = makeNode(DistributeBy);
+					n->disttype = DISTTYPE_HASH;
+					n->colname = $3;		/* the column named in parentheses */
+					$$ = n;
+				}
+			| DISTRIBUTE BY REPLICATION
+				{
+					DistributeBy *n = makeNode(DistributeBy);
+					n->disttype = DISTTYPE_REPLICATION;
+					n->colname = NULL;		/* no column for this distribution type */
+					$$ = n;
+				}
+			| DISTRIBUTE BY ROUND ROBIN
+				{
+					DistributeBy *n = makeNode(DistributeBy);
+					n->disttype = DISTTYPE_ROUNDROBIN;
+					n->colname = NULL;		/* no column for this distribution type */
+					$$ = n;
+				}
+			| /*EMPTY*/								{ $$ = NULL; }
+		;
+/* PGXC_END */
+
 OptConsTableSpace:   USING INDEX TABLESPACE name	{ $$ = $4; }
 			| /*EMPTY*/								{ $$ = NULL; }
 		;
@@ -6461,6 +6533,47 @@ opt_analyze:
 /*****************************************************************************
  *
  *		QUERY:
+ *				EXECUTE DIRECT ON (COORDINATOR | NODE (num, ... | *)) 'query'
+ *
+ *****************************************************************************/
+
+ExecDirectStmt: EXECUTE DIRECT ON COORDINATOR DirectStmt
+				{
+					ExecDirectStmt *n = makeNode(ExecDirectStmt);
+					n->coordinator = TRUE;
+					n->nodes = NIL;			/* no node list in the COORDINATOR form */
+					n->query = $5;			/* statement text from DirectStmt */
+					$$ = (Node *)n;
+				}
+				| EXECUTE DIRECT ON NODE node_list DirectStmt
+				{
+					ExecDirectStmt *n = makeNode(ExecDirectStmt);
+					n->coordinator = FALSE;
+					n->nodes = $5;			/* list of Integer node numbers */
+					n->query = $6;			/* statement text from DirectStmt */
+					$$ = (Node *)n;
+				}
+		;
+/* DirectStmt: the statement to run, written as a string constant */
+DirectStmt:
+			Sconst					/* by default all are $$=$1 */
+		;
+/* node_list: Integer nodes, listed explicitly or generated by '*' */
+node_list: 
+		 	Iconst					{ $$ = list_make1(makeInteger($1)); }
+			| node_list ',' Iconst	{ $$ = lappend($1, makeInteger($3)); }	/* NOTE(review): also accepts "*, n" -- confirm intended */
+			| '*'
+				{
+					/* '*' expands to node numbers 1 through NumDataNodes */
+					int i;
+					$$ = NIL;
+					for (i=1; i<=NumDataNodes; i++)
+						$$ = lappend($$, makeInteger(i));
+				}
+		;
+
+/*****************************************************************************
+ *
+ *		QUERY:
  *				PREPARE <plan_name> [(args, ...)] AS <query>
  *
  *****************************************************************************/
@@ -10117,6 +10230,7 @@ ColLabel:	IDENT									{ $$ = $1; }
 
 /* "Unreserved" keywords --- available for use as any kind of name.
  */
+/* PGXC - added COORDINATOR, DIRECT, DISTRIBUTE, HASH, NODE, REPLICATION, ROBIN, ROUND */
 unreserved_keyword:
 			  ABORT_P
 			| ABSOLUTE_P
@@ -10157,6 +10271,7 @@ unreserved_keyword:
 			| CONTENT_P
 			| CONTINUE_P
 			| CONVERSION_P
+			| COORDINATOR
 			| COPY
 			| COST
 			| CREATEDB
@@ -10178,8 +10293,12 @@ unreserved_keyword:
 			| DELIMITER
 			| DELIMITERS
 			| DICTIONARY
+			| DIRECT
 			| DISABLE_P
 			| DISCARD
+/* PGXC_BEGIN */
+			| DISTRIBUTE
+/* PGXC_END */
 			| DOCUMENT_P
 			| DOMAIN_P
 			| DOUBLE_P
@@ -10204,6 +10323,9 @@ unreserved_keyword:
 			| GLOBAL
 			| GRANTED
 			| HANDLER
+/* PGXC_BEGIN */
+			| HASH
+/* PGXC_END */
 			| HEADER_P
 			| HOLD
 			| HOUR_P
@@ -10253,6 +10375,7 @@ unreserved_keyword:
 			| NOCREATEDB
 			| NOCREATEROLE
 			| NOCREATEUSER
+			| NODE
 			| NOINHERIT
 			| NOLOGIN_P
 			| NOSUPERUSER
@@ -10294,13 +10417,22 @@ unreserved_keyword:
 			| REPEATABLE
 			| REPLACE
 			| REPLICA
+/* PGXC_BEGIN */
+			| REPLICATION
+/* PGXC_END */
 			| RESET
 			| RESTART
 			| RESTRICT
 			| RETURNS
 			| REVOKE
+/* PGXC_BEGIN */
+			| ROBIN
+/* PGXC_END */
 			| ROLE
 			| ROLLBACK
+/* PGXC_BEGIN */
+			| ROUND
+/* PGXC_END */
 			| ROWS
 			| RULE
 			| SAVEPOINT
diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c
index e5a3621cce..1336e00a45 100644
--- a/src/backend/parser/parse_utilcmd.c
+++ b/src/backend/parser/parse_utilcmd.c
@@ -18,6 +18,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  *	$PostgreSQL: pgsql/src/backend/parser/parse_utilcmd.c,v 2.21 2009/06/11 14:49:00 momjian Exp $
  *
@@ -48,6 +49,11 @@
 #include "parser/parse_relation.h"
 #include "parser/parse_type.h"
 #include "parser/parse_utilcmd.h"
+#ifdef PGXC
+#include "pgxc/locator.h"
+#include "pgxc/pgxc.h"
+#endif
+
 #include "rewrite/rewriteManip.h"
 #include "utils/acl.h"
 #include "utils/builtins.h"
@@ -75,6 +81,10 @@ typedef struct
 	List	   *alist;			/* "after list" of things to do after creating
 								 * the table */
 	IndexStmt  *pkey;			/* PRIMARY KEY index, if any */
+#ifdef PGXC
+	char	  *fallback_dist_col;	/* suggested column to distribute on */
+	DistributeBy *distributeby; /* original distribute by column in create table */
+#endif
 } CreateStmtContext;
 
 /* State shared by transformCreateSchemaStmt and its subroutines */
@@ -114,7 +124,9 @@ static void transformFKConstraints(ParseState *pstate,
 static void transformConstraintAttrs(List *constraintList);
 static void transformColumnType(ParseState *pstate, ColumnDef *column);
 static void setSchemaName(char *context_schema, char **stmt_schema_name);
-
+#ifdef PGXC
+static void checkLocalFKConstraints(CreateStmtContext *cxt);
+#endif
 
 /*
  * transformCreateStmt -
@@ -177,6 +189,10 @@ transformCreateStmt(CreateStmt *stmt, const char *queryString)
 	cxt.alist = NIL;
 	cxt.pkey = NULL;
 	cxt.hasoids = interpretOidsOption(stmt->options);
+#ifdef PGXC
+	cxt.fallback_dist_col = NULL;
+	cxt.distributeby = stmt->distributeby;
+#endif
 
 	/*
 	 * Run through each primary element in the table creation clause. Separate
@@ -244,6 +260,18 @@ transformCreateStmt(CreateStmt *stmt, const char *queryString)
 	result = list_concat(result, cxt.alist);
 	result = list_concat(result, save_alist);
 
+#ifdef PGXC
+	/* 
+	 * If the user did not specify any distribution clause and there is no 
+	 * inherits clause, try and use PK or unique index 
+	 */
+	if (!stmt->distributeby && !stmt->inhRelations && cxt.fallback_dist_col)
+	{
+		stmt->distributeby = (DistributeBy *) palloc0(sizeof(DistributeBy));
+		stmt->distributeby->disttype = DISTTYPE_HASH;
+		stmt->distributeby->colname = cxt.fallback_dist_col;
+	}
+#endif
 	return result;
 }
 
@@ -307,7 +335,7 @@ transformColumnDefinition(ParseState *pstate, CreateStmtContext *cxt,
 		char	   *snamespace;
 		char	   *sname;
 		char	   *qstring;
-		A_Const    *snamenode;
+		A_Const	   *snamenode;
 		TypeCast   *castnode;
 		FuncCall   *funccallnode;
 		CreateSeqStmt *seqstmt;
@@ -1061,6 +1089,7 @@ transformIndexConstraints(ParseState *pstate, CreateStmtContext *cxt)
 	}
 }
 
+
 /*
  * transformIndexConstraint
  *		Transform one UNIQUE or PRIMARY KEY constraint for
@@ -1072,6 +1101,10 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt)
 	IndexStmt  *index;
 	ListCell   *keys;
 	IndexElem  *iparam;
+#ifdef PGXC
+		bool		isLocalSafe = false;
+#endif
+
 
 	index = makeNode(IndexStmt);
 
@@ -1126,6 +1159,22 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt)
 			if (strcmp(column->colname, key) == 0)
 			{
 				found = true;
+
+#ifdef PGXC
+				/*
+			 	 * Only allow locally enforceable constraints.
+				 * See if it is a distribution column
+				 * If not set, set it to first column in index.
+				 * If primary key, we prefer that over a unique constraint.
+				 */
+			   if (IS_PGXC_COORDINATOR && !isLocalSafe)
+			   {
+					if (cxt->distributeby)
+						isLocalSafe = CheckLocalIndexColumn (
+								ConvertToLocatorType(cxt->distributeby->disttype), 
+								cxt->distributeby->colname, key);
+			   }
+#endif
 				break;
 			}
 		}
@@ -1219,6 +1268,27 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt)
 			}
 		}
 
+#ifdef PGXC
+		if (IS_PGXC_COORDINATOR)
+		{
+			/*
+			 * Set fallback distribution column.
+			 * If not set, set it to first column in index.
+			 * If primary key, we prefer that over a unique constraint.
+			 */
+			if (index->indexParams == NIL
+					&& (index->primary || !cxt->fallback_dist_col))
+			{
+				cxt->fallback_dist_col = pstrdup(key);
+			}
+
+			/* Existing table: only consult its locator info when it has some */
+			if (!cxt->distributeby && !isLocalSafe && cxt->rel && cxt->rel->rd_locator_info)
+				isLocalSafe = CheckLocalIndexColumn(cxt->rel->rd_locator_info->locatorType,
+						cxt->rel->rd_locator_info->partAttrName, key);
+		}
+#endif
+
 		/* OK, add it to the index definition */
 		iparam = makeNode(IndexElem);
 		iparam->name = pstrdup(key);
@@ -1228,6 +1298,13 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt)
 		iparam->nulls_ordering = SORTBY_NULLS_DEFAULT;
 		index->indexParams = lappend(index->indexParams, iparam);
 	}
+#ifdef PGXC
+		if (IS_PGXC_COORDINATOR && cxt->distributeby 
+				&& cxt->distributeby->disttype == DISTTYPE_HASH && !isLocalSafe)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+					errmsg("Unique index of partitioned table must contain the hash distribution column.")));
+#endif
 
 	return index;
 }
@@ -1256,9 +1333,34 @@ transformFKConstraints(ParseState *pstate, CreateStmtContext *cxt,
 			FkConstraint *fkconstraint = (FkConstraint *) lfirst(fkclist);
 
 			fkconstraint->skip_validation = true;
+#ifdef PGXC
+			/*
+			 * Set fallback distribution column.
+			 * If not yet set, use the first column of the FK constraint,
+			 * provided it references the hash column of a partitioned table.
+			 */
+			if (IS_PGXC_COORDINATOR && !cxt->fallback_dist_col
+				&& fkconstraint->pk_attrs != NIL && fkconstraint->fk_attrs != NIL)
+			{
+				Oid pk_rel_id = RangeVarGetRelid(fkconstraint->pktable, false);
+
+				/* make sure it is a partitioned column */
+				if (IsHashColumnForRelId(pk_rel_id, strVal(linitial(fkconstraint->pk_attrs))))
+				{
+					/* palloc'd copy of the first FK column; no malloc'd temporary */
+					cxt->fallback_dist_col = pstrdup(strVal(linitial(fkconstraint->fk_attrs)));
+				}
+			}
+#endif
 		}
 	}
 
+#ifdef PGXC
+	/* Only allow constraints that are locally enforceable - no distributed ones */
+	if (IS_PGXC_COORDINATOR)
+		checkLocalFKConstraints(cxt);
+#endif
+
 	/*
 	 * For CREATE TABLE or ALTER TABLE ADD COLUMN, gin up an ALTER TABLE ADD
 	 * CONSTRAINT command to execute after the basic command is complete. (If
@@ -1714,6 +1816,10 @@ transformAlterTableStmt(AlterTableStmt *stmt, const char *queryString)
 	cxt.blist = NIL;
 	cxt.alist = NIL;
 	cxt.pkey = NULL;
+#ifdef PGXC
+	cxt.fallback_dist_col = NULL;
+	cxt.distributeby = NULL;
+#endif
 
 	/*
 	 * The only subtypes that currently require parse transformation handling
@@ -2115,3 +2221,118 @@ setSchemaName(char *context_schema, char **stmt_schema_name)
 						"different from the one being created (%s)",
 						*stmt_schema_name, context_schema)));
 }
+
+#ifdef PGXC
+/*
+ * CheckLocalIndexColumn
+ *
+ * Returns true when a unique index on indexcolname can be enforced by each
+ * node on its own; raises an error for round robin tables.
+ */
+bool
+CheckLocalIndexColumn (char loctype, char *partcolname, char *indexcolname)
+{
+	switch (loctype)
+	{
+		case LOCATOR_TYPE_REPLICATED:
+			return true;		/* always safe */
+		case LOCATOR_TYPE_RROBIN:
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+					errmsg("Cannot locally enforce a unique index on round robin distributed table.")));
+			break;
+		case LOCATOR_TYPE_HASH:
+			return partcolname && indexcolname && strcmp(partcolname, indexcolname) == 0;
+	}
+	return false;
+}
+
+
+/*
+ * checkLocalFKConstraints
+ *
+ * Verify that every foreign key in cxt->fkconstraints can be enforced
+ * locally on each node; an error is raised for the first one that cannot.
+ *
+ * A reference to a replicated table is always accepted and a reference to
+ * a round robin table is always rejected.  Otherwise, when the referencing
+ * table has a hash distribution column, that column must appear in the
+ * constraint and must be paired with the hash column of the referenced table.
+ */
+static void
+checkLocalFKConstraints(CreateStmtContext *cxt)
+{
+	ListCell *fkclist;
+
+	foreach(fkclist, cxt->fkconstraints)
+	{
+		FkConstraint *fkconstraint = (FkConstraint *) lfirst(fkclist);
+		Oid pk_rel_id = RangeVarGetRelid(fkconstraint->pktable, false);
+		char refloctype = GetLocatorType(pk_rel_id);
+		char *checkcolname = NULL;
+		ListCell *attritem;
+		int pos = 0;
+
+		/* If referenced table is replicated, the constraint is safe */
+		if (refloctype == LOCATOR_TYPE_REPLICATED)
+			continue;
+
+		if (refloctype == LOCATOR_TYPE_RROBIN)
+			ereport(ERROR,
+					(errcode(ERRCODE_SYNTAX_ERROR),
+					errmsg("Cannot reference a round robin table in a foreign key constraint")));
+
+		/*
+		 * Work out the hash distribution column of the referencing table.
+		 * For ALTER TABLE it comes from the relation's locator info (which
+		 * may be absent); for CREATE TABLE it comes from the DISTRIBUTE BY
+		 * clause or, failing that, the fallback column chosen so far.
+		 */
+		if (cxt->isalter)
+		{
+			if (cxt->rel != NULL && cxt->rel->rd_locator_info != NULL
+				&& cxt->rel->rd_locator_info->locatorType == LOCATOR_TYPE_HASH)
+				checkcolname = cxt->rel->rd_locator_info->partAttrName;
+		}
+		else if (cxt->distributeby)
+		{
+			if (cxt->distributeby->disttype == DISTTYPE_HASH)
+				checkcolname = cxt->distributeby->colname;
+		}
+		else
+			checkcolname = cxt->fallback_dist_col;
+
+		/* Not hash distributed (or not known yet): nothing more to check */
+		if (checkcolname == NULL)
+			continue;
+
+		/*
+		 * Find the ordinal position of the distribution column within the
+		 * constraint.  Compare against checkcolname rather than cxt->rel,
+		 * which is only consulted for ALTER TABLE above.
+		 */
+		foreach(attritem, fkconstraint->fk_attrs)
+		{
+			if (strcmp(checkcolname, strVal(lfirst(attritem))) == 0)
+				break;
+			pos++;
+		}
+
+		if (pos >= list_length(fkconstraint->fk_attrs))
+			ereport(ERROR,
+					(errcode(ERRCODE_SYNTAX_ERROR),
+					errmsg("Hash distributed table must include distribution column in index")));
+
+		/*
+		 * Verify that the referenced table is partitioned at the same
+		 * position in the index.  An omitted or shorter referenced column
+		 * list cannot be matched by position, so it is rejected as well.
+		 */
+		if (pos >= list_length(fkconstraint->pk_attrs) ||
+			!IsHashColumnForRelId(pk_rel_id, strVal(list_nth(fkconstraint->pk_attrs, pos))))
+			ereport(ERROR,
+					(errcode(ERRCODE_SYNTAX_ERROR),
+					errmsg("Hash distribution column does not refer to hash distribution column in referenced table.")));
+	}
+}
+#endif
diff --git a/src/backend/pgxc/Makefile b/src/backend/pgxc/Makefile
new file mode 100644
index 0000000000..d978720b1c
--- /dev/null
+++ b/src/backend/pgxc/Makefile
@@ -0,0 +1,16 @@
+#
+# Makefile for the pgxc (Postgres-XC) backend module
+#
+#
+# Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+#
+# $PostgreSQL$
+#
+
+subdir = src/backend/pgxc
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+SUBDIRS	    = locator plan pool
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/pgxc/locator/Makefile b/src/backend/pgxc/locator/Makefile
new file mode 100644
index 0000000000..026a247940
--- /dev/null
+++ b/src/backend/pgxc/locator/Makefile
@@ -0,0 +1,20 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+#    Makefile for locator
+#
+#
+# Copyright(C) 2010 Nippon Telegraph and Telephone Corporation
+#
+# IDENTIFICATION
+#    $PostgreSQL$
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/pgxc/locator
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = locator.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c
new file mode 100644
index 0000000000..995a64cb4a
--- /dev/null
+++ b/src/backend/pgxc/locator/locator.c
@@ -0,0 +1,607 @@
+/*-------------------------------------------------------------------------
+ *
+ * locator.c
+ *		Functions that help manage table location information such as
+ * partitioning and replication information.
+ *
+ *
+ * PGXCTODO - do not use a single mappingTable for all
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *		$$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <time.h>
+
+#include "postgres.h"
+#include "access/skey.h"
+#include "access/relscan.h"
+#include "catalog/indexing.h"
+#include "catalog/pg_type.h"
+#include "nodes/pg_list.h"
+#include "utils/builtins.h"
+#include "utils/catcache.h"
+#include "utils/fmgroids.h"
+#include "utils/lsyscache.h"
+#include "utils/rel.h"
+#include "utils/relcache.h"
+#include "utils/tqual.h"
+#include "pgxc/poolmgr.h"
+#include "pgxc/locator.h"
+
+#include "catalog/pgxc_class.h"
+#include "catalog/namespace.h"
+
+
+/* PGXCTODO For prototype, relations use the same hash mapping table.
+ * Long term, make it a pointer in RelationLocInfo, and have
+ * similarly handled tables point to the same mapping table,
+ * to check faster for equivalency
+ */
+int			mappingTable[HASH_SIZE];
+
+bool		locatorInited = false;
+
+
+/* GUC parameter */
+char	   *PreferredDataNodes = NULL;
+
+/* Preferred to use when reading from replicated tables */
+static List *globalPreferredNodes = NIL;
+
+/*
+ * init_mapping_table - initializes a mapping table
+ *
+ * Every one of the HASH_SIZE buckets is given a node number 1..nodeCount.
+ * PGXCTODO
+ * For the prototype, all partitioned tables will use the same partition map.
+ * We cannot assume this long term
+ */
+static void
+init_mapping_table(int nodeCount, int mapTable[])
+{
+	int			bucket;
+
+	/* deal the buckets out to the nodes in turn */
+	for (bucket = 0; bucket < HASH_SIZE; bucket++)
+		mapTable[bucket] = 1 + bucket % nodeCount;
+}
+
+
+/*
+ * GetAnyDataNode - choose a data node when any one of them will do.
+ * The head of the global preferred list wins; otherwise node 1 is used.
+ */
+int
+GetAnyDataNode(void)
+{
+	int			node = 1;		/* default when nothing is preferred */
+
+	if (globalPreferredNodes != NIL)
+		node = linitial_int(globalPreferredNodes);
+	return node;
+}
+
+
+/*
+ * hash_range - hash a string key to a bucket number, masked with HASH_MASK
+ *
+ * Note, this function corresponds to GridSQL hashing
+ * and is used here to allow us to wire up GridSQL
+ * to the same underlying nodes
+ */
+static int
+hash_range(char *key)
+{
+	int			pos;
+	int			keylen;
+	int			acc;
+
+	/*
+	 * Only a NULL pointer is short-circuited; an empty string is hashed
+	 * like any other key.
+	 */
+	if (key == NULL)
+		return 0;
+
+	keylen = strlen(key);
+	acc = 0x238F13AF * keylen;
+
+	for (pos = 0; pos < keylen; pos++)
+		acc += (key[pos] << ((pos * 5) % 24)) & 0x7fffffff;
+
+	/* scramble, then fold into the bucket range */
+	return ((1103515243 * acc + 12345) % 65537) & HASH_MASK;
+}
+
+/*
+ * hash_range_int - hashes the integer key to a value between 0 and HASH_SIZE
+ *
+ * The key is formatted as decimal text and handed to hash_range.
+ */
+static int
+hash_range_int(int intkey)
+{
+	char		digits[13];		/* plenty for 32 bit int */
+
+	snprintf(digits, 12, "%d", intkey);
+	digits[12] = '\0';			/* final byte is never written by snprintf */
+
+	return hash_range(digits);
+}
+
+
+/*
+ * get_node_from_hash - determine node based on hash bucket
+ * Valid buckets are 0 .. HASH_SIZE - 1, the index range of mappingTable.
+ */
+static int
+get_node_from_hash(int hash)
+{
+	if (hash >= HASH_SIZE || hash < 0)
+	{
+		ereport(ERROR, (errmsg("Hash value out of range\n")));
+	}
+
+	return mappingTable[hash];
+}
+
+
+/*
+ * IsHashDistributable
+ *
+ * Returns whether or not the data type is hash distributable with PG-XC
+ * PGXCTODO - expand support for other data types!
+ */
+bool
+IsHashDistributable(Oid col_type)
+{
+	/* only INT2 and INT4 columns are accepted for now */
+	return (col_type == INT2OID || col_type == INT4OID);
+}
+
+
+/*
+ * GetRelationHashColumn - return a copy of the relation's hash column name.
+ * Returns NULL if the relation is not hash partitioned.
+ */
+char *
+GetRelationHashColumn(RelationLocInfo * rel_loc_info)
+{
+	char	   *hash_col;
+
+	/* nothing to report without locator info */
+	if (rel_loc_info == NULL)
+		return NULL;
+
+	/* nor for any other kind of distribution */
+	if (rel_loc_info->locatorType != LOCATOR_TYPE_HASH)
+		return NULL;
+
+	/*
+	 * Hand back a private palloc'd copy of the column name.
+	 */
+	hash_col = pstrdup(rel_loc_info->partAttrName);
+
+	return hash_col;
+}
+
+/*
+ * IsHashColumn - return whether part_col_name is the hash column described
+ * by rel_loc_info.
+ */
+bool
+IsHashColumn(RelationLocInfo * rel_loc_info, char *part_col_name)
+{
+	/* both the locator info and a column name are required */
+	if (rel_loc_info == NULL || part_col_name == NULL)
+		return false;
+
+	/* only hash distributed relations have a hash column */
+	if (rel_loc_info->locatorType != LOCATOR_TYPE_HASH)
+		return false;
+
+	/* exact, case sensitive match on the column name */
+	return strcmp(part_col_name, rel_loc_info->partAttrName) == 0;
+}
+
+
+/*
+ * IsHashColumnForRelId - return whether the column is the hash column of relid.
+ */
+bool
+IsHashColumnForRelId(Oid relid, char *part_col_name)
+{
+	RelationLocInfo *rel_loc_info = GetRelationLocInfo(relid);
+	bool		is_hash = IsHashColumn(rel_loc_info, part_col_name);
+	FreeRelationLocInfo(rel_loc_info);	/* private copy, no longer needed */
+	return is_hash;
+}
+
+
+/*
+ * GetRoundRobinNode - return the current round robin node, then advance.
+ *
+ * PGXCTODO - may not want to bother with locking here, we could track
+ * these in the session memory context instead...
+ */
+int
+GetRoundRobinNode(Oid relid)
+{
+	Relation	rel = relation_open(relid, AccessShareLock);
+	RelationLocInfo *locinfo = rel->rd_locator_info;
+	ListCell   *current;
+	int			chosen;
+
+	Assert(locinfo->locatorType == LOCATOR_TYPE_REPLICATED ||
+		   locinfo->locatorType == LOCATOR_TYPE_RROBIN);
+
+	/* the node to hand out is the one the cursor currently rests on */
+	current = locinfo->roundRobinNode;
+	chosen = lfirst_int(current);
+
+	/* advance the cursor, wrapping to the head after the last cell */
+	locinfo->roundRobinNode =
+		(current->next != NULL) ? current->next : locinfo->nodeList->head;
+
+	relation_close(rel, AccessShareLock);
+
+	return chosen;
+}
+
+
+/*
+ * GetRelationNodes
+ *
+ * Get the list of nodes to use for a read (isRead) or write on a relation.
+ * If the table is replicated and we are reading, we can just pick one;
+ * writes go to every node.
+ * If the table is partitioned, we apply partitioning column value, if possible.
+ *
+ * If the relation is partitioned, partValue will be applied if present
+ * (indicating a value appears for partitioning column), otherwise it
+ * is ignored.
+ *
+ * The global preferred node list is only used for reads of replicated
+ * tables: the first preferred node that the table is replicated on is
+ * used, falling back to the relation's round robin node.
+ *
+ * The returned List is a copy, so it should be freed when finished.
+ */
+List *
+GetRelationNodes(RelationLocInfo * rel_loc_info, long *partValue, int isRead)
+{
+	ListCell   *prefItem;
+	List	   *destList = NULL;
+
+
+	if (rel_loc_info == NULL)
+		return NULL;
+
+	switch (rel_loc_info->locatorType)
+	{
+		case LOCATOR_TYPE_REPLICATED:
+
+			if (!isRead)
+				/* we need to write to all synchronously */
+				destList = list_copy(rel_loc_info->nodeList);
+			else
+			{
+				/*
+				 * Try and pick from the preferred list.  The first preferred
+				 * node that is valid for this relation wins and the search
+				 * stops there, so the order of the list expresses priority
+				 * and no intermediate list is leaked.
+				 */
+				foreach(prefItem, globalPreferredNodes)
+				{
+					if (list_member_int(rel_loc_info->nodeList,
+										lfirst_int(prefItem)))
+					{
+						destList = lappend_int(NULL, lfirst_int(prefItem));
+						break;
+					}
+				}
+			}
+
+			if (destList == NULL)
+			{
+				/*
+				 * read from just one of them
+				 * use round robin mechanism
+				 */
+				destList = lappend_int(NULL, GetRoundRobinNode(rel_loc_info->relid));
+			}
+			break;
+
+		case LOCATOR_TYPE_HASH:
+
+			if (partValue != NULL)
+			{
+				/* in prototype, all partitioned tables use same map */
+				destList = lappend_int(NULL, get_node_from_hash(hash_range_int(*partValue)));
+			}
+			else
+			{
+				/*
+				 * No partitioning value passed in
+				 * (no where qualification on part column - use all)
+				 */
+				destList = list_copy(rel_loc_info->nodeList);
+			}
+			break;
+
+		case LOCATOR_TYPE_SINGLE:
+
+			/* just return first (there should only be one) */
+			destList = list_copy(rel_loc_info->nodeList);
+			break;
+
+		case LOCATOR_TYPE_RROBIN:
+
+			/* round robin, get next one */
+			if (isRead)
+			{
+				/* we need to read from all */
+				destList = list_copy(rel_loc_info->nodeList);
+			}
+			else
+			{
+				/* write to just one of them */
+				destList = lappend_int(NULL, GetRoundRobinNode(rel_loc_info->relid));
+			}
+
+			break;
+
+			/* PGXCTODO case LOCATOR_TYPE_RANGE: */
+			/* PGXCTODO case LOCATOR_TYPE_CUSTOM: */
+		default:
+			ereport(ERROR, (errmsg("Error: no such supported locator type: %c\n",
+								   rel_loc_info->locatorType)));
+			break;
+	}
+
+	return destList;
+}
+
+
+/*
+ * ConvertToLocatorType
+ *		map a DISTTYPE_* distribution type to its LOCATOR_TYPE_* code
+ * We really should just have pgxc_class use disttype instead...
+ */
+char
+ConvertToLocatorType(int disttype)
+{
+	switch (disttype)
+	{
+		case DISTTYPE_HASH:
+			return LOCATOR_TYPE_HASH;
+
+		case DISTTYPE_ROUNDROBIN:
+			return LOCATOR_TYPE_RROBIN;
+
+		case DISTTYPE_REPLICATION:
+			return LOCATOR_TYPE_REPLICATED;
+
+		default:
+			/* any other distribution type is rejected */
+			ereport(ERROR,
+					(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+					 errmsg("Invalid distribution type")));
+			break;
+	}
+
+	/* only here to give the function a value on every path */
+	return '\0';
+}
+
+
+/*
+ * GetLocatorType - Returns the locator type of the table
+ *
+ */
+char
+GetLocatorType(Oid relid)
+{
+	RelationLocInfo *locinfo = GetRelationLocInfo(relid);
+
+	/* relations without locator info report the NUL character */
+	if (locinfo == NULL)
+		return '\0';
+
+	/* the copy obtained above is not freed here */
+	return locinfo->locatorType;
+}
+
+
+/*
+ * GetAllNodes - build a fresh list of the node numbers 1 .. NumDataNodes.
+ * We assume all tables use all nodes in the prototype, so every node
+ * is included.
+ */
+List *
+GetAllNodes(void)
+{
+	List	   *allNodes = NIL;
+	int			nodeno = 1;
+
+	/*
+	 * PGXCTODO - add support for having nodes on a subset of nodes
+	 * For now, assume on all nodes
+	 */
+	while (nodeno <= NumDataNodes)
+	{
+		allNodes = lappend_int(allNodes, nodeno);
+		nodeno++;
+	}
+
+	return allNodes;
+}
+
+
+/*
+ * RelationBuildLocator - load the relation's pgxc_class row, if any, into
+ * rel->rd_locator_info (allocated in CacheMemoryContext).
+ */
+void
+RelationBuildLocator(Relation rel)
+{
+	Relation	pcrel;
+	ScanKeyData skey;
+	SysScanDesc pcscan;
+	HeapTuple	htup;
+	MemoryContext oldContext;
+	RelationLocInfo *relationLocInfo;
+	int			i;
+	int			offset;
+	Form_pgxc_class pgxc_class;
+
+
+	/* PGXCTODO temporarily use the same mapping table for all
+	 * Use all nodes.
+	 */
+	if (!locatorInited)
+	{
+		init_mapping_table(NumDataNodes, mappingTable);
+		locatorInited = true;
+	}
+
+	ScanKeyInit(&skey,
+				Anum_pgxc_class_pcrelid,
+				BTEqualStrategyNumber, F_OIDEQ,
+				ObjectIdGetDatum(RelationGetRelid(rel)));
+
+	pcrel = heap_open(PgxcClassRelationId, AccessShareLock);
+	pcscan = systable_beginscan(pcrel, PgxcClassPgxcRelIdIndexId, true,
+								SnapshotNow, 1, &skey);
+	htup = systable_getnext(pcscan);
+
+	if (!HeapTupleIsValid(htup))
+	{
+		/* Assume local relation only */
+		rel->rd_locator_info = NULL;
+		systable_endscan(pcscan);
+		heap_close(pcrel, AccessShareLock);
+		return;
+	}
+
+	pgxc_class = (Form_pgxc_class) GETSTRUCT(htup);
+
+	oldContext = MemoryContextSwitchTo(CacheMemoryContext);
+
+	/* zeroed, so roundRobinNode is NULL unless it is set below */
+	relationLocInfo = (RelationLocInfo *) palloc0(sizeof(RelationLocInfo));
+	rel->rd_locator_info = relationLocInfo;
+
+	relationLocInfo->relid = RelationGetRelid(rel);
+	relationLocInfo->locatorType = pgxc_class->pclocatortype;
+	relationLocInfo->partAttrNum = pgxc_class->pcattnum;
+
+	relationLocInfo->partAttrName = get_attname(relationLocInfo->relid,
+												pgxc_class->pcattnum);
+
+	/* PGXCTODO - add support for having nodes on a subset of nodes
+	 * For now, assume on all nodes
+	 */
+	relationLocInfo->nodeList = GetAllNodes();
+	relationLocInfo->nodeCount = relationLocInfo->nodeList->length;
+
+	/*
+	 * If the locator type is round robin, we set a node to
+	 * use next time. In addition, if it is replicated,
+	 * we choose a node to use for balancing reads.
+	 */
+	if (relationLocInfo->locatorType == LOCATOR_TYPE_RROBIN
+		|| relationLocInfo->locatorType == LOCATOR_TYPE_REPLICATED)
+	{
+		/*
+		 * Pick a random node to start with.  The offset is
+		 * 0 .. nodeCount - 1 so each node is equally likely, and the
+		 * generator is not re-seeded so its other users are undisturbed.
+		 */
+		offset = random() % relationLocInfo->nodeCount;
+		relationLocInfo->roundRobinNode = relationLocInfo->nodeList->head;		/* initialize */
+
+		for (i = 0; i < offset && relationLocInfo->roundRobinNode->next != NULL; i++)
+		{
+			relationLocInfo->roundRobinNode = relationLocInfo->roundRobinNode->next;
+		}
+	}
+
+	systable_endscan(pcscan);
+	heap_close(pcrel, AccessShareLock);
+
+	MemoryContextSwitchTo(oldContext);
+}
+
+/*
+ * GetRelationLocInfo - Returns the locator information for relation,
+ * in a copy of the RelationLocInfo struct in relcache
+ *
+ */
+RelationLocInfo *
+GetRelationLocInfo(Oid relid)
+{
+	Relation	rel = relation_open(relid, AccessShareLock);
+	RelationLocInfo *copy = NULL;
+
+	/* relations without locator info yield NULL */
+	if (rel != NULL && rel->rd_locator_info != NULL)
+		copy = CopyRelationLocInfo(rel->rd_locator_info);
+
+	relation_close(rel, AccessShareLock);
+
+	return copy;
+}
+
+/*
+ * CopyRelationLocInfo - return a palloc'd copy of a RelationLocInfo struct
+ */
+RelationLocInfo *
+CopyRelationLocInfo(RelationLocInfo * src_info)
+{
+	RelationLocInfo *copy;
+
+	Assert(src_info);
+
+	/* zeroed, so fields not copied below stay NULL / 0 */
+	copy = (RelationLocInfo *) palloc0(sizeof(RelationLocInfo));
+
+	/* scalar fields */
+	copy->relid = src_info->relid;
+	copy->locatorType = src_info->locatorType;
+	copy->partAttrNum = src_info->partAttrNum;
+	copy->nodeCount = src_info->nodeCount;
+
+	/* pointer fields get their own storage */
+	copy->partAttrName = src_info->partAttrName ? pstrdup(src_info->partAttrName) : NULL;
+	copy->nodeList = src_info->nodeList ? list_copy(src_info->nodeList) : NIL;
+
+	/* Note, for round robin, we use the relcache entry */
+	return copy;
+}
+
+
+/*
+ * FreeRelationLocInfo - release the struct and its column name (not nodeList).
+ */
+void
+FreeRelationLocInfo(RelationLocInfo *relationLocInfo)
+{
+	if (relationLocInfo == NULL)
+		return;
+
+	if (relationLocInfo->partAttrName != NULL)
+		pfree(relationLocInfo->partAttrName);
+	pfree(relationLocInfo);
+}
diff --git a/src/backend/pgxc/plan/Makefile b/src/backend/pgxc/plan/Makefile
new file mode 100644
index 0000000000..c0e65741f1
--- /dev/null
+++ b/src/backend/pgxc/plan/Makefile
@@ -0,0 +1,20 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+#    Makefile for plan (PGXC planner)
+#
+#
+# Portions Copyright(C) 2010 Nippon Telegraph and Telephone Corporation
+#
+# IDENTIFICATION
+#    $PostgreSQL$
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/pgxc/plan
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = planner.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c
new file mode 100644
index 0000000000..90703c4896
--- /dev/null
+++ b/src/backend/pgxc/plan/planner.c
@@ -0,0 +1,1290 @@
+/*-------------------------------------------------------------------------
+ *
+ * planner.c
+ *
+ *	  Functions for generating a PGXC style plan.
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group 
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "access/transam.h"
+#include "catalog/pg_type.h"
+#include "nodes/parsenodes.h"
+#include "pgxc/locator.h"
+#include "pgxc/planner.h"
+#include "utils/fmgroids.h"
+#include "utils/lsyscache.h"
+
+
+/*
+ * Convenient format for literal comparisons
+ *
+ * PGXCTODO - make constant type Datum, handle other types
+ */
+typedef struct
+{
+	Oid			relid;
+	RelationLocInfo *rel_loc_info;
+	Oid			attrnum;
+	char	   *col_name;
+	long		constant;		/* assume long PGXCTODO - should be Datum */
+}	Literal_Comparison;
+
+/*
+ * This struct helps us detect special conditions to determine what nodes 
+ * to execute on.
+ */
+typedef struct
+{
+	List	   *partitioned_literal_comps;		/* List of Literal_Comparison */
+	List	   *partitioned_parent_child;
+	List	   *replicated_joins;
+
+	/*
+	 * Used when joining a single replicated or non-replicated table with
+	 * other replicated tables. Use as a basis for partitioning determination.
+	 */
+	char	   *base_rel_name;
+	RelationLocInfo *base_rel_loc_info;
+
+}	Special_Conditions;
+
+/* If two relations are joined based on special location information */
+typedef enum PGXCJoinType
+{
+	JOIN_REPLICATED,
+	JOIN_COLOCATED_PARTITIONED,
+	JOIN_OTHER
+}	PGXCJoinType;
+
+/* used to track which tables are joined */
+typedef struct
+{
+	int			relid1;			/* the first relation */
+	char	   *aliasname1;
+	int			relid2;			/* the second relation */
+	char	   *aliasname2;
+
+	PGXCJoinType join_type;
+}	PGXC_Join;
+
+/* A list of List*'s, one for each relation. */
+List	   *join_list = NULL;
+
+/* Forbid unsafe SQL statements */
+bool		StrictStatementChecking = true;
+
+/* Forbid multi-node SELECT statements with an ORDER BY clause */
+bool		StrictSelectChecking = false;
+
+/*
+ * new_pgxc_join
+ *
+ * Create a new join struct for tracking how relations are joined.
+ *
+ * The pair is stored with the smaller relid (and its alias) first.  When
+ * both relids are equal the arguments keep their order, which is also how
+ * find_pgxc_join treats them.  The alias pointers are stored as given,
+ * not copied, and the join type starts out as JOIN_OTHER.
+ */
+static PGXC_Join *
+new_pgxc_join(int relid1, char *aliasname1, int relid2, char *aliasname2)
+{
+	PGXC_Join   *entry = (PGXC_Join *) palloc(sizeof(PGXC_Join));
+	bool		in_order = (relid1 <= relid2);
+
+	/* swap only when the second relid is strictly smaller */
+	entry->relid1 = in_order ? relid1 : relid2;
+	entry->aliasname1 = in_order ? aliasname1 : aliasname2;
+	entry->relid2 = in_order ? relid2 : relid1;
+	entry->aliasname2 = in_order ? aliasname2 : aliasname1;
+
+	/* how the two are joined has not been worked out yet */
+	entry->join_type = JOIN_OTHER;
+
+	return entry;
+}
+
+
+/*
+ * Look up the join struct for a particular join; NULL if there is none
+ */
+static PGXC_Join *
+find_pgxc_join(int relid1, char *aliasname1, int relid2, char *aliasname2)
+{
+	ListCell   *cell;
+	int			lo_relid = relid1;
+	int			hi_relid = relid2;
+	char	   *lo_alias = aliasname1;
+	char	   *hi_alias = aliasname2;
+
+	/* return if list is still empty */
+	if (join_list == NULL)
+		return NULL;
+
+	/* in the PGXC_Join struct, we always sort with relid1 < relid2 */
+	if (relid2 < relid1)
+	{
+		lo_relid = relid2;
+		lo_alias = aliasname2;
+		hi_relid = relid1;
+		hi_alias = aliasname1;
+	}
+
+	/* linear search: the list is expected to stay small */
+	foreach(cell, join_list)
+	{
+		PGXC_Join   *candidate = (PGXC_Join *) lfirst(cell);
+
+		if (candidate->relid1 != lo_relid || candidate->relid2 != hi_relid)
+			continue;
+		if (strcmp(candidate->aliasname1, lo_alias) == 0
+			&& strcmp(candidate->aliasname2, hi_alias) == 0)
+			return candidate;
+	}
+
+	return NULL;
+}
+
+/*
+ * Find or create a join between 2 relations
+ */
+static PGXC_Join *
+find_or_create_pgxc_join(int relid1, char *aliasname1, int relid2, char *aliasname2)
+{
+	PGXC_Join   *found = find_pgxc_join(relid1, aliasname1, relid2, aliasname2);
+
+	/* already tracked: hand back the existing entry */
+	if (found != NULL)
+		return found;
+
+	/* first sighting of this pair: create an entry and remember it */
+	found = new_pgxc_join(relid1, aliasname1, relid2, aliasname2);
+	join_list = lappend(join_list, found);
+
+	return found;
+}
+
+
+/*
+ * new_special_conditions
+ *
+ * Allocate a Special_Conditions struct with all fields zeroed.
+ */
+static Special_Conditions *
+new_special_conditions()
+{
+	/* palloc0 leaves every list empty and every pointer NULL */
+	return (Special_Conditions *) palloc0(sizeof(Special_Conditions));
+}
+
+/*
+ * free_special_relations - release a Special_Conditions struct
+ */
+static void
+free_special_relations(Special_Conditions * special_conditions)
+{
+	if (special_conditions != NULL)
+	{
+		/* free all items in list, including Literal_Comparison struct */
+		list_free_deep(special_conditions->partitioned_literal_comps);
+
+		/* for these two, free the list but not the items pointed to */
+		list_free(special_conditions->partitioned_parent_child);
+		list_free(special_conditions->replicated_joins);
+
+		pfree(special_conditions);
+	}
+}
+
+/*
+ * free_join_list - free join_list and the PGXC_Join structs it holds,
+ * then reset it so that freed cells are never searched again.
+ */
+static void
+free_join_list()
+{
+	if (join_list == NULL)
+		return;
+	list_free_deep(join_list);
+	join_list = NIL;
+}
+
+/*
+ * get_numeric_constant - extract casted constant
+ *
+ * Searches an expression to see if it is a Constant that is being cast
+ * to numeric.  Return a pointer to the Constant, or NULL.
+ * We need this because of casting.
+ */
+static Expr *
+get_numeric_constant(Expr *expr)
+{
+	/* peel off numeric casts until something else turns up */
+	while (expr != NULL)
+	{
+		FuncExpr   *cast;
+
+		if (IsA(expr, Const))
+			return expr;
+
+		/* We may have a cast, represented by a function */
+		if (!IsA(expr, FuncExpr))
+			break;
+
+		cast = (FuncExpr *) expr;
+		if (cast->funcid != F_NUMERIC && cast->funcid != F_INT4_NUMERIC)
+			break;
+
+		/* We may have an implicit double-cast, so look inside again */
+		expr = linitial(cast->args);
+	}
+
+	return NULL;
+}
+
+
+/*
+ * get_base_var - resolve a Var to the base table column it stands for.
+ * Join range table entries are followed through joinaliasvars; NULL is
+ * returned when the Var does not lead to a plain relation column.
+ */
+static Var *
+get_base_var(Var * var, List *rtables)
+{
+	RangeTblEntry *rte;
+
+	rte = list_nth(rtables, var->varno - 1);
+
+	if (rte->rtekind == RTE_RELATION)
+		return var;
+	else if (rte->rtekind == RTE_JOIN)
+	{
+		Node	   *colvar;
+		/* whole-row or out of range references cannot be resolved */
+		if (var->varattno < 1 || var->varattno > list_length(rte->joinaliasvars))
+			return NULL;
+		/* a join alias entry is only followed when it is a plain Var */
+		colvar = (Node *) list_nth(rte->joinaliasvars, var->varattno - 1);
+		if (colvar == NULL || !IsA(colvar, Var))
+			return NULL;
+		/* continue resolving recursively */
+		return get_base_var((Var *) colvar, rtables);
+	}
+	return NULL;
+}
+
+
+/*
+ * get_plan_nodes_insert - determine nodes on which to execute insert.
+ *
+ * Returns NULL when the statement is not a plain single-table insert;
+ * otherwise the node list chosen by GetRelationNodes for a write.
+ */
+static List *
+get_plan_nodes_insert(Query * query)
+{
+	RangeTblEntry *rte;
+	RelationLocInfo *rel_loc_info;
+	Const	   *constant;
+	List	   *nodelist;
+	ListCell   *lc;
+	long		part_value;
+	long	   *part_value_ptr = NULL;
+
+	/* Looks complex (correlated?) - best to skip */
+	if (query->jointree != NULL && query->jointree->fromlist != NULL)
+		return NULL;
+
+	/* Make sure there is just one table */
+	if (query->rtable == NULL || query->rtable->length != 1)
+		return NULL;
+
+	rte = (RangeTblEntry *) lfirst(list_head(query->rtable));
+
+	if (rte == NULL || rte->rtekind != RTE_RELATION)
+		/* Missing entry or bad relation type */
+		return NULL;
+
+	/* See if we have the partitioned case. */
+	rel_loc_info = GetRelationLocInfo(rte->relid);
+
+	if (!rel_loc_info)
+		ereport(ERROR,
+				(errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+			  (errmsg("Could not find relation for oid = %u", rte->relid))));
+
+	if (rel_loc_info->locatorType == LOCATOR_TYPE_HASH
+		&& rel_loc_info->partAttrName != NULL)
+	{
+		/* It is a partitioned table, get value by looking in targetList */
+		foreach(lc, query->targetList)
+		{
+			TargetEntry *tle = (TargetEntry *) lfirst(lc);
+
+			if (tle->resjunk)
+				continue;
+
+			/*
+			 * See if we have a constant expression assigned to the
+			 * designated partitioned column
+			 */
+			if (strcmp(tle->resname, rel_loc_info->partAttrName) == 0)
+			{
+				/* We may have a cast, try and handle it */
+				Expr	   *checkexpr = get_numeric_constant(tle->expr);
+
+				if (checkexpr == NULL)
+					break;		/* no constant */
+
+				constant = (Const *) checkexpr;
+
+				if (constant->consttype == INT4OID
+					|| constant->consttype == INT2OID
+					|| constant->consttype == INT8OID)
+				{
+					part_value = (long) constant->constvalue;
+					part_value_ptr = &part_value;
+
+				}
+				/*
+				 * PGXCTODO - handle other data types.
+				 * NOTE(review): constisnull is not checked before the
+				 * value is used - confirm how NULL keys should be routed.
+				 */
+			}
+		}
+	}
+
+	/* single call handles both replicated and partitioned types */
+	nodelist = GetRelationNodes(rel_loc_info, part_value_ptr, false);
+
+	return nodelist;
+}
+
+
+/*
+ * examine_conditions
+ *
+ * Examine conditions and find special ones to later help us determine
+ * what tables can be joined together. Put findings in Special_Conditions
+ * struct.
+ *
+ * Get list of constant comparisons conditions on partitioned column
+ * Get list of parent-child joins (partitioned together)
+ * Get list of joins with replicated tables
+ *
+ * conditions - accumulator for the findings; lists are appended to in place
+ * rtables    - range table of the query that expr_node belongs to
+ * expr_node  - qualification to walk; NULL is treated as "nothing to check"
+ *
+ * Only AND/OR BoolExprs and two-argument mergejoinable OpExprs whose left
+ * argument is a Var are inspected; every other expression is accepted
+ * without being examined.
+ *
+ * Returns false when the expression is too complex to analyze (for example
+ * a NOT expression, or a Var that cannot be traced to a base relation),
+ * otherwise true.
+ *
+ * PGXCTODO: Recognize subqueries, and give up (long term allow safe ones).
+ */
+static bool
+examine_conditions(Special_Conditions * conditions, List *rtables, Node *expr_node)
+{
+	char	   *rel_name,
+			   *rel_name2;
+	char	   *col_name,
+			   *col_name2;
+	RelationLocInfo *rel_loc_info1,
+			   *rel_loc_info2;
+	Const	   *constant;
+	Expr	   *checkexpr;
+
+	if (expr_node == NULL)
+		return true;
+
+	if (rtables == NULL)
+		return true;
+
+	if (conditions == NULL)
+		conditions = new_special_conditions();
+
+	if (IsA(expr_node, BoolExpr))
+	{
+		BoolExpr   *boolexpr = (BoolExpr *) expr_node;
+		ListCell   *lc;
+
+		/*
+		 * Recursively handle ANDed expressions, whatever the number of
+		 * arguments, but don't handle others.
+		 *
+		 * We also look at OR's as a work-around for a reported issue.
+		 * NOTE: THIS IS NOT CORRECT, BUT JUST DONE FOR THE PROTOTYPE.
+		 * More rigorous checking needs to be done. PGXCTODO: Add careful
+		 * checking for OR'ed conditions...
+		 */
+		if (boolexpr->boolop != AND_EXPR && boolexpr->boolop != OR_EXPR)
+			return false;		/* looks complicated, give up */
+
+		foreach(lc, boolexpr->args)
+		{
+			if (!examine_conditions(conditions, rtables, (Node *) lfirst(lc)))
+				return false;
+		}
+
+		return true;
+	}
+
+
+	if (IsA(expr_node, OpExpr))
+	{
+		OpExpr	   *opexpr = (OpExpr *) expr_node;
+
+		/* See if we can equijoin these */
+		if (op_mergejoinable(opexpr->opno) && opexpr->args->length == 2)
+		{
+			Expr	   *arg1 = linitial(opexpr->args);
+			Expr	   *arg2 = lsecond(opexpr->args);
+
+			/* Look for a table */
+			if (IsA(arg1, Var))
+			{
+				RangeTblEntry *rte1,
+						   *rte2;
+
+				/* get the RangeTableEntry */
+				Var		   *colvar = (Var *) arg1;
+
+				colvar = get_base_var(colvar, rtables);
+
+				if (!colvar)
+					return false;
+
+				rte1 = list_nth(rtables, colvar->varno - 1);
+
+				rel_name = get_rel_name(rte1->relid);
+				col_name = strVal(list_nth(rte1->eref->colnames,
+										   colvar->varattno - 1));
+
+				/* Look at other argument */
+
+				/* We may have a cast, try and handle it */
+				checkexpr = get_numeric_constant(arg2);
+
+				if (checkexpr != NULL)
+					arg2 = checkexpr;
+
+				if (IsA(arg2, Const))
+				{
+					/* We have column = literal. Check if partitioned case */
+					constant = (Const *) arg2;
+
+					rel_loc_info1 = GetRelationLocInfo(rte1->relid);
+
+					if (!rel_loc_info1)
+						return false;
+
+					/* If hash partitioned, check if the part column was used */
+					if (IsHashColumn(rel_loc_info1, col_name))
+					{
+						/* add to partitioned literal join conditions */
+						Literal_Comparison *lit_comp =
+						palloc(sizeof(Literal_Comparison));
+
+						lit_comp->relid = rte1->relid;
+						lit_comp->rel_loc_info = rel_loc_info1;
+						lit_comp->col_name = col_name;
+						lit_comp->constant = constant->constvalue;
+
+						conditions->partitioned_literal_comps = lappend(
+									   conditions->partitioned_literal_comps,
+																   lit_comp);
+
+						return true;
+					}
+					else
+					{
+						/* unimportant comparison, just return */
+						if (rel_loc_info1)
+							FreeRelationLocInfo(rel_loc_info1);
+						return true;
+					}
+
+				}
+				else if (IsA(arg2, Var))
+				{
+					PGXC_Join   *pgxc_join;
+					Var		   *colvar2 = (Var *) arg2;
+
+					/* resolve the other side first, before allocating anything */
+					colvar2 = get_base_var(colvar2, rtables);
+					if (!colvar2)
+						return false;
+					rte2 = list_nth(rtables, colvar2->varno - 1);
+					rel_name2 = get_rel_name(rte2->relid);
+
+					rel_loc_info1 = GetRelationLocInfo(rte1->relid);
+					if (!rel_loc_info1)
+						return false;
+					rel_loc_info2 = GetRelationLocInfo(rte2->relid);
+					if (!rel_loc_info2)
+					{
+						FreeRelationLocInfo(rel_loc_info1);
+						return false;
+					}
+
+					/* get data struct about these two relations joining */
+					pgxc_join = find_or_create_pgxc_join(rte1->relid, rte1->eref->aliasname,
+										 rte2->relid, rte2->eref->aliasname);
+
+					/*
+					 * pgxc_join->condition_list =
+					 * lappend(pgxc_join->condition_list, opexpr);
+					 */
+
+					if (rel_loc_info1->locatorType == LOCATOR_TYPE_REPLICATED)
+					{
+						/* add to replicated join conditions */
+						conditions->replicated_joins =
+							lappend(conditions->replicated_joins, opexpr);
+
+						if (rel_loc_info2->locatorType != LOCATOR_TYPE_REPLICATED)
+						{
+							/* Note other relation, saves us work later. */
+							conditions->base_rel_name = rel_name2;
+							conditions->base_rel_loc_info = rel_loc_info2;
+							if (rel_loc_info1)
+								FreeRelationLocInfo(rel_loc_info1);
+						}
+						else if (conditions->base_rel_name == NULL)
+						{
+							conditions->base_rel_name = rel_name;
+							conditions->base_rel_loc_info = rel_loc_info1;
+							if (rel_loc_info2)
+								FreeRelationLocInfo(rel_loc_info2);
+						}
+
+						/* note nature of join between the two relations */
+						pgxc_join->join_type = JOIN_REPLICATED;
+						return true;
+					}
+
+					if (rel_loc_info2->locatorType == LOCATOR_TYPE_REPLICATED)
+					{
+						/* add to replicated join conditions */
+						conditions->replicated_joins =
+							lappend(conditions->replicated_joins, opexpr);
+
+						/* other relation not replicated, note it for later */
+						conditions->base_rel_name = rel_name;
+						conditions->base_rel_loc_info = rel_loc_info1;
+
+						/* note nature of join between the two relations */
+						pgxc_join->join_type = JOIN_REPLICATED;
+
+						if (rel_loc_info2)
+							FreeRelationLocInfo(rel_loc_info2);
+
+						return true;
+					}
+
+					/* Now check for a partitioned join */
+
+					/*
+					 * PGXCTODO - for the prototype, we assume all partitioned
+					 * tables are on the same nodes.
+					 */
+					col_name2 = strVal(list_nth(rte2->eref->colnames,
+												colvar2->varattno - 1));
+
+					if (IsHashColumn(rel_loc_info1, col_name)
+						&& IsHashColumn(rel_loc_info2, col_name2))
+					{
+						/* We found a partitioned join */
+						conditions->partitioned_parent_child =
+							lappend(conditions->partitioned_parent_child,
+									opexpr);
+						pgxc_join->join_type = JOIN_COLOCATED_PARTITIONED;
+						return true;
+					}
+
+					/*
+					 * At this point, there is some other type of join that
+					 * can probably not be executed on only a single node.
+					 * Just return. Important: We preserve previous
+					 * pgxc_join->join_type value, there may be multiple
+					 * columns joining two tables, and we want to make sure at
+					 * least one of them make it colocated partitioned, in
+					 * which case it will update it when examining another
+					 * condition.
+					 */
+					return true;
+				}
+				else
+					return true;
+
+			}
+		}
+		/* PGXCTODO - need to more finely examine other operators */
+	}
+
+	return true;
+}
+
+/*
+ * examine_conditions_fromlist - Examine FROM clause for joins
+ *
+ * Examine FROM clause join conditions to determine special conditions
+ * to help us decide which nodes to execute on.
+ *
+ * Returns true if the whole subtree could be examined, false if it contains
+ * a node type or condition that is too complex to analyze.
+ */
+static bool
+examine_conditions_fromlist(Special_Conditions * conditions, List *rtables,
+							Node *treenode)
+{
+	if (treenode == NULL)
+		return true;
+
+	if (rtables == NULL)
+		return true;
+
+	if (conditions == NULL)
+		conditions = new_special_conditions();
+
+	if (IsA(treenode, JoinExpr))
+	{
+		JoinExpr   *joinexpr = (JoinExpr *) treenode;
+
+		/* recursively examine FROM join tree */
+		if (!examine_conditions_fromlist(conditions, rtables, joinexpr->larg))
+			return false;
+
+		if (!examine_conditions_fromlist(conditions, rtables, joinexpr->rarg))
+			return false;
+
+		/* Now look at join condition; its verdict is ours */
+		return examine_conditions(conditions, rtables, joinexpr->quals);
+	}
+	else if (IsA(treenode, RangeTblRef))
+	{
+		/* plain table reference, nothing to examine */
+		return true;
+	}
+	else if (IsA(treenode, BoolExpr) ||IsA(treenode, OpExpr))
+	{
+		/* check base condition; do not fall through to the failure below */
+		return examine_conditions(conditions, rtables, treenode);
+	}
+
+	/* Some other more complicated beast */
+	return false;
+}
+
+
+/*
+ * get_plan_nodes - determine the nodes to execute the command on.
+ *
+ * Examines the "special" query conditions in determining execution node list.
+ *
+ * query_plan - plan being built (not referenced here)
+ * query      - analyzed query whose range table and join tree are examined
+ * isRead     - passed on to GetRelationNodes for the non-literal cases
+ *
+ * returns NULL if it appears to be a multi-step query, or if location
+ * information for the chosen relation cannot be found.
+ */
+static List *
+get_plan_nodes(Query_Plan * query_plan, Query * query, bool isRead)
+{
+	RangeTblEntry *rte;
+	List	   *test_nodelist;
+	List	   *nodelist;
+	ListCell   *lc,
+			   *item;
+	Special_Conditions *special_conditions;
+	OpExpr	   *opexpr;
+	Var		   *colvar;
+	RelationLocInfo *rel_loc_info;
+
+
+	nodelist = NULL;
+	join_list = NULL;
+
+	/* If no tables, or no join tree to walk, just return */
+	if (query->rtable == NULL || query->jointree == NULL)
+		return NULL;
+
+	/* Alloc and init struct */
+	special_conditions = new_special_conditions();
+
+	/* Look for special conditions */
+
+	/* Look for JOIN syntax joins */
+	foreach(item, query->jointree->fromlist)
+	{
+		Node	   *treenode = (Node *) lfirst(item);
+
+		if (IsA(treenode, JoinExpr))
+		{
+			if (!examine_conditions_fromlist(special_conditions, query->rtable,
+											 treenode))
+			{
+				/* if too complicated, just return NULL */
+				free_special_relations(special_conditions);
+				free_join_list();
+				return NULL;
+			}
+		}
+		else if (!IsA(treenode, RangeTblRef))
+		{
+			/* could be complicated */
+			free_special_relations(special_conditions);
+			free_join_list();
+			return NULL;
+		}
+	}
+
+
+	/* Examine the WHERE clause, too */
+	if (!examine_conditions(special_conditions, query->rtable,
+							query->jointree->quals))
+	{
+		/* if cross joins may exist, just return NULL */
+		free_special_relations(special_conditions);
+		free_join_list();
+		return NULL;
+	}
+
+	/* Examine join conditions, see if each join is single-node safe */
+	if (join_list != NULL)
+	{
+		foreach(lc, join_list)
+		{
+			PGXC_Join   *pgxcjoin = (PGXC_Join *) lfirst(lc);
+
+			/* If it is not replicated or parent-child, not single-node safe */
+			if (pgxcjoin->join_type == JOIN_OTHER)
+			{
+				free_special_relations(special_conditions);
+				free_join_list();
+				return NULL;
+			}
+		}
+	}
+
+
+	/* check for non-partitioned cases */
+	if (special_conditions->partitioned_parent_child == NULL &&
+		special_conditions->partitioned_literal_comps == NULL)
+	{
+		if (special_conditions->replicated_joins == NULL
+			&& (query->rtable == NULL || query->rtable->length > 1))
+
+			/*
+			 * This is too complicated for a single step, or there is no FROM
+			 * clause
+			 */
+			nodelist = NULL;
+		else
+		{
+			/*
+			 * We have either a single table, just replicated tables, or a
+			 * table that just joins with replicated tables.
+			 */
+
+			/* See if we noted a table earlier to use */
+			rel_loc_info = special_conditions->base_rel_loc_info;
+
+			if (rel_loc_info == NULL)
+			{
+				/* a single table, just grab it */
+				rte = (RangeTblEntry *) linitial(query->rtable);
+				rel_loc_info = GetRelationLocInfo(rte->relid);
+			}
+
+			/* without location info, leave nodelist NULL and go clean up */
+			if (rel_loc_info != NULL)
+				nodelist = GetRelationNodes(rel_loc_info, NULL, isRead);
+		}
+	}
+	/* check for partitioned col comparison against a literal */
+	else if (special_conditions->partitioned_literal_comps != NULL
+			 && special_conditions->partitioned_literal_comps->length > 0)
+	{
+		nodelist = NULL;
+
+		/*
+		 * Make sure that if there are multiple such comparisons, that they
+		 * are all on the same nodes.
+		 */
+		foreach(lc, special_conditions->partitioned_literal_comps)
+		{
+			Literal_Comparison *lit_comp = (Literal_Comparison *) lfirst(lc);
+
+			test_nodelist = GetRelationNodes(
+						lit_comp->rel_loc_info, &(lit_comp->constant), true);
+
+			if (lc == list_head(special_conditions->partitioned_literal_comps))
+				nodelist = test_nodelist;
+			else if (list_length(nodelist) != 1 || list_length(test_nodelist) != 1
+					 || linitial_int(test_nodelist) != linitial_int(nodelist))
+			{
+				/* not exactly one shared node: stop, so it is not forgotten */
+				nodelist = NULL;
+				break;
+			}
+		}
+	}
+	else
+	{
+		/*
+		 * At this point, we have partitioned parent child relationship, with
+		 * no partitioned column comparison condition with a literal. We just
+		 * use one of the tables as a basis for node determination.
+		 */
+		opexpr = (OpExpr *) linitial(special_conditions->partitioned_parent_child);
+
+		/* the Var may name a join entry; resolve it to its base relation */
+		colvar = get_base_var((Var *) linitial(opexpr->args), query->rtable);
+
+		if (colvar != NULL)
+		{
+			/* get the RangeTableEntry */
+			rte = list_nth(query->rtable, colvar->varno - 1);
+			rel_loc_info = GetRelationLocInfo(rte->relid);
+			if (rel_loc_info != NULL)
+				nodelist = GetRelationNodes(rel_loc_info, NULL, isRead);
+		}
+	}
+	free_special_relations(special_conditions);
+	free_join_list();
+
+	return nodelist;
+}
+
+
+/*
+ * get_plan_nodes_command - pick the node-selection routine for a command
+ *
+ * SELECT is examined as a read; UPDATE and DELETE go through the same code
+ * as a non-read; INSERT has its own routine.  Any other command type, or a
+ * statement that is not safe to be done in a single step, yields NULL.
+ */
+static List *
+get_plan_nodes_command(Query_Plan * query_plan, Query * query)
+{
+	List	   *nodes = NULL;
+
+	switch (query->commandType)
+	{
+		case CMD_INSERT:
+			nodes = get_plan_nodes_insert(query);
+			break;
+		case CMD_SELECT:
+		case CMD_UPDATE:
+		case CMD_DELETE:
+			/* UPDATE and DELETE are treated as a select that is not a read */
+			nodes = get_plan_nodes(query_plan, query,
+								   query->commandType == CMD_SELECT);
+			break;
+		default:
+			break;
+	}
+	return nodes;
+}
+
+
+/*
+ * Get list of simple aggregates used.
+ * For now we only allow MAX in the first column, and return a list of one.
+ */
+static List *
+get_simple_aggregates(Query * query, List *nodelist)
+{
+	List	   *simple_agg_list = NULL;
+
+	/* Check for simple multi-node aggregate */
+	if (nodelist != NULL && nodelist->length > 1 && query->hasAggs)
+	{
+		TargetEntry *tle;
+
+		/*
+		 * long term check for group by, but for prototype just allow 1 simple
+		 * expression
+		 */
+		if (list_length(query->targetList) != 1)
+			return NULL;
+
+		tle = (TargetEntry *) linitial(query->targetList);
+
+		if (IsA(tle->expr, Aggref))
+		{
+			SimpleAgg  *simple_agg;
+			Aggref	   *aggref = (Aggref *) tle->expr;
+
+			/* Just consider numeric max functions for prototype */
+			/* NOTE(review): 2115..2121 presumably are the max() OIDs - confirm */
+			if (!(aggref->aggfnoid >= 2115 && aggref->aggfnoid <= 2121))
+				ereport(ERROR,
+						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+						 errmsg("Multinode aggregate for this function currently not supported")));
+
+			/* zeroed so that fields not set below (e.g. data_len) start at 0 */
+			simple_agg = (SimpleAgg *) palloc0(sizeof(SimpleAgg));
+			simple_agg->agg_type = AGG_TYPE_MAX;
+			simple_agg->column_pos = 1;
+			simple_agg->agg_data_type = aggref->aggtype;
+			simple_agg->response_count = 0;
+
+			simple_agg_list = lappend(simple_agg_list, simple_agg);
+		}
+	}
+
+	return simple_agg_list;
+}
+
+
+/*
+ * Build up a QueryPlan to execute on.
+ *
+ * For the prototype, there will only be one step,
+ * and the nodelist will be NULL if it is not a PGXC-safe statement.
+ * The plan is palloc'd here; FreeQueryPlan releases it.
+ */
+Query_Plan *
+GetQueryPlan(Node *parsetree, const char *sql_statement, List *querytree_list)
+{
+	Query_Plan *query_plan = palloc(sizeof(Query_Plan));
+	Query_Step *query_step = palloc(sizeof(Query_Step));
+	Query	   *query;
+
+	query_plan->force_autocommit = false;
+
+	/* the step keeps its own copy of the statement text */
+	query_step->sql_statement = pstrdup(sql_statement);
+	query_step->nodelist = NULL;
+	query_step->simple_aggregates = NULL;
+
+	query_plan->query_step_list = lappend(NULL, query_step);
+
+	/*
+	 * Determine where to execute the command, either at the Coordinator
+	 * level, Data Nodes, or both. By default we choose both. We should be
+	 * able to quickly expand this for more commands.
+	 */
+	switch (nodeTag(parsetree))
+	{
+		case T_SelectStmt:
+		case T_InsertStmt:
+		case T_UpdateStmt:
+		case T_DeleteStmt:
+			/* just use first one in querytree_list */
+			/* NOTE(review): assumes querytree_list is not empty - confirm */
+			query = (Query *) linitial(querytree_list);
+			query_step->nodelist =
+				get_plan_nodes_command(query_plan, query);
+			query_step->simple_aggregates =
+				get_simple_aggregates(query, query_step->nodelist);
+
+			/*
+			 * See if it is a SELECT with no relations, like SELECT 1+1 or
+			 * SELECT nextval('fred'), and just use coord.
+			 */
+			if (query_step->nodelist == NULL
+						&& (query->jointree->fromlist == NULL
+						|| query->jointree->fromlist->length == 0))
+				/* Just execute it on Coordinator */
+				query_plan->exec_loc_type = EXEC_ON_COORD;
+			else
+			{
+				query_plan->exec_loc_type = EXEC_ON_DATA_NODES;
+
+				if (query_step->nodelist == NULL)
+				{
+					bool		is_pg_catalog = false;
+
+					/* before giving up, see if we are dealing with pg_catalog */
+					if (nodeTag(parsetree) == T_SelectStmt)
+					{
+						ListCell   *lc;
+
+						is_pg_catalog = true;
+						foreach(lc, query->rtable)
+						{
+							RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc);
+
+							/* hack so that pg_catalog queries can run */
+							if (rte->relid >= FirstNormalObjectId)
+							{
+								is_pg_catalog = false;
+								break;
+							}
+						}
+						if (is_pg_catalog)
+							query_plan->exec_loc_type = EXEC_ON_COORD;
+					}
+
+					/*
+					 * If the nodelist is NULL, it is not safe for us to
+					 * execute
+					 */
+					if (!is_pg_catalog && StrictStatementChecking)
+						ereport(ERROR,
+								(errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+								 (errmsg("Cannot safely execute statement in a single step."))));
+				}
+			}
+
+			/*
+			 * PG-XC cannot yet support some variations of SQL statements.
+			 * We perform some checks to at least catch common cases
+			 */
+
+			/*
+			 * Check if we have multiple nodes and an unsupported clause. This
+			 * is temporary until we expand supported SQL
+			 */
+			if (nodeTag(parsetree) == T_SelectStmt)
+			{
+				if (query->intoClause)
+					ereport(ERROR,
+							(errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+							 (errmsg("INTO clause not yet supported"))));
+
+				if (query->setOperations)
+					ereport(ERROR,
+							(errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+							 (errmsg("UNION, INTERSECT and EXCEPT are not yet supported"))));
+
+				if (query_step->nodelist && query_step->nodelist->length > 1 && StrictStatementChecking)
+				{
+					/*
+					 * PGXCTODO - this could be improved to check if the first
+					 * group by expression is the partitioning column
+					 */
+					if (query->groupClause)
+						ereport(ERROR,
+								(errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+						 (errmsg("Multi-node GROUP BY not yet supported"))));
+					if (query->limitCount && StrictSelectChecking)
+						ereport(ERROR,
+								(errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+							(errmsg("Multi-node LIMIT not yet supported"))));
+					if (query->sortClause && StrictSelectChecking)
+						ereport(ERROR,
+								(errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+						 (errmsg("Multi-node ORDER BY not yet supported"))));
+					/* PGXCTODO - check if first column partitioning column */
+					if (query->distinctClause)
+						ereport(ERROR,
+								(errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+						 (errmsg("Multi-node DISTINCT not yet supported"))));
+					if (query->hasAggs)
+						ereport(ERROR,
+								(errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+						(errmsg("Multi-node aggregates not yet supported"))));
+				}
+			}
+			break;
+
+			/* Statements that we only want to execute on the Coordinator */
+		case T_AlterSeqStmt:
+		case T_CommentStmt:
+		case T_CreateSeqStmt:
+		case T_VariableShowStmt:
+			query_plan->exec_loc_type = EXEC_ON_COORD;
+			break;
+
+			/* DROP */
+		case T_DropStmt:
+			if (((DropStmt *) parsetree)->removeType == OBJECT_SEQUENCE)
+				query_plan->exec_loc_type = EXEC_ON_COORD;
+			else
+				query_plan->exec_loc_type = EXEC_ON_COORD | EXEC_ON_DATA_NODES;
+			break;
+
+			/*
+			 * Statements that need to run in autocommit mode, on Coordinator
+			 * and Data Nodes with suppressed implicit two phase commit.
+			 */
+		case T_CheckPointStmt:
+		case T_ClusterStmt:
+		case T_CreatedbStmt:
+		case T_DropdbStmt:
+		case T_VacuumStmt:
+			query_plan->exec_loc_type = EXEC_ON_COORD | EXEC_ON_DATA_NODES;
+			query_plan->force_autocommit = true;
+			break;
+
+			/*
+			 * Statements that we execute on both the Coordinator and Data Nodes
+			 */
+		case T_AlterTableStmt:
+		case T_AlterDatabaseStmt:
+		case T_AlterDatabaseSetStmt:
+		case T_AlterDomainStmt:
+		case T_AlterObjectSchemaStmt:
+		case T_ConstraintsSetStmt:
+		case T_CreateDomainStmt:
+		case T_CreateEnumStmt:
+		case T_CreateStmt:
+		case T_CreateSchemaStmt:
+		case T_DeallocateStmt:	/* Allow for DEALLOCATE ALL */
+		case T_DiscardStmt:
+		case T_IndexStmt:
+		case T_LockStmt:
+		case T_ReindexStmt:
+		case T_RenameStmt:
+		case T_TruncateStmt:
+		case T_VariableSetStmt:
+
+			/*
+			 * Also support these, should help later with pg_restore, although
+			 * not very useful because of the pooler using the same user
+			 */
+		case T_GrantStmt:
+		case T_GrantRoleStmt:
+		case T_CreateRoleStmt:
+		case T_AlterRoleStmt:
+		case T_DropRoleStmt:
+		case T_AlterOwnerStmt:
+		case T_DropOwnedStmt:
+		case T_ReassignOwnedStmt:
+			query_plan->exec_loc_type = EXEC_ON_COORD | EXEC_ON_DATA_NODES;
+			break;
+
+
+		case T_TransactionStmt:
+			switch (((TransactionStmt *) parsetree)->kind)
+			{
+				case TRANS_STMT_SAVEPOINT:
+				case TRANS_STMT_RELEASE:
+				case TRANS_STMT_ROLLBACK_TO:
+					ereport(ERROR,
+							(errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+							 (errmsg("This type of transaction statement not yet supported"))));
+					break;
+
+				default:
+					break; /* keep compiler quiet */
+			}
+			query_plan->exec_loc_type = EXEC_ON_COORD | EXEC_ON_DATA_NODES;
+			break;
+
+			/*
+			 * For now, pick one of the data nodes until we modify real
+			 * planner It will give an approximate idea of what an isolated
+			 * data node will do
+			 */
+		case T_ExplainStmt:
+			query_step->nodelist = lappend_int(query_step->nodelist, GetAnyDataNode());
+			query_plan->exec_loc_type = EXEC_ON_DATA_NODES;
+			break;
+
+			/*
+			 * Statements we do not yet want to handle.
+			 * By default they would be forbidden, but we list these for reference.
+			 * Note that there is not a 1-1 correspondence between
+			 * SQL command and the T_*Stmt structures.
+			 */
+		case T_AlterFdwStmt:
+		case T_AlterForeignServerStmt:
+		case T_AlterFunctionStmt:
+		case T_AlterOpFamilyStmt:
+		case T_AlterTSConfigurationStmt:
+		case T_AlterTSDictionaryStmt:
+		case T_AlterUserMappingStmt:
+		case T_ClosePortalStmt:
+		case T_CompositeTypeStmt:
+		case T_CreateCastStmt:
+		case T_CreateConversionStmt:
+		case T_CreateFdwStmt:
+		case T_CreateFunctionStmt:
+		case T_CreateForeignServerStmt:
+		case T_CreateOpClassStmt:
+		case T_CreateOpFamilyStmt:
+		case T_CreatePLangStmt:
+		case T_CreateTableSpaceStmt:
+		case T_CreateTrigStmt:
+		case T_CreateUserMappingStmt:
+		case T_DeclareCursorStmt:
+		case T_DefineStmt:		/* used for aggregates, some types */
+		case T_DropCastStmt:
+		case T_DropFdwStmt:
+		case T_DropForeignServerStmt:
+		case T_DropPLangStmt:
+		case T_DropPropertyStmt:
+		case T_DropTableSpaceStmt:
+		case T_ExecuteStmt:
+		case T_FetchStmt:
+		case T_ListenStmt:
+		case T_LoadStmt:
+		case T_NotifyStmt:
+		case T_PrepareStmt:
+		case T_RemoveFuncStmt:
+		case T_RemoveOpClassStmt:
+		case T_RemoveOpFamilyStmt:
+		case T_RuleStmt:
+		case T_UnlistenStmt:
+		case T_ViewStmt:
+			/* fall through */
+		default:
+			/* Allow for override */
+			if (StrictStatementChecking)
+				ereport(ERROR,
+						(errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+						 (errmsg("This command is not yet supported."))));
+			else
+				query_plan->exec_loc_type = EXEC_ON_COORD | EXEC_ON_DATA_NODES;
+			break;
+	}
+
+
+	return query_plan;
+}
+
+
+/*
+ * Release a Query_Step and everything it owns; a NULL step is ignored
+ */
+static void
+free_query_step(Query_Step * query_step)
+{
+	if (query_step != NULL)
+	{
+		pfree(query_step->sql_statement);
+		list_free(query_step->nodelist);
+		if (query_step->simple_aggregates != NULL)
+			list_free_deep(query_step->simple_aggregates);
+		pfree(query_step);
+	}
+}
+
+/*
+ * Free a Query_Plan struct along with its steps; a NULL plan is ignored
+ */
+void
+FreeQueryPlan(Query_Plan * query_plan)
+{
+	ListCell   *item;
+
+	if (query_plan == NULL)
+		return;
+
+	/* the cells hold Query_Step pointers, so read them with lfirst() */
+	foreach(item, query_plan->query_step_list)
+		free_query_step((Query_Step *) lfirst(item));
+
+	/* list_free releases the cells as well as the header, and accepts NIL */
+	list_free(query_plan->query_step_list);
+	pfree(query_plan);
+}
diff --git a/src/backend/pgxc/pool/Makefile b/src/backend/pgxc/pool/Makefile
new file mode 100644
index 0000000000..7143af5d97
--- /dev/null
+++ b/src/backend/pgxc/pool/Makefile
@@ -0,0 +1,19 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+#    Makefile for pool: builds the objects listed in OBJS below
+#
+#  Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+#
+# IDENTIFICATION
+#    $PostgreSQL$
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/pgxc/pool
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = combiner.o datanode.o poolmgr.o poolcomm.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/pgxc/pool/combiner.c b/src/backend/pgxc/pool/combiner.c
new file mode 100644
index 0000000000..da59c5f6af
--- /dev/null
+++ b/src/backend/pgxc/pool/combiner.c
@@ -0,0 +1,375 @@
+/*-------------------------------------------------------------------------
+ *
+ * combiner.c
+ *
+ *	  Combine responses from multiple Data Nodes
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * IDENTIFICATION
+ *	  $$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include "pgxc/combiner.h"
+#include "pgxc/planner.h"
+#include "catalog/pg_type.h"
+#include "libpq/libpq.h"
+#include "libpq/pqformat.h"
+#include "utils/builtins.h"
+
+
+/*
+ * Create a structure to store parameters needed to combine responses from
+ * multiple connections as well as state information
+ *
+ * node_count   - number of CommandComplete responses to wait for
+ * combine_type - how the per-node row counts are merged
+ * dest         - command destination, checked before forwarding messages
+ */
+ResponseCombiner
+CreateResponseCombiner(int node_count, CombineType combine_type,
+					   CommandDest dest)
+{
+	ResponseCombiner combiner;
+
+	/* ResponseCombiner is a typedef for pointer to ResponseCombinerData */
+	/* palloc raises an error on out-of-memory, so no NULL check is needed */
+	combiner = (ResponseCombiner) palloc(sizeof(ResponseCombinerData));
+
+	combiner->node_count = node_count;
+	combiner->combine_type = combine_type;
+	combiner->dest = dest;
+	combiner->command_complete_count = 0;
+	combiner->row_count = 0;
+	combiner->request_type = REQUEST_TYPE_NOT_DEFINED;
+	combiner->description_count = 0;
+	combiner->simple_aggregates = NULL;
+
+	return combiner;
+}
+
+/*
+ * Parse the trailing row count of a command status response (len counts the
+ * \0 terminator); digits are read right to left.  Returns the digit count.
+ */
+static int
+parse_row_count(const char *message, size_t len, int *rowcount)
+{
+	int			digits = 0;
+	int			place = 1;		/* decimal weight of the next digit */
+
+	*rowcount = 0;
+	/* step over the \0 terminator; an empty message has no digits */
+	for (len = (len > 0) ? len - 1 : 0;
+		 len > 0 && message[len - 1] >= '0' && message[len - 1] <= '9';
+		 len--, place *= 10, digits++)
+		*rowcount += (message[len - 1] - '0') * place;
+	return digits;
+}
+
+/*
+ * Extract the first DataRow column as a number; NULL or truncated yields 0.
+ * PGXCTODO - handle pos (position) and other types like TEXT
+ */
+static unsigned long
+parse_aggregate_value(SimpleAgg * simple_agg, char *msg_body, size_t len)
+{
+	unsigned char *lenp = (unsigned char *) &(msg_body[2]);	/* skip 2 bytes */
+	unsigned long result = 0;
+
+	Assert(len >= 7);
+	/* the column length is a 4-byte big-endian integer; -1 means NULL */
+	simple_agg->data_len = (int32) (((uint32) lenp[0] << 24) | (lenp[1] << 16) |
+									(lenp[2] << 8) | lenp[3]);
+	if (simple_agg->data_len > 0 && (size_t) simple_agg->data_len <= len - 6)
+	{
+		char	   *valstr = (char *) palloc(simple_agg->data_len + 1);
+		memcpy(valstr, &(msg_body[6]), simple_agg->data_len);
+		valstr[simple_agg->data_len] = '\0';	/* terminate after last digit */
+		result = atol(valstr);
+		pfree(valstr);
+	}
+	return result;
+}
+
+
+/*
+ * Process a result from a node for the aggregate function
+ *
+ * simple_aggregates - SimpleAgg entries to update in place
+ * msg_body, len     - DataRow message body and its length
+ *
+ * Nothing is returned; each entry's ulong_value holds the running result.
+ */
+static void
+process_aggregate_element(List *simple_aggregates, char *msg_body, size_t len)
+{
+	ListCell   *cell;
+
+	foreach(cell, simple_aggregates)
+	{
+		SimpleAgg  *agg = (SimpleAgg *) lfirst(cell);
+		unsigned long node_value;
+
+		/* PGXCTODO may need to support numeric, too. */
+		node_value = parse_aggregate_value(agg, msg_body, len);
+
+		switch (agg->agg_type)
+		{
+			case AGG_TYPE_MAX:
+				/*
+				 * Keep the largest value seen so far.  The first response
+				 * is always taken (PGXCTODO - type checking).
+				 */
+				if (agg->response_count == 0
+					|| node_value > agg->ulong_value)
+					agg->ulong_value = node_value;
+				break;
+
+			default:
+				ereport(ERROR,
+						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+						 errmsg("Unknown aggregate type: %d",
+								agg->agg_type)));
+		}
+
+	}
+}
+
+
+/*
+ * Fold one data node protocol message into the combiner state and, if the
+ * destination is DestRemote or DestRemoteExecute, forward the appropriate
+ * (possibly merged) message to the client.  msg_body/len are the message
+ * payload; for CommandComplete the payload is truncated in place, so the
+ * buffer must be writable.
+ *
+ * Returns 0 on success, EOF on an unexpected or inconsistent message.
+ */
+int
+CombineResponse(ResponseCombiner combiner, char msg_type, char *msg_body, size_t len)
+{
+	int			rowcount;
+	int			digits = 0;
+
+	switch (msg_type)
+	{
+		case 'C':				/* CommandComplete */
+			/*
+			 * If we did not receive description we are having rowcount or OK
+			 * response
+			 */
+			if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+				combiner->request_type = REQUEST_TYPE_COMMAND;
+			/* Accumulate the row count; a tag without one disables combining */
+			if (combiner->combine_type != COMBINE_TYPE_NONE)
+			{
+				digits = parse_row_count(msg_body, len, &rowcount);
+				if (digits > 0)
+					combiner->row_count += rowcount;
+				else
+					combiner->combine_type = COMBINE_TYPE_NONE;
+			}
+			if (++combiner->command_complete_count == combiner->node_count)
+			{
+				if (combiner->dest == DestRemote
+					|| combiner->dest == DestRemoteExecute)
+				{
+					if (combiner->combine_type == COMBINE_TYPE_NONE)
+					{
+						pq_putmessage(msg_type, msg_body, len);
+					}
+					else
+					{
+						char		command_complete_buffer[256];
+
+						rowcount = combiner->combine_type == COMBINE_TYPE_SUM ?
+							combiner->row_count :
+							combiner->row_count / combiner->node_count;
+						/* Truncate msg_body to get base string */
+						msg_body[len - digits - 1] = '\0';
+						snprintf(command_complete_buffer, sizeof(command_complete_buffer),
+								 "%s%d", msg_body, rowcount);
+						len = strlen(command_complete_buffer) + 1;
+						pq_putmessage(msg_type, command_complete_buffer, len);
+					}
+				}
+			}
+			break;
+		case 'T':				/* RowDescription */
+			if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+				combiner->request_type = REQUEST_TYPE_QUERY;
+			if (combiner->request_type != REQUEST_TYPE_QUERY)
+			{
+				/* Inconsistent responses */
+				return EOF;
+			}
+			/* Proxy first */
+			if (combiner->description_count++ == 0)
+			{
+				if (combiner->dest == DestRemote
+					|| combiner->dest == DestRemoteExecute)
+					pq_putmessage(msg_type, msg_body, len);
+			}
+			break;
+		case 'G':				/* CopyInResponse */
+			if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+				combiner->request_type = REQUEST_TYPE_COPY_IN;
+			if (combiner->request_type != REQUEST_TYPE_COPY_IN)
+			{
+				/* Inconsistent responses */
+				return EOF;
+			}
+			/* Proxy first */
+			if (combiner->description_count++ == 0)
+			{
+				if (combiner->dest == DestRemote
+					|| combiner->dest == DestRemoteExecute)
+					pq_putmessage(msg_type, msg_body, len);
+			}
+			break;
+		case 'H':				/* CopyOutResponse */
+			if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+				combiner->request_type = REQUEST_TYPE_COPY_OUT;
+			if (combiner->request_type != REQUEST_TYPE_COPY_OUT)
+			{
+				/* Inconsistent responses */
+				return EOF;
+			}
+			/* Proxy first */
+			if (combiner->description_count++ == 0)
+			{
+				if (combiner->dest == DestRemote
+					|| combiner->dest == DestRemoteExecute)
+					pq_putmessage(msg_type, msg_body, len);
+			}
+			break;
+		case 'D':				/* DataRow */
+			if (combiner->simple_aggregates == NULL)
+			{
+				if (combiner->dest == DestRemote
+					|| combiner->dest == DestRemoteExecute)
+					pq_putmessage(msg_type, msg_body, len);
+			}
+			else
+			{
+				SimpleAgg  *simple_agg = (SimpleAgg *) linitial(combiner->simple_aggregates);
+
+				/* Merge this node's value into the running aggregate */
+				process_aggregate_element(combiner->simple_aggregates,
+										  msg_body, len);
+
+				/*
+				 * See if we are done with all nodes. Only then do we send one
+				 * DataRow result.
+				 */
+				if (++simple_agg->response_count
+					== combiner->node_count)
+				{
+					char		longstr[21];
+					int			longlen;
+					StringInfoData data_buffer;
+
+					pg_ltoa(simple_agg->ulong_value, longstr);
+					longlen = strlen(longstr);
+
+					/*
+					 * pq_beginmessage initializes the buffer itself, so a
+					 * stack StringInfoData avoids leaking a separately
+					 * allocated StringInfo.
+					 */
+					pq_beginmessage(&data_buffer, 'D');
+					/* Reuse the first two payload bytes of the node's row */
+					pq_sendbyte(&data_buffer, msg_body[0]);
+					pq_sendbyte(&data_buffer, msg_body[1]);
+					pq_sendint(&data_buffer, longlen, 4);
+					pq_sendtext(&data_buffer, longstr, longlen);
+					pq_putmessage(msg_type,
+								  data_buffer.data,
+								  data_buffer.len);
+
+					pfree(data_buffer.data);
+				}
+			}
+			break;
+		case 'E':				/* ErrorResponse */
+		case 'A':				/* NotificationResponse */
+		case 'N':				/* NoticeResponse */
+			/* Always proxy */
+			if (combiner->dest == DestRemote
+				|| combiner->dest == DestRemoteExecute)
+				pq_putmessage(msg_type, msg_body, len);
+			break;
+		case 'I':				/* EmptyQuery */
+		default:
+			/* Unexpected message */
+			return EOF;
+	}
+	return 0;
+}
+
+/*
+ * Decide whether the responses gathered by the combiner add up to a
+ * successfully completed command: every node must have sent
+ * CommandComplete and, unless the request was a plain command, the
+ * description count must equal the node count as well.
+ */
+static bool
+validate_combiner(ResponseCombiner combiner)
+{
+	bool		all_completed;
+	bool		descriptions_ok;
+
+	/* Has each node reported CommandComplete? */
+	all_completed = (combiner->command_complete_count == combiner->node_count);
+
+	/* Plain commands carry no description; other requests need one per node */
+	descriptions_ok = (combiner->request_type == REQUEST_TYPE_COMMAND
+					   || combiner->description_count == combiner->node_count);
+
+	return all_completed && descriptions_ok;
+}
+
+/*
+ * Validate the combiner state, then pfree() the combiner structure itself.
+ */
+bool
+ValidateAndCloseCombiner(ResponseCombiner combiner)
+{
+	bool		ok;
+
+	ok = validate_combiner(combiner);
+	pfree(combiner);
+	return ok;
+}
+
+/*
+ * Validate the combiner, then reset its counters, request type and
+ * aggregate list so it can collect another round of responses.
+ */
+bool
+ValidateAndResetCombiner(ResponseCombiner combiner)
+{
+	bool		ok = validate_combiner(combiner);
+
+	combiner->simple_aggregates = NULL;
+	combiner->request_type = REQUEST_TYPE_NOT_DEFINED;
+	combiner->description_count = 0;
+	combiner->command_complete_count = 0;
+	combiner->row_count = 0;
+	return ok;
+}
+
+/*
+ * Attach a list of simple aggregates to the combiner; the pointer is stored
+ * as given (no copy), replacing whatever list was set before.
+ */
+void
+AssignCombinerAggregates(ResponseCombiner combiner, List *simple_aggregates)
+{
+	combiner->simple_aggregates = simple_aggregates;
+}
diff --git a/src/backend/pgxc/pool/datanode.c b/src/backend/pgxc/pool/datanode.c
new file mode 100644
index 0000000000..9b3d40a785
--- /dev/null
+++ b/src/backend/pgxc/pool/datanode.c
@@ -0,0 +1,1701 @@
+/*-------------------------------------------------------------------------
+ *
+ * datanode.c
+ *
+ *	  Functions for the coordinator communicating with the data nodes
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * IDENTIFICATION
+ *	  $$
+ *
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include <sys/select.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include "pgxc/poolmgr.h"
+#include "access/gtm.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "postgres.h"
+#include "utils/snapmgr.h"
+#include "gtm/gtm_c.h"
+#include "pgxc/datanode.h"
+#include "../interfaces/libpq/libpq-fe.h"
+#include "utils/elog.h"
+#include "utils/memutils.h"
+
+
+#define NO_SOCKET -1			/* sock value of a handle without an open socket */
+
+static int	node_count = 0;		/* reset to 0 on init and by release_handles() */
+static DataNodeHandle *handles = NULL;	/* NumDataNodes entries, see InitMultinodeExecutor() */
+static bool autocommit = true;	/* cleared by DataNodeBegin(), set again by commit/rollback */
+static DataNodeHandle **write_node_list = NULL;	/* handles used by non-read-only statements */
+static int	write_node_count = 0;	/* number of used entries in write_node_list */
+
+static DataNodeHandle **get_handles(List *nodelist);
+static int	get_transaction_nodes(DataNodeHandle ** connections);
+static void release_handles(void);
+
+static void data_node_init(DataNodeHandle * handle, int sock);
+static void data_node_free(DataNodeHandle * handle);
+
+static int	data_node_begin(int conn_count, DataNodeHandle ** connections, ResponseCombiner combiner, GlobalTransactionId gxid);
+static int	data_node_commit(int conn_count, DataNodeHandle ** connections, ResponseCombiner combiner);
+static int	data_node_rollback(int conn_count, DataNodeHandle ** connections, ResponseCombiner combiner);
+
+static int	ensure_in_buffer_capacity(size_t bytes_needed, DataNodeHandle * handle);
+static int	ensure_out_buffer_capacity(size_t bytes_needed, DataNodeHandle * handle);
+
+static int	data_node_send_query(DataNodeHandle * handle, const char *query);
+static int	data_node_send_gxid(DataNodeHandle * handle, GlobalTransactionId gxid);
+static int	data_node_send_snapshot(DataNodeHandle * handle, Snapshot snapshot);
+
+static void add_error_message(DataNodeHandle * handle, const char *message);
+
+static int	data_node_read_data(DataNodeHandle * conn);
+static int	handle_response(DataNodeHandle * conn, ResponseCombiner combiner, bool inErrorState);
+
+static int	get_int(DataNodeHandle * conn, size_t len, int *out);
+static int	get_char(DataNodeHandle * conn, char *out);
+
+static void clear_write_node_list();
+
+#define MAX_STATEMENTS_PER_TRAN 10	/* statement counts above this share one histogram slot */
+
+/* Variables to collect statistics */
+static int	total_transactions = 0;
+static int	total_statements = 0;
+static int	total_autocommit = 0;	/* transactions counted while autocommit was on */
+static int	nonautocommit_2pc = 0;
+static int	autocommit_2pc = 0;
+static int	current_tran_statements = 0;	/* statements since the last counted transaction */
+static int *statements_per_transaction = NULL;	/* histogram, MAX_STATEMENTS_PER_TRAN + 1 slots */
+static int *nodes_per_transaction = NULL;	/* histogram, NumDataNodes slots */
+
+/*
+ * Statistics: account for one more statement, overall and in this transaction
+ */
+static void
+stat_statement()
+{
+	++current_tran_statements;
+	++total_statements;
+}
+
+/*
+ * To collect statistics: count a transaction that involved node_count data
+ * nodes.  A histogram whose allocation fails is skipped, not dereferenced.
+ */
+static void
+stat_transaction(int node_count)
+{
+	total_transactions++;
+	if (autocommit)
+		total_autocommit++;
+	/* Last slot collects transactions with more than the maximum */
+	if (!statements_per_transaction)
+		statements_per_transaction = (int *) calloc(MAX_STATEMENTS_PER_TRAN + 1, sizeof(int));
+	if (statements_per_transaction)
+	{
+		if (current_tran_statements > MAX_STATEMENTS_PER_TRAN)
+			statements_per_transaction[MAX_STATEMENTS_PER_TRAN]++;
+		else
+			statements_per_transaction[current_tran_statements]++;
+	}
+	current_tran_statements = 0;
+	if (node_count > 0 && node_count <= NumDataNodes)
+	{
+		if (!nodes_per_transaction)
+			nodes_per_transaction = (int *) calloc(NumDataNodes, sizeof(int));
+		if (nodes_per_transaction)
+			nodes_per_transaction[node_count - 1]++;
+	}
+}
+
+
+/*
+ * Statistics: record a two-phase commit under the current commit mode
+ */
+static void
+stat_2pc()
+{
+	if (!autocommit)
+		nonautocommit_2pc++;
+	else
+		autocommit_2pc++;
+}
+
+
+/*
+ * Output collected statistics to the log (DEBUG1).  Percentages use integer
+ * division by the number of transactions, hence the total_transactions guard.
+ */
+static void
+stat_log()
+{
+	int			i;
+
+	elog(DEBUG1, "Total Transactions: %d Total Statements: %d", total_transactions, total_statements);
+	elog(DEBUG1, "Autocommit: %d 2PC for Autocommit: %d 2PC for non-Autocommit: %d",
+		 total_autocommit, autocommit_2pc, nonautocommit_2pc);
+	if (total_transactions)
+	{
+		if (statements_per_transaction)
+		{
+			for (i = 0; i < MAX_STATEMENTS_PER_TRAN; i++)
+				elog(DEBUG1, "%d Statements per Transaction: %d (%d%%)",
+					 i, statements_per_transaction[i], statements_per_transaction[i] * 100 / total_transactions);
+			/* Overflow bucket; kept inside the NULL check of the array */
+			elog(DEBUG1, "%d+ Statements per Transaction: %d (%d%%)",
+				 MAX_STATEMENTS_PER_TRAN, statements_per_transaction[MAX_STATEMENTS_PER_TRAN], statements_per_transaction[MAX_STATEMENTS_PER_TRAN] * 100 / total_transactions);
+		}
+		if (nodes_per_transaction)
+		{
+			for (i = 0; i < NumDataNodes; i++)
+				elog(DEBUG1, "%d Nodes per Transaction: %d (%d%%)",
+					 i + 1, nodes_per_transaction[i], nodes_per_transaction[i] * 100 / total_transactions);
+		}
+	}
+}
+
+/*
+ * Allocate and initialize memory to store DataNode handles.
+ *
+ * One DataNodeHandle per data node (NumDataNodes) is set up with no socket
+ * and a 16kB input and output buffer each.  Once "handles" is set, later
+ * calls return immediately.
+ *
+ * palloc() never returns NULL: it raises ERROR itself when out of memory,
+ * so its results are not checked.  For the same reason the array is made
+ * visible in "handles" only after every buffer has been allocated;
+ * otherwise an error part-way would leave a half-initialized array that
+ * the early return below would accept on the next call.
+ */
+void
+InitMultinodeExecutor()
+{
+	int			i;
+	DataNodeHandle *new_handles;
+
+	/* This function could get called multiple times because of sigjmp */
+	if (handles != NULL)
+		return;
+
+	/*
+	 * Should be in TopMemoryContext.
+	 * Assume the caller takes care of context switching
+	 */
+	new_handles = (DataNodeHandle *) palloc(NumDataNodes * sizeof(DataNodeHandle));
+
+	/* initialize storage then */
+	for (i = 0; i < NumDataNodes; i++)
+	{
+		/*
+		 * Socket descriptor is small non-negative integer,
+		 * Indicate the handle is not initialized yet
+		 */
+		new_handles[i].sock = NO_SOCKET;
+
+		/* Initialise buffers */
+		new_handles[i].error = NULL;
+		new_handles[i].outSize = 16 * 1024;
+		new_handles[i].outBuffer = (char *) palloc(new_handles[i].outSize);
+		new_handles[i].inSize = 16 * 1024;
+		new_handles[i].inBuffer = (char *) palloc(new_handles[i].inSize);
+	}
+
+	/* Everything is allocated, publish the array */
+	handles = new_handles;
+	node_count = 0;
+}
+
+/*
+ * Builds up a connection string of the form
+ * "host=... port=... dbname=... user=... password=..." from its parts.
+ *
+ * Returns a palloc'd copy of the string, or NULL if formatting failed or
+ * the result does not fit into the 256-byte work buffer.
+ */
+char *
+DataNodeConnStr(char *host, char *port, char *dbname,
+				char *user, char *password)
+{
+	char		work[256];
+	char	   *result;
+	int			written;
+
+	written = snprintf(work, sizeof(work),
+					   "host=%s port=%s dbname=%s user=%s password=%s",
+					   host, port, dbname, user, password);
+
+	/* Reject empty output, errors and truncation */
+	if (written <= 0 || written >= sizeof(work))
+		return NULL;
+
+	/* Hand back a copy sized for the text plus its terminator */
+	result = (char *) palloc(written + 1);
+	strcpy(result, work);
+	return result;
+}
+
+
+/*
+ * Connect to a Data Node using a connection string.
+ *
+ * The PGconn returned by libpq's PQconnectdb() is handed back as an opaque
+ * NODE_CONNECTION pointer; the connection status is not examined here
+ * (DataNodeConnected() performs that check).
+ */
+NODE_CONNECTION *
+DataNodeConnect(char *connstr)
+{
+	return (NODE_CONNECTION *) PQconnectdb(connstr);
+}
+
+
+/* Close a Data Node connection: the handle is cast back to a libpq PGconn
+ * and passed to PQfinish(). */
+void
+DataNodeClose(NODE_CONNECTION * conn)
+{
+	PGconn	   *pgconn = (PGconn *) conn;
+
+	PQfinish(pgconn);
+}
+
+
+/*
+ * Report whether a Data Node connection is usable: non-zero if the
+ * connection exists and libpq reports CONNECTION_OK, zero otherwise.
+ */
+int
+DataNodeConnected(NODE_CONNECTION * conn)
+{
+	PGconn	   *pgconn = (PGconn *) conn;
+
+	if (pgconn == NULL)
+		return 0;
+
+	/* Simple status check only; readiness for a query is not verified */
+	return PQstatus(pgconn) == CONNECTION_OK;
+}
+
+
+
+/*
+ * Close this process' copy of the handle's socket and mark the handle as
+ * socket-less.  The handle itself and its buffers are not freed here.
+ */
+static void
+data_node_free(DataNodeHandle * handle)
+{
+	int			fd = handle->sock;
+
+	handle->sock = NO_SOCKET;
+	close(fd);
+}
+
+
+/*
+ * Attach a socket descriptor to a handle and reset the handle's state:
+ * idle connection, transaction status 'I', no error message, and empty
+ * input and output buffers (the buffers themselves are kept).
+ */
+static void
+data_node_init(DataNodeHandle * handle, int sock)
+{
+	/* Nothing buffered in either direction */
+	handle->inStart = handle->inCursor = handle->inEnd = 0;
+	handle->outEnd = 0;
+
+	handle->error = NULL;
+	handle->state = DN_CONNECTION_STATE_IDLE;
+	handle->transaction_status = 'I';
+	handle->sock = sock;
+}
+
+
+/*
+ * Handle responses from the Data node connections: select() on them, read
+ * what arrives and pass complete messages to handle_response() until each
+ * connection is finished or dropped.  "timeout" is handed to select() as is.
+ * Returns 0 if everything completed cleanly, EOF if any error was seen.
+ */
+static int
+data_node_receive_responses(int conn_count, DataNodeHandle ** connections,
+						 struct timeval * timeout, ResponseCombiner combiner)
+{
+	int			result = 0;
+	bool		inErrorState = false;
+
+	int			count = conn_count;
+	DataNodeHandle *to_receive[conn_count];
+
+	/* make a copy of the pointers to the connections */
+	memcpy(to_receive, connections, conn_count * sizeof(DataNodeHandle *));
+
+	/*
+	 * Read results.
+	 * Note we try and read from data node connections even if there is an error on one,
+	 * so as to avoid reading incorrect results on the next statement.
+	 * It might be better to just destroy these connections and tell the pool manager.
+	 */
+	while (count > 0)
+	{
+		int			i,
+					res_select,
+					nfds = 0;
+		fd_set		readfds;
+
+		FD_ZERO(&readfds);
+		for (i = 0; i < count; i++)
+		{
+			/* drop connections that are missing, failed or unusable in an fd_set */
+			if (!to_receive[i]
+				|| to_receive[i]->state == DN_CONNECTION_STATE_ERROR
+				|| to_receive[i]->sock < 0
+				|| to_receive[i]->sock >= FD_SETSIZE)
+			{
+				result = EOF;
+
+				/* Handling is done, do not track this connection */
+				count--;
+
+				/* Move last connection in its place */
+				if (i < count)
+				{
+					to_receive[i] = to_receive[count];
+					/* stay on the current position */
+					i--;
+				}
+				continue;
+			}
+
+			/* prepare select params */
+			if (nfds < to_receive[i]->sock)
+				nfds = to_receive[i]->sock;
+
+			FD_SET		(to_receive[i]->sock, &readfds);
+		}
+
+		/* Make sure we still have valid connections */
+		if (count == 0)
+			break;
+
+retry:
+		res_select = select(nfds + 1, &readfds, NULL, NULL, timeout);
+		if (res_select < 0)
+		{
+			/* error - retry if EINTR or EAGAIN */
+			if (errno == EINTR || errno == EAGAIN)
+				goto retry;
+
+			/*
+			 * PGXCTODO - we may want to close the connections and notify the
+			 * pooler that these are invalid.
+			 */
+			if (errno == EBADF)
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_CONNECTION_FAILURE),
+						 errmsg("select() bad file descriptor set")));
+				return EOF;
+			}
+			ereport(ERROR,
+					(errcode(ERRCODE_CONNECTION_FAILURE),
+					 errmsg("select() error: %d", errno)));
+			return EOF;
+		}
+
+		if (res_select == 0)
+		{
+			/* Timeout: remember the failure; NOTE(review): the loop keeps waiting */
+			result = EOF;
+		}
+
+		/* read data */
+		for (i = 0; i < count; i++)
+		{
+			DataNodeHandle *conn = to_receive[i];
+
+			if (FD_ISSET(conn->sock, &readfds))
+			{
+				int			read_status = data_node_read_data(conn);
+
+				if (read_status == EOF || read_status < 0)
+				{
+					count--;
+					/* Move last connection in place */
+					if (i < count)
+					{
+						to_receive[i] = to_receive[count];
+						/* stay on the current position */
+						i--;
+					}
+
+					inErrorState = true;
+					result = EOF;
+					continue;
+				}
+			}
+
+			if (conn->inStart < conn->inEnd)
+			{
+				if (handle_response(conn, combiner, inErrorState) == 0)
+				{
+					/* Handling is done, do not track this connection */
+					count--;
+					/* Move last connection in place */
+					if (i < count)
+					{
+						to_receive[i] = to_receive[count];
+						/* stay on the current position */
+						i--;
+					}
+				}
+
+				/*
+				 * See if we flagged an error on connection. Note, if
+				 * handle_response was not 0 above, an error occurred, we
+				 * still need to consume the ReadyForQuery message
+				 */
+				if (conn->state == DN_CONNECTION_STATE_ERROR)
+				{
+					inErrorState = true;
+					result = EOF;
+				}
+			}
+		}
+	}
+
+	return result;
+}
+
+/*
+ * Read up incoming messages from the Data node connection into conn->inBuffer.
+ * Returns 1 if data was read, 0 if none, EOF on a bad socket or zero read, -1 on error.
+ */
+static int
+data_node_read_data(DataNodeHandle * conn)
+{
+	int			someread = 0;
+	int			nread;
+
+	if (conn->sock < 0)
+	{
+		add_error_message(conn, "bad socket");
+		return EOF;
+	}
+
+	/* Left-justify any data in the buffer to make room */
+	if (conn->inStart < conn->inEnd)
+	{
+		if (conn->inStart > 0)
+		{
+			memmove(conn->inBuffer, conn->inBuffer + conn->inStart,
+					conn->inEnd - conn->inStart);
+			conn->inEnd -= conn->inStart;
+			conn->inCursor -= conn->inStart;
+			conn->inStart = 0;
+		}
+	}
+	else
+	{
+		/* buffer is logically empty, reset it */
+		conn->inStart = conn->inCursor = conn->inEnd = 0;
+	}
+
+	/*
+	 * If the buffer is fairly full, enlarge it. We need to be able to enlarge
+	 * the buffer in case a single message exceeds the initial buffer size. We
+	 * enlarge before filling the buffer entirely so as to avoid asking the
+	 * kernel for a partial packet. The magic constant here should be large
+	 * enough for a TCP packet or Unix pipe bufferload.  8K is the usual pipe
+	 * buffer size, so...
+	 */
+	if (conn->inSize - conn->inEnd < 8192)
+	{
+		if (ensure_in_buffer_capacity(conn->inEnd + (size_t) 8192, conn) != 0)
+		{
+			/*
+			 * We don't insist that the enlarge worked, but we need some room
+			 */
+			if (conn->inSize - conn->inEnd < 100)
+			{
+				add_error_message(conn, "can not allocate buffer");
+				return -1;
+			}
+		}
+	}
+
+retry:
+	nread = recv(conn->sock, conn->inBuffer + conn->inEnd,
+				 conn->inSize - conn->inEnd, 0);
+
+	if (nread < 0)
+	{
+		elog(DEBUG1, "dnrd errno = %d", errno);
+		if (errno == EINTR)
+			goto retry;
+		/* Some systems return EAGAIN/EWOULDBLOCK for no data */
+#ifdef EAGAIN
+		if (errno == EAGAIN)
+			return someread;
+#endif
+#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN))
+		if (errno == EWOULDBLOCK)
+			return someread;
+#endif
+		/* We might get ECONNRESET here if using TCP and backend died */
+#ifdef ECONNRESET
+		if (errno == ECONNRESET)
+		{
+			/*
+			 * The connection was reset.  Record an error message, flag the
+			 * connection as failed and close our socket.
+			 */
+			add_error_message(conn,
+							  "data node closed the connection unexpectedly\n"
+				"\tThis probably means the data node terminated abnormally\n"
+							  "\tbefore or while processing the request.\n");
+			conn->state = DN_CONNECTION_STATE_ERROR;	/* No more connection to
+														 * backend */
+			closesocket(conn->sock);
+			conn->sock = NO_SOCKET;
+
+			return -1;
+		}
+#endif
+		add_error_message(conn, "could not receive data from server");
+		return -1;
+	}
+
+	if (nread > 0)
+	{
+		conn->inEnd += nread;
+
+		/*
+		 * Hack to deal with the fact that some kernels will only give us back
+		 * 1 packet per recv() call, even if we asked for more and there is
+		 * more available.	If it looks like we are reading a long message,
+		 * loop back to recv() again immediately, until we run out of data or
+		 * buffer space.  Without this, the block-and-restart behavior of
+		 * libpq's higher levels leads to O(N^2) performance on long messages.
+		 *
+		 * Since we left-justified the data above, conn->inEnd gives the
+		 * amount of data already read in the current message.	We consider
+		 * the message "long" once we have acquired 32k ...
+		 */
+		if (conn->inEnd > 32768 &&
+			(conn->inSize - conn->inEnd) >= 8192)
+		{
+			someread = 1;
+			goto retry;
+		}
+		return 1;
+	}
+
+	if (nread == 0)
+	{
+		elog(DEBUG1, "nread returned 0");
+		return EOF;				/* zero-length read is reported as EOF */
+	}
+
+	if (someread)
+		return 1;				/* got a zero read after successful tries */
+
+	return 0;					/* NOTE(review): looks unreachable, every nread case returned above */
+}
+
+/*
+ * Fetch the byte at the cursor of the connection's input buffer into *out
+ * and step the cursor past it.  Returns 0, or EOF if no unread byte is left.
+ */
+static int
+get_char(DataNodeHandle * conn, char *out)
+{
+	if (conn->inCursor >= conn->inEnd)
+		return EOF;
+	*out = conn->inBuffer[conn->inCursor];
+	conn->inCursor++;
+	return 0;
+}
+
+/*
+ * Decode a 2- or 4-byte network-order integer at the input buffer cursor
+ * into *out and advance the cursor.  Returns 0, or EOF on short data or size.
+ */
+static int
+get_int(DataNodeHandle * conn, size_t len, int *out)
+{
+	const char *src = conn->inBuffer + conn->inCursor;
+	unsigned short net16;
+	unsigned int net32;
+
+	if (conn->inCursor + len > conn->inEnd)
+		return EOF;
+	if (len == 2)
+	{
+		memcpy(&net16, src, 2);
+		*out = (int) ntohs(net16);
+	}
+	else if (len == 4)
+	{
+		memcpy(&net32, src, 4);
+		*out = (int) ntohl(net32);
+	}
+	else
+	{
+		add_error_message(conn, "not supported int size");
+		return EOF;
+	}
+	conn->inCursor += len;
+	return 0;
+}
+
+/*
+ * Process the messages buffered on the connection, updating the combiner.
+ * If we are in an error state we just consume the messages, and do not proxy.
+ * Returns 0 once ReadyForQuery was consumed without error; EOF if more data
+ * is needed, an error response was seen, or protocol sync was lost.
+ */
+static int
+handle_response(DataNodeHandle * conn, ResponseCombiner combiner, bool inErrorState)
+{
+	char		msg_type;
+	int			msg_len;
+	bool		connError = false;
+
+	for (;;)
+	{
+		/* try to read the message, return if not enough data */
+		conn->inCursor = conn->inStart;
+		if (conn->inEnd - conn->inCursor < 5)
+			return EOF;
+
+		if (get_char(conn, &msg_type))
+			return EOF;
+		if (get_int(conn, 4, &msg_len))
+			return EOF;
+
+		/* The length word counts itself; less than 4 means sync is lost */
+		msg_len -= 4;
+		if (msg_len < 0)
+		{
+			conn->state = DN_CONNECTION_STATE_ERROR;
+			return EOF;
+		}
+
+		if (conn->inEnd - conn->inCursor < msg_len)
+		{
+			ensure_in_buffer_capacity(conn->inCursor + (size_t) msg_len, conn);
+			return EOF;
+		}
+
+		/* TODO handle other possible responses */
+		switch (msg_type)
+		{
+			case 'C':			/* CommandComplete */
+				/* no need to parse, just move cursor */
+				conn->inCursor += msg_len;
+				conn->state = DN_CONNECTION_STATE_COMPLETED;
+				if (!inErrorState)
+					CombineResponse(combiner, msg_type,
+									conn->inBuffer + conn->inStart + 5,
+									conn->inCursor - conn->inStart - 5);
+				break;
+			case 'T':			/* RowDescription */
+			case 'G':			/* CopyInResponse */
+			case 'H':			/* CopyOutResponse */
+			case 'D':			/* DataRow */
+				/* no need to parse, just move cursor */
+				conn->inCursor += msg_len;
+				if (!inErrorState)
+					CombineResponse(combiner, msg_type,
+									conn->inBuffer + conn->inStart + 5,
+									conn->inCursor - conn->inStart - 5);
+				break;
+			case 'E':			/* ErrorResponse */
+				/* no need to parse, just move cursor */
+				conn->inCursor += msg_len;
+				if (!inErrorState)
+					CombineResponse(combiner, msg_type,
+									conn->inBuffer + conn->inStart + 5,
+									conn->inCursor - conn->inStart - 5);
+				conn->inStart = conn->inCursor;
+				connError = inErrorState = true;
+				/*
+				 * Do not return with an error, we still need to consume Z,
+				 * ready-for-query; the connection state is flagged there
+				 */
+				break;
+			case 'A':			/* NotificationResponse */
+			case 'N':			/* NoticeResponse */
+				conn->inCursor += msg_len;
+
+				/*
+				 * Ignore these to prevent multiple messages, one from each
+				 * node. Coordinator will send one for DDL anyway
+				 */
+				break;
+			case 'Z':			/* ReadyForQuery */
+				get_char(conn, &conn->transaction_status);
+				conn->state = DN_CONNECTION_STATE_IDLE;
+				conn->inStart = conn->inCursor;
+				/* Now it is ok to flag the connection as having an error */
+				if (connError)
+				{
+					conn->state = DN_CONNECTION_STATE_ERROR;
+					return EOF;
+				}
+				return 0;
+			case 'I':			/* EmptyQuery */
+			default:
+				/* sync lost? */
+				conn->state = DN_CONNECTION_STATE_ERROR;
+				return EOF;
+		}
+		conn->inStart = conn->inCursor;
+	}
+	return EOF;
+}
+
+
+/*
+ * Start a transaction on each of the given connections: send the global
+ * transaction id first when gxid is valid, then BEGIN, and finally collect
+ * the responses of all nodes.  Returns 0 on success, EOF on any failure.
+ */
+static int
+data_node_begin(int conn_count, DataNodeHandle ** connections, ResponseCombiner combiner, GlobalTransactionId gxid)
+{
+	int			idx = 0;
+
+	while (idx < conn_count)
+	{
+		DataNodeHandle *node = connections[idx++];
+
+		/* The gxid, if there is one, has to precede the BEGIN */
+		if (GlobalTransactionIdIsValid(gxid) && data_node_send_gxid(node, gxid))
+			return EOF;
+
+		if (data_node_send_query(node, "BEGIN"))
+			return EOF;
+	}
+
+	/* Wait for the answers without a timeout */
+	if (data_node_receive_responses(conn_count, connections, NULL, combiner) != 0)
+		return EOF;
+	return 0;
+}
+
+
+/* Clears the write node list; the array is malloc'ed once, on first use */
+static void
+clear_write_node_list()
+{
+	if (write_node_list == NULL)
+		write_node_list = (DataNodeHandle **) malloc(NumDataNodes * sizeof(DataNodeHandle *));
+	/* malloc(0) may legitimately return NULL, so only a real failure errors */
+	if (write_node_list == NULL && NumDataNodes > 0)
+		ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory")));
+	write_node_count = 0;
+}
+
+
+/*
+ * Switch autocommit mode off and empty the list of nodes written to
+ */
+void
+DataNodeBegin(void)
+{
+	autocommit = false;
+	clear_write_node_list();
+}
+
+
+/*
+ * Commit current transaction, use two-phase commit if necessary.
+ * Returns 0 on success (also when there was nothing to commit) and EOF if
+ * the commit failed; in the latter case handles and autocommit are left as is.
+ */
+int
+DataNodeCommit(CommandDest dest)
+{
+	int			res;
+	int			tran_count;
+	/* never declare a zero-length array, node_count may be 0 here */
+	DataNodeHandle *connections[node_count > 0 ? node_count : 1];
+	ResponseCombiner combiner;
+
+	/* Quick check to make sure we have connections */
+	if (node_count == 0)
+		goto finish;
+
+	/* gather connections to commit */
+	tran_count = get_transaction_nodes(connections);
+
+	/* No open transactions: nothing to commit, just report success */
+	if (tran_count == 0)
+		goto finish;
+
+	combiner = CreateResponseCombiner(tran_count,
+									  COMBINE_TYPE_NONE, dest);
+	res = data_node_commit(tran_count, connections, combiner);
+	if (!ValidateAndCloseCombiner(combiner) || res)
+		return EOF;
+
+finish:
+	/* In autocommit mode statistics is collected in DataNodeExec */
+	if (!autocommit)
+		stat_transaction(node_count);
+	if (!PersistentConnections)
+		release_handles();
+	autocommit = true;
+	clear_write_node_list();
+	return 0;
+}
+
+
+/*
+ * Send COMMIT or PREPARE/COMMIT PREPARED down to the Data nodes and handle
+ * responses.  Two-phase commit is used only if more than one node was
+ * written to.  Returns 0 on success, EOF on failure.
+ */
+static int
+data_node_commit(int conn_count, DataNodeHandle ** connections, ResponseCombiner combiner)
+{
+	int			i;
+	struct timeval *timeout = NULL;
+	char		buffer[256];
+	GlobalTransactionId gxid = InvalidGlobalTransactionId;
+	int			result = 0;
+
+	/* can set this to false to disable temporarily */
+	/* bool do2PC = conn_count > 1; */
+	/*
+	 * Only use 2PC if more than one node was written to. Otherwise, just send
+	 * COMMIT to all
+	 */
+	bool		do2PC = write_node_count > 1;
+
+	/* Extra XID for Two Phase Commit */
+	GlobalTransactionId two_phase_xid = 0;
+
+	if (do2PC)
+	{
+		stat_2pc();
+
+		/*
+		 * Formally we should be using GetCurrentGlobalTransactionIdIfAny() here,
+		 * but since we need 2pc, we surely have sent down a command and got
+		 * gxid for it. Hence GetCurrentGlobalTransactionId() just returns
+		 * already allocated gxid
+		 */
+/* #ifdef PGXC_COORD  */
+		gxid = GetCurrentGlobalTransactionId();
+/* #endif */
+
+		sprintf(buffer, "PREPARE TRANSACTION 'T%d'", gxid);
+		/* Send PREPARE; the prepared transaction is named after the gxid */
+		for (i = 0; i < conn_count; i++)
+		{
+			if (data_node_send_query(connections[i], buffer))
+				return EOF;
+		}
+
+		/* Receive responses */
+		if (data_node_receive_responses(conn_count, connections, timeout, combiner))
+			return EOF;
+
+		/* Reset combiner so it can collect the second round of responses */
+		if (!ValidateAndResetCombiner(combiner))
+			return EOF;
+	}
+
+	if (!do2PC)
+		strcpy(buffer, "COMMIT");
+	else
+	{
+		sprintf(buffer, "COMMIT PREPARED 'T%d'", gxid);
+
+		/* We need to use a new xid, the data nodes have reset */
+		two_phase_xid = BeginTranGTM();
+		for (i = 0; i < conn_count; i++)
+		{
+			if (data_node_send_gxid(connections[i], two_phase_xid))
+			{
+				add_error_message(connections[i], "Can not send request");
+				result = EOF;
+				goto finish;
+			}
+		}
+	}
+
+	/* Send COMMIT (or COMMIT PREPARED, whichever is in buffer) */
+	for (i = 0; i < conn_count; i++)
+	{
+		if (data_node_send_query(connections[i], buffer))
+		{
+			result = EOF;
+			goto finish;
+		}
+	}
+
+	/* Receive responses */
+	if (data_node_receive_responses(conn_count, connections, timeout, combiner))
+		result = EOF;
+
+finish:
+	if (do2PC)
+		CommitTranGTM((GlobalTransactionId) two_phase_xid);	/* runs on success and failure alike */
+
+	return result;
+}
+
+
+/*
+ * Rollback current transaction.
+ * Returns 0 on success (also when there was nothing to roll back), EOF if
+ * the rollback failed; local cleanup at "finish" is done in either case.
+ */
+int
+DataNodeRollback(CommandDest dest)
+{
+	int			res = 0;
+	int			tran_count;
+	/* never declare a zero-length array, node_count may be 0 here */
+	DataNodeHandle *connections[node_count > 0 ? node_count : 1];
+	ResponseCombiner combiner;
+	int			i;
+
+	/* Quick check to make sure we have connections */
+	if (node_count == 0)
+		goto finish;
+
+	/* gather connections to rollback */
+	tran_count = get_transaction_nodes(connections);
+
+	/* No open transactions: nothing to rollback, just report success */
+	if (tran_count == 0)
+		goto finish;
+
+	combiner = CreateResponseCombiner(tran_count,
+									  COMBINE_TYPE_NONE, dest);
+	res = data_node_rollback(tran_count, connections, combiner);
+
+	/* Assume connection got cleaned up. Reset so we can reuse without error. */
+	for (i = 0; i < tran_count; i++)
+	{
+		connections[i]->transaction_status = 'I';
+		connections[i]->state = DN_CONNECTION_STATE_IDLE;
+	}
+
+	if (!ValidateAndCloseCombiner(combiner) || res)
+		res = EOF;
+
+finish:
+	/* In autocommit mode statistics is collected in DataNodeExec */
+	if (!autocommit)
+		stat_transaction(node_count);
+	if (!PersistentConnections)
+		release_handles();
+	autocommit = true;
+	clear_write_node_list();
+	return res;
+}
+
+
+/*
+ * Give every data node connection back to the pool manager and close this
+ * backend's copies of the sockets.  Does nothing if no handles are in use.
+ */
+static void
+release_handles(void)
+{
+	DataNodeHandle *handle;
+
+	if (node_count == 0)
+		return;
+
+	PoolManagerReleaseConnections();
+
+	/* Walk the whole handle array; slots without a socket are skipped */
+	for (handle = handles; handle < handles + NumDataNodes; handle++)
+		if (handle->sock != NO_SOCKET)
+			data_node_free(handle);
+	node_count = 0;
+}
+
+
+/*
+ * Send ROLLBACK command down to the Data nodes and handle responses.
+ * All connections are sent the command even if an earlier send failed.
+ * Returns 0 only if every send and every response succeeded, else EOF.
+ */
+static int
+data_node_rollback(int conn_count, DataNodeHandle ** connections, ResponseCombiner combiner)
+{
+	int			i;
+	struct timeval *timeout = NULL;
+	int			result = 0;
+
+	/* Send ROLLBACK, remembering (but not stopping at) a failed send */
+	for (i = 0; i < conn_count; i++)
+		if (data_node_send_query(connections[i], "ROLLBACK"))
+			result = EOF;
+
+	/* Receive responses */
+	if (data_node_receive_responses(conn_count, connections, timeout, combiner))
+		return EOF;
+
+	/* A failed send must not be reported as success */
+	return result;
+}
+
+
+/*
+ * Execute the statement on the given Data nodes (on all of them when nodelist
+ * is empty), combine responses and send results to dest; returns 0 or EOF
+ */
+int
+DataNodeExec(const char *query, List *nodelist, CommandDest dest, Snapshot snapshot,
+		   bool force_autocommit, List *simple_aggregates, bool is_read_only)
+{
+	int			i;
+	int			j;
+	int			conn_count = list_length(nodelist) == 0 ? NumDataNodes : list_length(nodelist);
+	struct timeval *timeout = NULL;		/* wait forever */
+	ResponseCombiner combiner;
+	int			res;
+	int			newCount = 0;
+	bool		need_tran;
+	bool		found;
+	GlobalTransactionId gxid = InvalidGlobalTransactionId;
+	DataNodeHandle *newConnections[conn_count];
+	DataNodeHandle **connections;
+
+	if (conn_count == 0)
+		return EOF;
+
+	connections = get_handles(nodelist);
+	if (!connections)
+		return EOF;
+
+	if (force_autocommit)
+		need_tran = false;
+	else
+		need_tran = !autocommit || conn_count > 1;
+
+	elog(DEBUG1, "autocommit = %s, conn_count = %d, need_tran = %s", autocommit ? "true" : "false", conn_count, need_tran ? "true" : "false");
+
+	stat_statement();
+	if (autocommit)
+		stat_transaction(conn_count);
+
+	/* We normally clear for transactions, but if autocommit, clear here, too */
+	if (autocommit == true)
+	{
+		clear_write_node_list();
+	}
+
+	/* PGXCTODO - status of connections is not checked here yet */
+
+	/*
+	 * We want to track new "write" nodes, and new nodes in the current
+	 * transaction whether or not they are write nodes.
+	 */
+	if (!is_read_only && write_node_count < NumDataNodes)
+	{
+		for (i = 0; i < conn_count; i++)
+		{
+			found = false;
+			for (j = 0; j < write_node_count && !found; j++)
+			{
+				if (write_node_list[j] == connections[i])
+					found = true;
+			}
+			if (!found)
+			{
+				/* Add to transaction wide-list */
+				write_node_list[write_node_count++] = connections[i];
+				/* Add to current statement list */
+				newConnections[newCount++] = connections[i];
+			}
+		}
+		/* PGXCTODO - check connection state is DN_CONNECTION_STATE_IDLE */
+	}
+
+	gxid = GetCurrentGlobalTransactionId();
+
+	if (!GlobalTransactionIdIsValid(gxid))
+	{
+		pfree(connections);
+		return EOF;
+	}
+	if (newCount > 0 && need_tran)
+	{
+		combiner = CreateResponseCombiner(newCount, COMBINE_TYPE_NONE, DestNone);
+
+		/* Start transaction on connections where it is not started */
+		res = data_node_begin(newCount, newConnections, combiner, gxid);
+		if (!ValidateAndCloseCombiner(combiner) || res)
+		{
+			pfree(connections);
+			return EOF;
+		}
+	}
+
+	/* Send query to nodes */
+	for (i = 0; i < conn_count; i++)
+	{
+		/* If explicit transaction is needed gxid is already sent */
+		if (!need_tran && data_node_send_gxid(connections[i], gxid))
+		{
+			add_error_message(connections[i], "Can not send request");
+			pfree(connections);
+			return EOF;
+		}
+		if (snapshot && data_node_send_snapshot(connections[i], snapshot))
+		{
+			add_error_message(connections[i], "Can not send request");
+			pfree(connections);
+			return EOF;
+		}
+		if (data_node_send_query(connections[i], query) != 0)
+		{
+			add_error_message(connections[i], "Can not send request");
+			pfree(connections);
+			return EOF;
+		}
+	}
+
+	combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_SUM, dest);
+	AssignCombinerAggregates(combiner, simple_aggregates);
+
+	/* Receive responses */
+	res = data_node_receive_responses(conn_count, connections, timeout, combiner);
+	if (!ValidateAndCloseCombiner(combiner) || res)
+	{
+		if (autocommit)
+		{
+			if (need_tran)
+				DataNodeRollback(DestNone);
+			else if (!PersistentConnections)
+				release_handles();
+		}
+
+		pfree(connections);
+		return EOF;
+	}
+
+	if (autocommit)
+	{
+		if (need_tran)
+			DataNodeCommit(DestNone);	/* PGXCTODO - call CommitTransaction()
+										 * instead? */
+		else if (!PersistentConnections)
+			release_handles();
+	}
+
+	/* Verify status? */
+	pfree(connections);
+	return 0;
+}
+
+
+/*
+ * Ensure bytes_needed bytes fit in the incoming buffer, enlarging it by
+ * doubling or, failing that, in 8K steps.  Returns 0 if OK, EOF if not.
+ */
+static int
+ensure_in_buffer_capacity(size_t bytes_needed, DataNodeHandle * handle)
+{
+	int			newsize = handle->inSize;
+	char	   *newbuf;
+
+	if (bytes_needed <= (size_t) newsize)
+		return 0;
+
+	do
+	{
+		newsize *= 2;
+	} while (newsize > 0 && bytes_needed > (size_t) newsize);
+
+	if (newsize > 0 && bytes_needed <= (size_t) newsize)
+	{
+		newbuf = repalloc(handle->inBuffer, newsize);
+		if (newbuf)
+		{
+			/* repalloc succeeded */
+			handle->inBuffer = newbuf;
+			handle->inSize = newsize;
+			return 0;
+		}
+	}
+
+	newsize = handle->inSize;
+	do
+	{
+		newsize += 8192;
+	} while (newsize > 0 && bytes_needed > (size_t) newsize);
+
+	if (newsize > 0 && bytes_needed <= (size_t) newsize)
+	{
+		newbuf = repalloc(handle->inBuffer, newsize);
+		if (newbuf)
+		{
+			/* repalloc succeeded */
+			handle->inBuffer = newbuf;
+			handle->inSize = newsize;
+			return 0;
+		}
+	}
+
+	return EOF;
+}
+
+
+/*
+ * Ensure bytes_needed bytes fit in the outgoing buffer, enlarging it by
+ * doubling or, failing that, in 8K steps.  Returns 0 if OK, EOF if not.
+ */
+static int
+ensure_out_buffer_capacity(size_t bytes_needed, DataNodeHandle * handle)
+{
+	int			newsize = handle->outSize;
+	char	   *newbuf;
+
+	if (bytes_needed <= (size_t) newsize)
+		return 0;
+
+	do
+	{
+		newsize *= 2;
+	} while (newsize > 0 && bytes_needed > (size_t) newsize);
+
+	if (newsize > 0 && bytes_needed <= (size_t) newsize)
+	{
+		newbuf = repalloc(handle->outBuffer, newsize);
+		if (newbuf)
+		{
+			/* repalloc succeeded */
+			handle->outBuffer = newbuf;
+			handle->outSize = newsize;
+			return 0;
+		}
+	}
+
+	newsize = handle->outSize;
+	do
+	{
+		newsize += 8192;
+	} while (newsize > 0 && bytes_needed > (size_t) newsize);
+
+	if (newsize > 0 && bytes_needed <= (size_t) newsize)
+	{
+		newbuf = repalloc(handle->outBuffer, newsize);
+		if (newbuf)
+		{
+			/* repalloc succeeded */
+			handle->outBuffer = newbuf;
+			handle->outSize = newsize;
+			return 0;
+		}
+	}
+
+	return EOF;
+}
+
+
+/*
+ * Send len bytes of the outgoing buffer: 0 = all sent, 1 = pending, -1 = error
+ */
+static int
+send_some(DataNodeHandle * handle, int len)
+{
+	char	   *ptr = handle->outBuffer;
+	int			remaining = handle->outEnd;
+	int			result = 0;
+
+	/* while there's still data to send */
+	while (len > 0)
+	{
+		int			sent;
+
+#ifndef WIN32
+		sent = send(handle->sock, ptr, len, 0);
+#else
+		/*
+		 * Windows can fail on large sends, per KB article Q201213. The failure-point
+		 * appears to be different in different versions of Windows, but 64k should
+		 * always be safe.
+		 */
+		sent = send(handle->sock, ptr, Min(len, 65536), 0);
+#endif
+
+		if (sent < 0)
+		{
+			/*
+			 * Anything except EAGAIN/EWOULDBLOCK/EINTR is trouble. If it's
+			 * EPIPE or ECONNRESET, assume we've lost the backend connection
+			 * permanently.
+			 */
+			switch (errno)
+			{
+#ifdef EAGAIN
+				case EAGAIN:
+					break;
+#endif
+#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN))
+				case EWOULDBLOCK:
+					break;
+#endif
+				case EINTR:
+					continue;
+
+				case EPIPE:
+#ifdef ECONNRESET
+				case ECONNRESET:
+#endif
+					add_error_message(handle, "server closed the connection unexpectedly\n"
+					"\tThis probably means the server terminated abnormally\n"
+							  "\tbefore or while processing the request.\n");
+
+					/*
+					 * We used to close the socket here, but that's a bad idea
+					 * since there might be unread data waiting (typically, a
+					 * NOTICE message from the backend telling us it's
+					 * committing hara-kiri...).  Leave the socket open until
+					 * the reader finds no more data can be read.  But abandon
+					 * attempt to send data.
+					 */
+					handle->outEnd = 0;
+					return -1;
+
+				default:
+					add_error_message(handle, "could not send data to server");
+					/* We don't assume it's a fatal error... */
+					handle->outEnd = 0;
+					return -1;
+			}
+		}
+		else
+		{
+			ptr += sent;
+			len -= sent;
+			remaining -= sent;
+		}
+
+		if (len > 0)
+		{
+			/*
+			 * We did not send it all
+			 * return 1 to indicate that data is still pending.
+			 */
+			result = 1;
+			break;
+		}
+	}
+
+	/* shift the remaining contents of the buffer */
+	if (remaining > 0)
+		memmove(handle->outBuffer, ptr, remaining);
+	handle->outEnd = remaining;
+
+	return result;
+}
+
+
+/*
+ * Queue a 'Q' (query) message for the Data node and send it; 0 if OK, else EOF
+ */
+static int
+data_node_send_query(DataNodeHandle * handle, const char *query)
+{
+	int			strLen = strlen(query) + 1;
+
+	/* length word (4 bytes) + query text with its terminating null */
+	int			msgLen = 4 + strLen;
+
+	/* make room for msgType (1 byte) + msgLen */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	handle->outBuffer[handle->outEnd++] = 'Q';
+	msgLen = htonl(msgLen);
+	memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4);
+	handle->outEnd += 4;
+	memcpy(handle->outBuffer + handle->outEnd, query, strLen);
+	handle->outEnd += strLen;
+
+	/* Send right away; a partial send (result 1) is not treated as failure */
+	if (send_some(handle, handle->outEnd) < 0)
+		return EOF;
+
+	handle->state = DN_CONNECTION_STATE_BUSY;
+
+	return 0;
+}
+
+
+/*
+ * Append a 'g' message carrying the GXID to the out buffer (not sent here)
+ */
+static int
+data_node_send_gxid(DataNodeHandle * handle, GlobalTransactionId gxid)
+{
+	int			msglen = 8;
+	int			i32;
+
+	/* make room for msgType (1 byte) + msgLen */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	handle->outBuffer[handle->outEnd++] = 'g';
+	msglen = htonl(msglen);
+	memcpy(handle->outBuffer + handle->outEnd, &msglen, 4);
+	handle->outEnd += 4;
+	i32 = htonl(gxid);
+	memcpy(handle->outBuffer + handle->outEnd, &i32, 4);
+	handle->outEnd += 4;
+
+	return 0;
+}
+
+
+/*
+ * Append an 's' message carrying the snapshot to the out buffer (not sent here)
+ */
+static int
+data_node_send_snapshot(DataNodeHandle * handle, Snapshot snapshot)
+{
+	int			msglen;
+	int			nval;
+	int			i;
+
+	/* length word + xmin + xmax + recent_global_xmin + xcnt, 4 bytes each */
+	msglen = 20;
+	if (snapshot->xcnt > 0)
+		msglen += snapshot->xcnt * 4;
+
+	/* make room for msgType (1 byte) + msgLen */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	handle->outBuffer[handle->outEnd++] = 's';
+	msglen = htonl(msglen);
+	memcpy(handle->outBuffer + handle->outEnd, &msglen, 4);
+	handle->outEnd += 4;
+
+	nval = htonl(snapshot->xmin);
+	memcpy(handle->outBuffer + handle->outEnd, &nval, 4);
+	handle->outEnd += 4;
+
+	nval = htonl(snapshot->xmax);
+	memcpy(handle->outBuffer + handle->outEnd, &nval, 4);
+	handle->outEnd += 4;
+
+	nval = htonl(snapshot->recent_global_xmin);
+	memcpy(handle->outBuffer + handle->outEnd, &nval, 4);
+	handle->outEnd += 4;
+
+	nval = htonl(snapshot->xcnt);
+	memcpy(handle->outBuffer + handle->outEnd, &nval, 4);
+	handle->outEnd += 4;
+
+	for (i = 0; i < snapshot->xcnt; i++)
+	{
+		nval = htonl(snapshot->xip[i]);
+		memcpy(handle->outBuffer + handle->outEnd, &nval, 4);
+		handle->outEnd += 4;
+	}
+
+	return 0;
+}
+
+/*
+ * Mark the connection as failed and remember the message to be returned back
+ * to the client at the convenient time; only the first message is kept so far
+ */
+static void
+add_error_message(DataNodeHandle * handle, const char *message)
+{
+	handle->transaction_status = 'E';
+	handle->state = DN_CONNECTION_STATE_ERROR;
+	if (handle->error)
+	{
+		/* PGXCTODO append */
+	}
+	else
+	{
+		handle->error = pstrdup(message);
+	}
+}
+
+/*
+ * For the specified node list return an array of DataNodeHandle pointers,
+ * acquiring connections from the pool where a handle has no socket yet.
+ * The length of the returned array is the same as that of nodelist.
+ * Special case is empty or NIL nodelist, in this case return all the nodes.
+ * Returns NULL if the pool could not supply the missing connections;
+ * otherwise the array should be pfree'd when no longer needed.
+ */
+static DataNodeHandle **
+get_handles(List *nodelist)
+{
+	DataNodeHandle **result;
+	ListCell   *node_list_item;
+	List	   *allocate = NIL;
+
+	/* index of the result array */
+	int			i = 0;
+
+	/* If node list is empty execute request on current nodes */
+	if (list_length(nodelist) == 0)
+	{
+		/*
+		 * We do not have to zero the array - on success all items will be set
+		 * to correct pointers, on error the array will be freed
+		 */
+		result = (DataNodeHandle **) palloc(NumDataNodes * sizeof(DataNodeHandle *));
+		if (!result)
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_OUT_OF_MEMORY),
+					 errmsg("out of memory")));
+		}
+
+		for (i = 0; i < NumDataNodes; i++)
+		{
+			result[i] = &handles[i];
+			if (handles[i].sock == NO_SOCKET)
+				allocate = lappend_int(allocate, i + 1);
+		}
+	}
+	else
+	{
+		/*
+		 * Same as above: every slot is assigned in the loop below
+		 */
+		result = (DataNodeHandle **) palloc(list_length(nodelist) * sizeof(DataNodeHandle *));
+		if (!result)
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_OUT_OF_MEMORY),
+					 errmsg("out of memory")));
+		}
+
+		i = 0;
+		foreach(node_list_item, nodelist)
+		{
+			/* node numbers are 1-based; use the list accessor, not the cell */
+			int			node = lfirst_int(node_list_item);
+
+			if (node > NumDataNodes || node <= 0)
+				elog(ERROR, "Node number: %d passed is not a known node", node);
+			result[i++] = &handles[node - 1];
+			if (handles[node - 1].sock == NO_SOCKET)
+				allocate = lappend_int(allocate, node);
+		}
+	}
+
+	if (allocate)
+	{
+		int			j = 0;
+		int		   *fds = PoolManagerGetConnections(allocate);
+
+		if (!fds)
+		{
+			pfree(result);
+			list_free(allocate);
+			return NULL;
+		}
+		foreach(node_list_item, allocate)
+		{
+			int			node = lfirst_int(node_list_item);
+			int			fdsock = fds[j++];
+
+			data_node_init(&handles[node - 1], fdsock);
+			node_count++;
+		}
+		pfree(fds);
+		list_free(allocate);
+	}
+
+	return result;
+}
+
+
+/*
+ * Collect the handles involved in the current transaction, so that commit or
+ * rollback can be run on them, as requested.
+ * Transaction is not started on a node when only read-only statements are
+ * executed on it, so such nodes need neither commit nor rollback.
+ * connections must have room for at least node_count DataNodeHandle pointers.
+ * Returns the number of pointers stored in connections; remaining items in
+ * the array, if any, are kept unchanged.
+ */
+static int
+get_transaction_nodes(DataNodeHandle ** connections)
+{
+	int			found = 0;
+	int			idx;
+
+	/* The scan is skipped entirely when the session holds no connections */
+	for (idx = 0; node_count && idx < NumDataNodes; idx++)
+	{
+		DataNodeHandle *candidate = &handles[idx];
+
+		/* keep handles that have a socket and a status other than 'I' */
+		if (candidate->sock != NO_SOCKET && candidate->transaction_status != 'I')
+			connections[found++] = candidate;
+	}
+
+	return found;
+}
+
+
+/*
+ * Called when the backend is ending; code and arg are not used here.
+ */
+void
+DataNodeCleanAndRelease(int code, Datum arg)
+{
+	/* Rollback on Data Nodes */
+	if (IsTransactionState())
+	{
+		DataNodeRollback(DestNone);
+
+		/* Rollback on GTM if transaction id opened. */
+		RollbackTranGTM((GlobalTransactionId) GetCurrentTransactionIdIfAny());
+	}
+
+	/* Release data node connections */
+	release_handles();
+
+	/* Close connection with GTM */
+	CloseGTM();
+
+	/* Dump collected statistics to the log */
+	stat_log();
+}
diff --git a/src/backend/pgxc/pool/poolcomm.c b/src/backend/pgxc/pool/poolcomm.c
new file mode 100644
index 0000000000..03b785f954
--- /dev/null
+++ b/src/backend/pgxc/pool/poolcomm.c
@@ -0,0 +1,614 @@
+/*-------------------------------------------------------------------------
+ *
+ * poolcomm.c
+ *
+ *	  Communication functions between the pool manager and session
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group 
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+#include <sys/stat.h>
+#include <sys/un.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stddef.h>
+#include "c.h"
+#include "pgxc/poolcomm.h"
+#include "utils/elog.h"
+#include "miscadmin.h"
+
+static int	pool_recvbuf(PoolPort * port);
+static int	pool_discardbytes(PoolPort * port, size_t len);
+
+#ifdef HAVE_UNIX_SOCKETS
+
+#define POOLER_UNIXSOCK_PATH(path, port, sockdir) \
+	snprintf(path, sizeof(path), "%s/.s.PGPOOL.%d", \
+			((sockdir) && *(sockdir) != '\0') ? (sockdir) : \
+			DEFAULT_PGSOCKET_DIR, \
+			(port))
+
+static char sock_path[MAXPGPATH];
+
+static int	Lock_AF_UNIX(unsigned short port, const char *unixSocketName);
+#endif
+
+/*
+ * Open server socket to accept connections from sessions; returns fd or -1
+ */
+int
+pool_listen(unsigned short port, const char *unixSocketName)
+{
+	int			fd,
+				len;
+	struct sockaddr_un unix_addr;
+
+#ifdef HAVE_UNIX_SOCKETS
+	/* build sock_path; it must also fit into sockaddr_un.sun_path */
+	if (Lock_AF_UNIX(port, unixSocketName) < 0 ||
+		strlen(sock_path) >= sizeof(unix_addr.sun_path))
+		return -1;
+
+	/* create a Unix domain stream socket */
+	if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
+		return -1;
+
+	/* fill in socket address structure */
+	memset(&unix_addr, 0, sizeof(unix_addr));
+	unix_addr.sun_family = AF_UNIX;
+	strcpy(unix_addr.sun_path, sock_path);
+	len = sizeof(unix_addr.sun_family) +
+		strlen(unix_addr.sun_path) + 1;
+
+	/* bind the name to the descriptor and tell kernel we're a server */
+	if (bind(fd, (struct sockaddr *) & unix_addr, len) == 0 &&
+		listen(fd, 5) == 0)
+		return fd;
+
+	close(fd);					/* do not leak the socket on failure */
+	return -1;
+#else
+	/* TODO support for non-unix platform */
+	ereport(FATAL,
+			(errcode(ERRCODE_INTERNAL_ERROR),
+			 errmsg("pool manager only supports UNIX socket")));
+	return -1;
+#endif
+}
+
+#ifdef HAVE_UNIX_SOCKETS
+static int
+Lock_AF_UNIX(unsigned short port, const char *unixSocketName)
+{
+	POOLER_UNIXSOCK_PATH(sock_path, port, unixSocketName);
+	/* sock_path now holds the pooler socket name for this port */
+	CreateSocketLockFile(sock_path, true);
+	/* remove any socket file left behind under the same name */
+	unlink(sock_path);
+	/* this function always reports success */
+	return 0;
+}
+#endif
+
+/*
+ * Connect to pooler listening on specified port; returns socket fd or -1
+ */
+int
+pool_connect(unsigned short port, const char *unixSocketName)
+{
+	int			fd,
+				len;
+	struct sockaddr_un unix_addr;
+
+#ifdef HAVE_UNIX_SOCKETS
+	/* server's socket path must fit into sockaddr_un.sun_path */
+	POOLER_UNIXSOCK_PATH(sock_path, port, unixSocketName);
+	if (strlen(sock_path) >= sizeof(unix_addr.sun_path))
+		return -1;
+
+	if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
+		return -1;
+
+	/* fill socket address structure w/server's addr */
+	memset(&unix_addr, 0, sizeof(unix_addr));
+	unix_addr.sun_family = AF_UNIX;
+	strcpy(unix_addr.sun_path, sock_path);
+	len = sizeof(unix_addr.sun_family) + strlen(unix_addr.sun_path) + 1;
+	if (connect(fd, (struct sockaddr *) & unix_addr, len) == 0)
+		return fd;
+	close(fd);					/* do not leak the socket on failure */
+	return -1;
+#else
+	/* TODO support for non-unix platform */
+	ereport(FATAL,
+			(errcode(ERRCODE_INTERNAL_ERROR),
+			 errmsg("pool manager only supports UNIX socket")));
+	return -1;
+#endif
+}
+
+
+/*
+ * Return next byte as 0..255, receiving more data if buffer is empty; else EOF
+ */
+int
+pool_getbyte(PoolPort * port)
+{
+	while (port->RecvPointer >= port->RecvLength)
+	{
+		if (pool_recvbuf(port)) /* If nothing in buffer, then recv some */
+			return EOF;			/* Failed to recv data */
+	}
+	return (unsigned char) port->RecvBuffer[port->RecvPointer++];
+}
+
+
+/*
+ * Return next buffered byte as 0..255, or EOF if the buffer is empty (no recv)
+ */
+int
+pool_pollbyte(PoolPort * port)
+{
+	if (port->RecvPointer >= port->RecvLength)
+	{
+		return EOF;				/* Empty buffer */
+	}
+	return (unsigned char) port->RecvBuffer[port->RecvPointer++];
+}
+
+
+/*
+ * Read a pooler message (4-byte length + body) into s; maxlen <= 0: no limit
+ */
+int
+pool_getmessage(PoolPort * port, StringInfo s, int maxlen)
+{
+	int32		len;
+
+	resetStringInfo(s);
+
+	/* Read message length word */
+	if (pool_getbytes(port, (char *) &len, 4) == EOF)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_PROTOCOL_VIOLATION),
+				 errmsg("unexpected EOF within message length word")));
+		return EOF;
+	}
+
+	len = ntohl(len);
+
+	if (len < 4 ||
+		(maxlen > 0 && len > maxlen))
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_PROTOCOL_VIOLATION),
+				 errmsg("invalid message length")));
+		return EOF;
+	}
+
+	len -= 4;					/* discount length itself */
+
+	if (len > 0)
+	{
+		/*
+		 * Allocate space for message.  If we run out of room (ridiculously
+		 * large message), discard the message body and re-throw the error
+		 */
+		PG_TRY();
+		{
+			enlargeStringInfo(s, len);
+		}
+		PG_CATCH();
+		{
+			if (pool_discardbytes(port, len) == EOF)
+				ereport(ERROR,
+						(errcode(ERRCODE_PROTOCOL_VIOLATION),
+						 errmsg("incomplete message from client")));
+			PG_RE_THROW();
+		}
+		PG_END_TRY();
+
+		/* And grab the message */
+		if (pool_getbytes(port, s->data, len) == EOF)
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_PROTOCOL_VIOLATION),
+					 errmsg("incomplete message from client")));
+			return EOF;
+		}
+		s->len = len;
+		/* Place a trailing null per StringInfo convention */
+		s->data[len] = '\0';
+	}
+
+	return 0;
+}
+
+
+/* --------------------------------
+ * pool_getbytes - get a known number of bytes from connection
+ * (copies exactly len bytes into s, receiving more data as needed)
+ * returns 0 if OK, EOF if trouble
+ * --------------------------------
+ */
+int
+pool_getbytes(PoolPort * port, char *s, size_t len)
+{
+	size_t		amount;
+
+	while (len > 0)
+	{
+		while (port->RecvPointer >= port->RecvLength)
+		{
+			if (pool_recvbuf(port))		/* If nothing in buffer, then recv
+										 * some */
+				return EOF;		/* Failed to recv data */
+		}
+		amount = port->RecvLength - port->RecvPointer;
+		if (amount > len)
+			amount = len;
+		memcpy(s, port->RecvBuffer + port->RecvPointer, amount);
+		port->RecvPointer += amount;
+		s += amount;
+		len -= amount;
+	}
+	return 0;
+}
+
+
+/* --------------------------------
+ * pool_discardbytes - discard a known number of bytes from connection
+ * (consumes exactly len bytes, receiving more data as needed)
+ * returns 0 if OK, EOF if trouble
+ * --------------------------------
+ */
+static int
+pool_discardbytes(PoolPort * port, size_t len)
+{
+	size_t		amount;
+
+	while (len > 0)
+	{
+		while (port->RecvPointer >= port->RecvLength)
+		{
+			if (pool_recvbuf(port))		/* If nothing in buffer, then recv
+										 * some */
+				return EOF;		/* Failed to recv data */
+		}
+		amount = port->RecvLength - port->RecvPointer;
+		if (amount > len)
+			amount = len;
+		port->RecvPointer += amount;
+		len -= amount;
+	}
+	return 0;
+}
+
+
+/* --------------------------------
+ * pool_recvbuf - load some bytes into the input buffer
+ * (unread data is first moved to the start of the buffer)
+ * returns 0 if OK, EOF if trouble
+ * --------------------------------
+ */
+static int
+pool_recvbuf(PoolPort * port)
+{
+	if (port->RecvPointer > 0)
+	{
+		if (port->RecvLength > port->RecvPointer)
+		{
+			/* still some unread data, left-justify it in the buffer */
+			memmove(port->RecvBuffer, port->RecvBuffer + port->RecvPointer,
+					port->RecvLength - port->RecvPointer);
+			port->RecvLength -= port->RecvPointer;
+			port->RecvPointer = 0;
+		}
+		else
+			port->RecvLength = port->RecvPointer = 0;
+	}
+
+	/* Can fill buffer from port->RecvLength and upwards */
+	for (;;)
+	{
+		int			r;
+
+		r = recv(Socket(*port), port->RecvBuffer + port->RecvLength,
+				 POOL_BUFFER_SIZE - port->RecvLength, 0);
+
+		if (r < 0)
+		{
+			if (errno == EINTR)
+				continue;		/* Ok if interrupted */
+
+			/*
+			 * Report broken connection
+			 */
+			ereport(LOG,
+					(errcode_for_socket_access(),
+					 errmsg("could not receive data from client: %m")));
+			return EOF;
+		}
+		if (r == 0)
+		{
+			/*
+			 * EOF detected.  We used to write a log message here, but it's
+			 * better to expect the ultimate caller to do that.
+			 */
+			return EOF;
+		}
+		/* r contains number of bytes read, so just incr length */
+		port->RecvLength += r;
+		return 0;
+	}
+}
+
+
+/*
+ * Append len bytes to the send buffer, flushing when it is full; 0 or EOF
+ */
+int
+pool_putbytes(PoolPort * port, const char *s, size_t len)
+{
+	size_t		amount;
+
+	while (len > 0)
+	{
+		/* If buffer is full, then flush it out */
+		if (port->SendPointer >= POOL_BUFFER_SIZE)
+			if (pool_flush(port))
+				return EOF;
+		amount = POOL_BUFFER_SIZE - port->SendPointer;
+		if (amount > len)
+			amount = len;
+		memcpy(port->SendBuffer + port->SendPointer, s, amount);
+		port->SendPointer += amount;
+		s += amount;
+		len -= amount;
+	}
+	return 0;
+}
+
+
+/* --------------------------------
+ *		pool_flush		- flush pending output
+ *		(a new send error is reported with ereport(ERROR))
+ *		returns 0 if OK, EOF if trouble
+ * --------------------------------
+ */
+int
+pool_flush(PoolPort * port)
+{
+	static int	last_reported_send_errno = 0;
+
+	char	   *bufptr = port->SendBuffer;
+	char	   *bufend = port->SendBuffer + port->SendPointer;
+
+	while (bufptr < bufend)
+	{
+		int			r;
+
+		r = send(Socket(*port), bufptr, bufend - bufptr, 0);
+
+		if (r <= 0)
+		{
+			if (errno == EINTR)
+				continue;		/* Ok if we were interrupted */
+
+			/*
+			 * Drop the buffered data before reporting: ereport(ERROR) does
+			 * not return, and stale bytes must not be sent by a later flush.
+			 */
+			port->SendPointer = 0;
+
+			if (errno != last_reported_send_errno)
+			{
+				last_reported_send_errno = errno;
+				ereport(ERROR,
+						(errcode_for_socket_access(),
+						 errmsg("could not send data to client: %m")));
+			}
+			return EOF;
+		}
+
+		last_reported_send_errno = 0;	/* reset after any successful send */
+		bufptr += r;
+	}
+
+	port->SendPointer = 0;
+	return 0;
+}
+
+
+/*
+ * Buffer a message: type byte, 4-byte length (len + 4), then len bytes of s
+ */
+int
+pool_putmessage(PoolPort * port, char msgtype, const char *s, size_t len)
+{
+	uint		n32;
+
+	if (pool_putbytes(port, &msgtype, 1))
+		return EOF;
+
+	n32 = htonl((uint32) (len + 4));
+	if (pool_putbytes(port, (char *) &n32, 4))
+		return EOF;
+
+	if (pool_putbytes(port, s, len))
+		return EOF;
+
+	return 0;
+}
+
+/* message code('f'), size(8), node_count */
+#define SEND_MSG_BUFFER_SIZE 9
+
+
+/*
+ * Build up a message carrying file descriptors and send them over specified
+ * connection; returns 0 if OK, EOF if trouble
+ */
+int
+pool_sendfds(PoolPort * port, int *fds, int count)
+{
+	struct iovec iov[1];
+	struct msghdr msg;
+	char		buf[SEND_MSG_BUFFER_SIZE];
+	uint		n32;
+	int			controllen = sizeof(struct cmsghdr) + count * sizeof(int);
+	struct cmsghdr *cmptr = NULL;
+
+	buf[0] = 'f';
+	n32 = htonl((uint32) 8);
+	memcpy(buf + 1, &n32, 4);
+	n32 = htonl((uint32) count);
+	memcpy(buf + 5, &n32, 4);
+
+	iov[0].iov_base = buf;
+	iov[0].iov_len = SEND_MSG_BUFFER_SIZE;
+	msg.msg_iov = iov;
+	msg.msg_iovlen = 1;
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	if (count == 0)
+	{
+		msg.msg_control = NULL;
+		msg.msg_controllen = 0;
+	}
+	else
+	{
+		if ((cmptr = malloc(controllen)) == NULL)
+			return EOF;
+		cmptr->cmsg_level = SOL_SOCKET;
+		cmptr->cmsg_type = SCM_RIGHTS;
+		cmptr->cmsg_len = controllen;
+		msg.msg_control = (caddr_t) cmptr;
+		msg.msg_controllen = controllen;
+		/* the fds to pass */
+		memcpy(CMSG_DATA(cmptr), fds, count * sizeof(int));
+	}
+
+	if (sendmsg(Socket(*port), &msg, 0) != SEND_MSG_BUFFER_SIZE)
+	{
+		if (cmptr)
+			free(cmptr);
+		return EOF;
+	}
+
+	if (cmptr)
+		free(cmptr);
+
+	return 0;
+}
+
+
+/*
+ * Read a message carrying count file descriptors into fds; 0 if OK, else EOF
+ */
+int
+pool_recvfds(PoolPort * port, int *fds, int count)
+{
+	int			r;
+	uint		n32;
+	char		buf[SEND_MSG_BUFFER_SIZE];
+	struct iovec iov[1];
+	struct msghdr msg;
+	int			controllen = sizeof(struct cmsghdr) + count * sizeof(int);
+	struct cmsghdr *cmptr = malloc(controllen);
+
+	if (cmptr == NULL)
+		return EOF;
+
+	iov[0].iov_base = buf;
+	iov[0].iov_len = SEND_MSG_BUFFER_SIZE;
+	msg.msg_iov = iov;
+	msg.msg_iovlen = 1;
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_control = (caddr_t) cmptr;
+	msg.msg_controllen = controllen;
+
+	r = recvmsg(Socket(*port), &msg, 0);
+	if (r < 0)
+	{
+		int			save_errno = errno;
+
+		free(cmptr);			/* ereport(ERROR) does not return */
+		errno = save_errno;		/* free() may clobber it; %m needs it */
+		ereport(ERROR,
+				(errcode_for_socket_access(),
+				 errmsg("could not receive data from client: %m")));
+	}
+	else if (r == 0)
+	{
+		goto failure;
+	}
+	else if (r != SEND_MSG_BUFFER_SIZE)
+	{
+		free(cmptr);			/* ereport(ERROR) does not return */
+		ereport(ERROR,
+				(errcode(ERRCODE_PROTOCOL_VIOLATION),
+				 errmsg("incomplete message from client")));
+	}
+
+	/* Verify response */
+	if (buf[0] != 'f')
+	{
+		free(cmptr);			/* ereport(ERROR) does not return */
+		ereport(ERROR,
+				(errcode(ERRCODE_PROTOCOL_VIOLATION),
+				 errmsg("unexpected message code")));
+	}
+
+	memcpy(&n32, buf + 1, 4);
+	n32 = ntohl(n32);
+	if (n32 != 8)
+	{
+		free(cmptr);			/* ereport(ERROR) does not return */
+		ereport(ERROR,
+				(errcode(ERRCODE_PROTOCOL_VIOLATION),
+				 errmsg("invalid message size")));
+	}
+
+	/*
+	 * If connection count is 0 it means pool does not have connections
+	 * to fulfill request. Otherwise number of returned connections
+	 * should be equal to requested count. If it is not the case consider
+	 * this a protocol violation. (Probably connection went out of sync)
+	 */
+	memcpy(&n32, buf + 5, 4);
+	n32 = ntohl(n32);
+	if (n32 == 0)
+	{
+		ereport(LOG,
+				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+				 errmsg("failed to acquire connections")));
+		goto failure;
+	}
+
+	if (n32 != count)
+	{
+		free(cmptr);			/* ereport(ERROR) does not return */
+		ereport(ERROR,
+				(errcode(ERRCODE_PROTOCOL_VIOLATION),
+				 errmsg("unexpected connection count")));
+	}
+
+	memcpy(fds, CMSG_DATA(cmptr), count * sizeof(int));
+	free(cmptr);
+	return 0;
+failure:
+	free(cmptr);
+	return EOF;
+}
diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c
new file mode 100644
index 0000000000..02e5ddd5cd
--- /dev/null
+++ b/src/backend/pgxc/pool/poolmgr.c
@@ -0,0 +1,1403 @@
+/*-------------------------------------------------------------------------
+ *
+ * poolmgr.c
+ *
+ *	  Connection pool manager handles connections to DataNodes
+ *
+ * The pooler runs as a separate process and is forked off from a 
+ * coordinator postmaster. If the coordinator needs a connection from a 
+ * data node, it asks for one from the pooler, which maintains separate
+ * pools for each data node. A group of connections can be requested in
+ * a single request, and the pooler returns a list of file descriptors 
+ * to use for the connections.
+ *
+ * Note the current implementation does not yet shrink the pool over time
+ * as connections are idle.  Also, it does not queue requests; if a 
+ * connection is unavailable, it will simply fail. This should be implemented
+ * one day, although there is a chance for deadlocks. For now, limiting
+ * connections should be done between the application and coordinator.
+ * Still, this is useful to avoid having to re-establish connections to the
+ * data nodes all the time for multiple coordinator backend sessions.
+ *
+ * The term "agent" here refers to a session manager, one for each backend
+ * coordinator connection to the pooler. It will contain a list of connections
+ * allocated to a session, at most one per data node.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * IDENTIFICATION
+ *	  $$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <signal.h>
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "pgxc/poolmgr.h"
+#include "utils/builtins.h"
+#include "utils/memutils.h"
+#include "lib/stringinfo.h"
+#include "libpq/pqformat.h"
+#include "pgxc/locator.h"
+#include "../interfaces/libpq/libpq-fe.h"
+#include "postmaster/postmaster.h"		/* For UnixSocketDir */
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+
+/*
+ * Configuration options
+ *
+ * NumDataNodes  - number of data nodes; it is the length of connectionInfos
+ *                 and of every per-node array (agent connections, node pools).
+ * MinPoolSize   - grow_pool() opens connections until a node pool holds at
+ *                 least this many.
+ * MaxPoolSize   - upper bound on connections per node pool, and the length
+ *                 of each node pool's slot array.
+ * PoolerPort    - passed together with UnixSocketDir to pool_listen() and
+ *                 pool_connect().
+ */
+int			NumDataNodes = 2;
+int			MinPoolSize = 1;
+int			MaxPoolSize = 100;
+int			PoolerPort = 6667;
+
+/* Not referenced anywhere else in this file */
+bool		PersistentConnections = false;
+
+/* The memory context; created in PoolManagerInit, deleted in PoolManagerDestroy */
+static MemoryContext PoolerMemoryContext = NULL;
+
+/*
+ * Connection info: comma-separated lists, split per data node by
+ * PoolManagerInit
+ */
+char	   *DataNodeHosts = NULL;
+char	   *DataNodePorts = NULL;
+char	   *DataNodeUsers = NULL;
+char	   *DataNodePwds = NULL;
+
+/* Connection info list, indexed by (node number - 1) */
+static DataNodeConnectionInfo *connectionInfos;
+
+/* Pool to all the databases (linked list, newest entry first) */
+static DatabasePool *databasePools = NULL;
+
+/* PoolAgents: unordered array, the first agentCount entries are valid */
+static int	agentCount = 0;
+static PoolAgent **poolAgents;
+
+/* Session side: handle saved by PoolManagerConnect for later requests */
+static PoolHandle *Handle = NULL;
+
+/* Pooler side: listening socket opened by PoolerLoop */
+static int	server_fd = -1;
+
+/* Agent (one per session connected to the pooler) */
+static void agent_init(PoolAgent * agent, const char *database, List *nodes);
+static void agent_destroy(PoolAgent * agent);
+static void agent_create(void);
+static void agent_handle_input(PoolAgent * agent, StringInfo s);
+/* Database pool list */
+static DatabasePool *create_database_pool(const char *database, List *nodes);
+static void insert_database_pool(DatabasePool * pool);
+static int	destroy_database_pool(const char *database);
+static DatabasePool *find_database_pool(const char *database);
+static DatabasePool *remove_database_pool(const char *database);
+/*
+ * Connection handling.  Note the two conventions: acquire_connection takes
+ * a 1-based node number, while release_connection and grow_pool take a
+ * 0-based index into the per-node arrays.
+ */
+static int *agent_acquire_connections(PoolAgent * agent, List *nodelist);
+static DataNodePoolSlot *acquire_connection(DatabasePool * dbPool, int node);
+static void agent_release_connections(PoolAgent * agent, bool clean);
+static void release_connection(DatabasePool * dbPool, DataNodePoolSlot * slot, int index, bool clean);
+static void destroy_slot(DataNodePoolSlot * slot);
+static void grow_pool(DatabasePool * dbPool, int index);
+static void destroy_node_pool(DataNodePool * node_pool);
+static void PoolerLoop(void);
+
+/* Signal handlers (installed in PoolManagerInit) */
+static void pooler_die(SIGNAL_ARGS);
+static void pooler_quickdie(SIGNAL_ARGS);
+
+/* Check status of connection; declared here rather than taken from a header */
+extern int	pqReadReady(PGconn * conn);
+
+/*
+ * Flags set by interrupt handlers for later service in the main loop.
+ * shutdown_requested is set by pooler_die and tested in PoolerLoop after
+ * each select() call.
+ */
+static volatile sig_atomic_t shutdown_requested = false;
+
+
+/*
+ * Split one comma-separated data node setting into NumDataNodes values.
+ *
+ * gucname is used only in error messages; value is the raw setting.
+ * Returns a palloc'd array of NumDataNodes palloc'd strings.
+ *
+ * A list with a single entry is replicated to every node.  Entries beyond
+ * NumDataNodes are ignored.  An unset setting, a syntax error, an empty
+ * list, or a list with more than one but fewer than NumDataNodes entries
+ * is reported as FATAL.
+ *
+ * NOTE(review): SplitIdentifierString applies SQL identifier rules to each
+ * entry (unquoted entries are presumably case-folded), which looks
+ * undesirable for passwords -- confirm before relying on mixed case.
+ */
+static char **
+parse_node_setting(const char *gucname, const char *value)
+{
+	char	   *rawstring;
+	List	   *elemlist;
+	ListCell   *l;
+	char	  **result;
+	int			n = 0;
+
+	if (value == NULL)
+		ereport(FATAL,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("parameter \"%s\" is not set", gucname)));
+
+	/* palloc reports out-of-memory itself, so no NULL checks are needed */
+	result = (char **) palloc(NumDataNodes * sizeof(char *));
+
+	/* Need a modifiable copy */
+	rawstring = pstrdup(value);
+
+	/* Parse string into list of identifiers */
+	if (!SplitIdentifierString(rawstring, ',', &elemlist))
+		ereport(FATAL,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("invalid list syntax for \"%s\"", gucname)));
+
+	foreach(l, elemlist)
+	{
+		/* Ignore extra entries, if any */
+		if (n == NumDataNodes)
+			break;
+		/* list cells point into rawstring, so take private copies */
+		result[n++] = pstrdup((char *) lfirst(l));
+	}
+	list_free(elemlist);
+	pfree(rawstring);
+
+	if (n == 1)
+	{
+		/* Copy all values from first */
+		for (; n < NumDataNodes; n++)
+			result[n] = pstrdup(result[0]);
+	}
+	else if (n < NumDataNodes)
+	{
+		/* Covers both an empty list and one that is too short */
+		ereport(FATAL,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("invalid list syntax for \"%s\"", gucname),
+				 errdetail("Expected 1 or %d entries but found %d.",
+						   NumDataNodes, n)));
+	}
+
+	return result;
+}
+
+
+/*
+ * Initialize internal structures and run the pooler
+ *
+ * Entry point of the pooler process: creates the pooler memory context,
+ * installs signal handlers, builds the per-node connection info from the
+ * data node settings and then enters PoolerLoop().
+ *
+ * PoolerLoop() leaves through exit() in every case except a failure to
+ * open the listening socket, so a return from this function (always 0)
+ * means the pooler could not start listening.
+ */
+int
+PoolManagerInit(void)
+{
+	char	  **hosts;
+	char	  **ports;
+	char	  **users;
+	char	  **passwords;
+	int			i;
+
+	elog(DEBUG1, "Pooler process is started: %d", getpid());
+
+	/*
+	 * Set up memory context for the pooler
+	 */
+	PoolerMemoryContext = AllocSetContextCreate(TopMemoryContext,
+												"PoolerMemoryContext",
+												ALLOCSET_DEFAULT_MINSIZE,
+												ALLOCSET_DEFAULT_INITSIZE,
+												ALLOCSET_DEFAULT_MAXSIZE);
+
+	/*
+	 * If possible, make this process a group leader, so that the postmaster
+	 * can signal any child processes too.	(pool manager probably never has any
+	 * child processes, but for consistency we make all postmaster child
+	 * processes do this.)
+	 */
+#ifdef HAVE_SETSID
+	if (setsid() < 0)
+		elog(FATAL, "setsid() failed: %m");
+#endif
+	/*
+	 * Properly accept or ignore signals the postmaster might send us
+	 */
+	pqsignal(SIGINT, pooler_die);
+	pqsignal(SIGTERM, pooler_die);
+	pqsignal(SIGQUIT, pooler_quickdie);
+	pqsignal(SIGHUP, SIG_IGN);
+	/* TODO other signal handlers */
+
+	/* We allow SIGQUIT (quickdie) at all times */
+#ifdef HAVE_SIGPROCMASK
+	sigdelset(&BlockSig, SIGQUIT);
+#else
+	BlockSig &= ~(sigmask(SIGQUIT));
+#endif
+
+	/*
+	 * Unblock signals (they were blocked when the postmaster forked us)
+	 */
+	PG_SETMASK(&UnBlockSig);
+
+	/*
+	 * Allocate pooler structures in the Pooler context.  The pooler keeps
+	 * this as its current context from here on: agents, pools and slots
+	 * created later rely on it, so the previous context is not restored.
+	 */
+	(void) MemoryContextSwitchTo(PoolerMemoryContext);
+
+	poolAgents = (PoolAgent **) palloc(MaxConnections * sizeof(PoolAgent *));
+	connectionInfos = (DataNodeConnectionInfo *) palloc(NumDataNodes * sizeof(DataNodeConnectionInfo));
+
+	/* Same order as before, so the first bad setting is the one reported */
+	hosts = parse_node_setting("data_node_hosts", DataNodeHosts);
+	ports = parse_node_setting("data_node_ports", DataNodePorts);
+	users = parse_node_setting("data_node_users", DataNodeUsers);
+	passwords = parse_node_setting("data_node_passwords", DataNodePwds);
+
+	for (i = 0; i < NumDataNodes; i++)
+	{
+		connectionInfos[i].host = hosts[i];
+		connectionInfos[i].port = ports[i];
+		connectionInfos[i].uname = users[i];
+		connectionInfos[i].password = passwords[i];
+	}
+
+	/* The strings now belong to connectionInfos; drop only the arrays */
+	pfree(hosts);
+	pfree(ports);
+	pfree(users);
+	pfree(passwords);
+
+	PoolerLoop();
+	return 0;
+}
+
+
+/*
+ * Destroy internal structures
+ *
+ * Deletes the pooler memory context if it exists and forgets the pointer.
+ * Always returns 0.
+ */
+int
+PoolManagerDestroy(void)
+{
+	/* Nothing to do when the context was never created or is already gone */
+	if (PoolerMemoryContext == NULL)
+		return 0;
+
+	MemoryContextDelete(PoolerMemoryContext);
+	PoolerMemoryContext = NULL;
+
+	return 0;
+}
+
+
+/*
+ * Get handle to pool manager
+ * Invoked from Postmaster's main loop just before forking off new session
+ * Returned PoolHandle structure will be inherited by session process
+ *
+ * Connects to the pooler through pool_connect(PoolerPort, UnixSocketDir)
+ * and returns a malloc'd handle wrapping the connected socket.  Failure to
+ * connect or to allocate is reported with ereport(ERROR).
+ */
+PoolHandle *
+GetPoolManagerHandle(void)
+{
+	PoolHandle *handle;
+	int			fdsock;
+
+	/* Connect to the pooler */
+	fdsock = pool_connect(PoolerPort, UnixSocketDir);
+	if (fdsock < 0)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_CONNECTION_FAILURE),
+				 errmsg("failed to connect to pool manager: %m")));
+		return NULL;			/* not reached; keeps the compiler quiet */
+	}
+
+	/* Allocate handle */
+	/*
+	 * XXX we may change malloc here to palloc but first ensure
+	 * the CurrentMemoryContext is properly set.
+	 * The handle allocated just before new session is forked off and
+	 * inherited by the session process. It should remain valid for all
+	 * the session lifetime.
+	 */
+	handle = (PoolHandle *) malloc(sizeof(PoolHandle));
+	if (!handle)
+	{
+		/* Release the socket first: ereport(ERROR) does not come back here */
+		close(fdsock);
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+		return NULL;			/* not reached; keeps the compiler quiet */
+	}
+
+	handle->port.fdsock = fdsock;
+	handle->port.RecvLength = 0;
+	handle->port.RecvPointer = 0;
+	handle->port.SendPointer = 0;
+
+	return handle;
+}
+
+
+/*
+ * Close handle
+ *
+ * Closes the handle's socket and releases the handle with free(); nothing
+ * is sent to the pooler first.
+ */
+void
+PoolManagerCloseHandle(PoolHandle * handle)
+{
+	int			sock = Socket(handle->port);
+
+	close(sock);
+	free(handle);
+}
+
+
+/*
+ * Create agent
+ *
+ * Accepts one pending connection on server_fd and registers a new agent
+ * for it at the end of poolAgents.  An accept() failure is logged and
+ * otherwise ignored.  The connection is refused (closed right away) when
+ * the agent array is already full.
+ */
+static void
+agent_create(void)
+{
+	int			new_fd;
+	PoolAgent  *agent;
+
+	new_fd = accept(server_fd, NULL, NULL);
+	if (new_fd < 0)
+	{
+		int			saved_errno = errno;
+
+		ereport(LOG,
+				(errcode(ERRCODE_CONNECTION_FAILURE),
+				 errmsg("pool manager failed to accept connection: %m")));
+		errno = saved_errno;
+		return;
+	}
+
+	/*
+	 * poolAgents was allocated with room for MaxConnections entries, so
+	 * storing one more agent would write past the end of the array.
+	 */
+	if (agentCount >= MaxConnections)
+	{
+		close(new_fd);
+		ereport(LOG,
+				(errcode(ERRCODE_TOO_MANY_CONNECTIONS),
+				 errmsg("pool manager rejected connection: too many agents")));
+		return;
+	}
+
+	/* Allocate agent; palloc reports out-of-memory itself */
+	agent = (PoolAgent *) palloc(sizeof(PoolAgent));
+
+	agent->port.fdsock = new_fd;
+	agent->port.RecvLength = 0;
+	agent->port.RecvPointer = 0;
+	agent->port.SendPointer = 0;
+	/* No database and no connections until a CONNECT message arrives */
+	agent->pool = NULL;
+	agent->connections = NULL;
+
+	/* Append new agent to the list */
+	poolAgents[agentCount++] = agent;
+}
+
+
+/*
+ * Associate session with specified database and respective connection pool
+ * Invoked from Session process
+ *
+ * Remembers handle in the session-global Handle for the later
+ * PoolManagerGetConnections / PoolManagerReleaseConnections calls and
+ * sends a 'c' message carrying the database name.  The nodes argument is
+ * not used here.
+ */
+void
+PoolManagerConnect(PoolHandle * handle, const char *database, List *nodes)
+{
+	/* Check the argument: the global Handle is still NULL on the first call */
+	Assert(handle);
+	Assert(database);
+
+	/* Save the handle */
+	Handle = handle;
+
+	/* Send database name followed by \0 terminator */
+	pool_putmessage(&handle->port, 'c', database, strlen(database) + 1);
+	pool_flush(&handle->port);
+}
+
+
+/*
+ * Init PoolAgent
+ *
+ * Binds the agent to the pool of the given database.  Connections held
+ * for a previously bound pool are discarded first.  The pool is created
+ * from nodes when the database has none yet.
+ */
+static void
+agent_init(PoolAgent * agent, const char *database, List *nodes)
+{
+	DatabasePool *existing;
+
+	Assert(agent);
+	Assert(database);
+	Assert(list_length(nodes) > 0);
+
+	/* A re-connect drops whatever the agent still holds */
+	if (agent->pool != NULL)
+		agent_release_connections(agent, false);
+
+	/* Look the pool up; fall back to creating it */
+	existing = find_database_pool(database);
+	agent->pool = existing;
+	if (existing == NULL)
+		agent->pool = create_database_pool(database, nodes);
+}
+
+
+/*
+ * Destroy PoolAgent
+ *
+ * Closes the agent's socket, discards the connections it holds and, when
+ * the agent is registered in poolAgents, frees it and fills its array
+ * position with the last registered agent.
+ */
+static void
+agent_destroy(PoolAgent * agent)
+{
+	int			pos;
+
+	Assert(agent);
+
+	close(Socket(agent->port));
+
+	/* Discard connections if any remaining */
+	if (agent->pool)
+		agent_release_connections(agent, false);
+
+	/* Locate the agent; at most one match is expected */
+	for (pos = 0; pos < agentCount; pos++)
+	{
+		if (poolAgents[pos] == agent)
+			break;
+	}
+
+	/* An agent that is not registered is left unfreed */
+	if (pos == agentCount)
+		return;
+
+	/* free memory */
+	if (agent->connections)
+	{
+		pfree(agent->connections);
+		agent->connections = NULL;
+	}
+	pfree(agent);
+
+	/* shrink the list and move last agent into the freed slot */
+	agentCount--;
+	if (pos < agentCount)
+		poolAgents[pos] = poolAgents[agentCount];
+}
+
+
+/*
+ * Release handle to pool manager
+ *
+ * Sends a 'd' (disconnect) message on the handle, closes its socket and
+ * frees the handle.  The session-global Handle is cleared when it refers
+ * to the handle being released.
+ */
+void
+PoolManagerDisconnect(PoolHandle * handle)
+{
+	Assert(handle);
+
+	/* Talk on the handle we were given, not on whatever Handle points to */
+	pool_putmessage(&handle->port, 'd', NULL, 0);
+	pool_flush(&handle->port);
+
+	close(Socket(handle->port));
+
+	/* Do not leave the global pointing at freed memory */
+	if (Handle == handle)
+		Handle = NULL;
+
+	/*
+	 * Handles are created with malloc() in GetPoolManagerHandle, so they
+	 * must be released with free(); pfree() is only valid for palloc'd
+	 * memory.
+	 */
+	free(handle);
+}
+
+
+/*
+ * Get pooled connections
+ *
+ * Sends a 'g' request listing the wanted nodes over the saved Handle and
+ * waits for the pooler's answer.  Returns a palloc'd array holding one
+ * file descriptor per entry of nodelist, or NULL when pool_recvfds
+ * reports failure.
+ */
+int *
+PoolManagerGetConnections(List *nodelist)
+{
+	int			nnodes = list_length(nodelist);
+	int			payload[nnodes + 1];
+	int			pos = 0;
+	int		   *fds;
+	ListCell   *cell;
+
+	Assert(Handle);
+	Assert(nnodes > 0);
+
+	/* Request body: node count, then each node number, network byte order */
+	payload[pos++] = htonl(nnodes);
+	foreach(cell, nodelist)
+		payload[pos++] = htonl(lfirst_int(cell));
+
+	pool_putmessage(&Handle->port, 'g', (char *) payload, sizeof(int) * (nnodes + 1));
+	pool_flush(&Handle->port);
+
+	/* Receive response */
+	fds = (int *) palloc(sizeof(int) * nnodes);
+	if (fds == NULL)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+	}
+
+	if (pool_recvfds(&Handle->port, fds, nnodes) != 0)
+	{
+		pfree(fds);
+		return NULL;
+	}
+
+	return fds;
+}
+
+
+/*
+ * Handle messages to agent
+ *
+ * Reads and serves every message currently available on the agent's
+ * connection, using s as the message buffer:
+ *	 'c'  CONNECT              bind the agent to a database pool
+ *	 'd'  DISCONNECT           destroy the agent
+ *	 'g'  GET CONNECTIONS      acquire connections and send back their fds
+ *	 'r'  RELEASE CONNECTIONS  return the agent's connections to the pool
+ * Any other type byte (including EOF) destroys the agent.
+ *
+ * The agent must not be used by the caller after this returns: it may
+ * have been destroyed.
+ */
+static void
+agent_handle_input(PoolAgent * agent, StringInfo s)
+{
+	int			qtype;
+	const char *database;
+	int			nodecount;
+	List	   *nodelist = NIL;
+	int		   *fds;
+	int			i;
+
+	qtype = pool_getbyte(&agent->port);
+	/*
+	 * We can have multiple messages, so handle them all
+	 */
+	for (;;)
+	{
+		switch (qtype)
+		{
+			case 'c':			/* CONNECT */
+				pool_getmessage(&agent->port, s, 0);
+				database = pq_getmsgstring(s);
+				agent_init(agent, database, GetAllNodes());
+				pq_getmsgend(s);
+				break;
+			case 'd':			/* DISCONNECT */
+				pool_getmessage(&agent->port, s, 4);
+				agent_destroy(agent);
+				pq_getmsgend(s);
+				/*
+				 * The agent has been freed and its socket closed, so we
+				 * must not fall through to the poll on agent->port below.
+				 */
+				return;
+			case 'g':			/* GET CONNECTIONS */
+				pool_getmessage(&agent->port, s, 4 * NumDataNodes + 8);
+				nodecount = pq_getmsgint(s, 4);
+				for (i = 0; i < nodecount; i++)
+				{
+					nodelist = lappend_int(nodelist, pq_getmsgint(s, 4));
+				}
+				pq_getmsgend(s);
+				/*
+				 * In case of error agent_acquire_connections will log
+				 * the error and return NULL
+				 */
+				fds = agent_acquire_connections(agent, nodelist);
+				list_free(nodelist);
+				/* Start the next 'g' message with an empty list, not a freed one */
+				nodelist = NIL;
+				pool_sendfds(&agent->port, fds, fds ? nodecount : 0);
+				if (fds)
+					pfree(fds);
+				break;
+			case 'r':			/* RELEASE CONNECTIONS */
+				pool_getmessage(&agent->port, s, 4);
+				pq_getmsgend(s);
+				agent_release_connections(agent, true);
+				break;
+			default:			/* EOF or protocol violation */
+				agent_destroy(agent);
+				return;
+		}
+		/* avoid reading from connection */
+		if ((qtype = pool_pollbyte(&agent->port)) == EOF)
+			break;
+	}
+}
+
+
+/*
+ * acquire connection
+ *
+ * Makes sure the agent holds a connection to every node in nodelist
+ * (1-based node numbers), taking missing ones from the agent's database
+ * pool.
+ *
+ * Returns a palloc'd array with the socket of each requested node, in
+ * nodelist order; the caller frees it.  Returns NULL when the agent has
+ * no pool yet, when a node number is out of range, or when a connection
+ * cannot be acquired.  Connections obtained before a failure stay with
+ * the agent.
+ */
+static int *
+agent_acquire_connections(PoolAgent * agent, List *nodelist)
+{
+	int			i;
+	int		   *result;
+	ListCell   *nodelist_item;
+
+	Assert(agent);
+	Assert(nodelist);
+
+	/* A GET request that arrives before CONNECT finds no pool to use */
+	if (agent->pool == NULL)
+	{
+		ereport(LOG,
+				(errcode(ERRCODE_PROTOCOL_VIOLATION),
+				 errmsg("pool agent is not connected to a database")));
+		return NULL;
+	}
+
+	/*
+	 * Node numbers come straight from the request and are used as array
+	 * indexes below, so reject the request before touching any state.
+	 */
+	foreach(nodelist_item, nodelist)
+	{
+		int			node = lfirst_int(nodelist_item);
+
+		if (node < 1 || node > NumDataNodes)
+		{
+			ereport(LOG,
+					(errcode(ERRCODE_PROTOCOL_VIOLATION),
+					 errmsg("invalid data node number %d", node)));
+			return NULL;
+		}
+	}
+
+	/* Allocate memory; palloc reports out-of-memory itself */
+	result = (int *) palloc(list_length(nodelist) * sizeof(int));
+
+	/* initialize connection if it is not initialized yet (all slots NULL) */
+	if (!agent->connections)
+		agent->connections = (DataNodePoolSlot **) palloc0(NumDataNodes * sizeof(DataNodePoolSlot *));
+
+	/* Initialize result */
+	i = 0;
+	foreach(nodelist_item, nodelist)
+	{
+		int			node = lfirst_int(nodelist_item);
+
+		/* Acquire from the pool if none */
+		if (agent->connections[node - 1] == NULL)
+		{
+			DataNodePoolSlot *slot = acquire_connection(agent->pool, node);
+
+			/* Handle failure */
+			if (slot == NULL)
+			{
+				pfree(result);
+				return NULL;
+			}
+
+			/* Store in the descriptor */
+			agent->connections[node - 1] = slot;
+		}
+
+		result[i++] = PQsocket((PGconn *) agent->connections[node - 1]->conn);
+	}
+
+	return result;
+}
+
+
+/*
+ * Return connections back to the pool
+ *
+ * Session side: sends an 'r' message with an empty body over the handle
+ * saved by PoolManagerConnect and flushes it.  No reply is read here.
+ */
+void
+PoolManagerReleaseConnections()
+{
+	Assert(Handle);
+
+	pool_putmessage(&Handle->port, 'r', NULL, 0);
+	pool_flush(&Handle->port);
+}
+
+
+/*
+ * Release connections
+ *
+ * Hands every connection the agent holds to release_connection() and
+ * clears the agent's per-node entries.  clean is passed through
+ * unchanged: true puts the connection back on the free list, false
+ * closes it.
+ */
+static void
+agent_release_connections(PoolAgent * agent, bool clean)
+{
+	DataNodePoolSlot **held = agent->connections;
+	int			node;
+
+	/* The array is only allocated once the agent has acquired something */
+	if (held == NULL)
+		return;
+
+	/* Enumerate connections */
+	for (node = 0; node < NumDataNodes; node++)
+	{
+		if (held[node] != NULL)
+			release_connection(agent->pool, held[node], node, clean);
+		held[node] = NULL;
+	}
+}
+
+
+/*
+ * Create new empty pool for a database and insert into the list
+ *
+ * database is copied; nodes lists the 1-based numbers of the data nodes
+ * whose node pools are set up (and filled by grow_pool) right away.
+ *
+ * Returns the new pool, or the existing one if the database already has a
+ * pool.  Out-of-memory is reported by palloc/pstrdup themselves, so this
+ * function never returns NULL.
+ */
+static DatabasePool *
+create_database_pool(const char *database, List *nodes)
+{
+	DatabasePool *databasePool;
+	ListCell   *l;
+
+	Assert(list_length(nodes) > 0);
+
+	/* check if exist */
+	databasePool = find_database_pool(database);
+	if (databasePool)
+	{
+		/* already exist */
+		return databasePool;
+	}
+
+	/* Allocate memory */
+	databasePool = (DatabasePool *) palloc(sizeof(DatabasePool));
+
+	/* Copy the database name */
+	databasePool->database = pstrdup(database);
+
+	/* Init next reference */
+	databasePool->next = NULL;
+
+	/* Init data node pools: one (initially NULL) pointer per data node */
+	databasePool->nodePools = (DataNodePool **) palloc0(NumDataNodes * sizeof(DataNodePool *));
+
+	foreach(l, nodes)
+	{
+		int			nodeid = lfirst_int(l);
+
+		/* grow_pool takes a 0-based index */
+		grow_pool(databasePool, nodeid - 1);
+	}
+
+	/* Insert into the list */
+	insert_database_pool(databasePool);
+
+	return databasePool;
+}
+
+
+/*
+ * Destroy the pool and free memory
+ *
+ * Unlinks the pool of the named database from the list, destroys each of
+ * its node pools and frees the pool.  Returns 1 when a pool was found and
+ * destroyed, 0 when the database has no pool.
+ */
+static int
+destroy_database_pool(const char *database)
+{
+	DatabasePool *victim;
+	int			node;
+
+	/* Delete from the list */
+	victim = remove_database_pool(database);
+	if (victim == NULL)
+		return 0;
+
+	if (victim->nodePools != NULL)
+	{
+		for (node = 0; node < NumDataNodes; node++)
+		{
+			DataNodePool *nodePool = victim->nodePools[node];
+
+			if (nodePool != NULL)
+				destroy_node_pool(nodePool);
+		}
+		pfree(victim->nodePools);
+	}
+
+	/* free allocated memory */
+	pfree(victim->database);
+	pfree(victim);
+	return 1;
+}
+
+
+/*
+ * Insert new database pool to the list
+ *
+ * The pool becomes the new head of databasePools.
+ */
+static void
+insert_database_pool(DatabasePool * databasePool)
+{
+	Assert(databasePool);
+
+	/*
+	 * Chain the current head behind the new entry.  With an empty list the
+	 * head is NULL, which is exactly the tail marker the new entry needs.
+	 */
+	databasePool->next = databasePools;
+
+	/* Update head pointer */
+	databasePools = databasePool;
+}
+
+
+/*
+ * Find pool for specified database in the list
+ *
+ * Returns the first pool whose database name equals the argument, or
+ * NULL when there is none.
+ */
+static DatabasePool *
+find_database_pool(const char *database)
+{
+	DatabasePool *cursor;
+
+	/* Scan the list */
+	for (cursor = databasePools; cursor != NULL; cursor = cursor->next)
+	{
+		if (strcmp(database, cursor->database) == 0)
+			return cursor;
+	}
+
+	return NULL;
+}
+
+
+/*
+ * Remove pool for specified database from the list
+ *
+ * Unlinks the first pool whose database name equals the argument and
+ * returns it with its next pointer cleared.  The pool itself is not
+ * freed.  Returns NULL when the database has no pool.
+ */
+static DatabasePool *
+remove_database_pool(const char *database)
+{
+	DatabasePool **link;
+
+	/*
+	 * Walk the pointers that lead to each entry (the list head first, then
+	 * each entry's next field), so unlinking is a single assignment
+	 * whether the match is the head or not.
+	 */
+	for (link = &databasePools; *link != NULL; link = &(*link)->next)
+	{
+		DatabasePool *candidate = *link;
+
+		if (strcmp(database, candidate->database) == 0)
+		{
+			/* Remove entry from chain or update head */
+			*link = candidate->next;
+			candidate->next = NULL;
+			return candidate;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Acquire connection
+ *
+ * Takes a free connection to the given node (1-based node number) out of
+ * the database pool, growing the node pool first when it has nothing
+ * free.
+ *
+ * Each candidate is checked with pqReadReady(): a connection that reports
+ * an error or has unexpected data pending is closed and the pool is
+ * refilled before trying the next one.
+ *
+ * Returns the slot, or NULL (after logging) when no usable connection is
+ * available.
+ */
+static DataNodePoolSlot *
+acquire_connection(DatabasePool * dbPool, int node)
+{
+	DataNodePool *nodePool;
+
+	Assert(dbPool);
+	/* node is 1-based here; it is converted to an index with node - 1 */
+	Assert(1 <= node && node <= NumDataNodes);
+
+	/* Find referenced node pool */
+	nodePool = dbPool->nodePools[node - 1];
+	if (nodePool == NULL || nodePool->freeSize == 0)
+	{
+		grow_pool(dbPool, node - 1);
+		nodePool = dbPool->nodePools[node - 1];
+	}
+
+	/* Check available connections */
+	while (nodePool && nodePool->freeSize > 0)
+	{
+		DataNodePoolSlot *slot;
+		int			poll_result;
+
+		slot = nodePool->slot[--(nodePool->freeSize)];
+
+		/* Make sure connection is ok; retry when the check was interrupted */
+		do
+		{
+			poll_result = pqReadReady(slot->conn);
+		} while (poll_result < 0 && (errno == EAGAIN || errno == EINTR));
+
+		if (poll_result == 0)
+		{
+			/* ok, no data */
+			return slot;
+		}
+
+		if (poll_result < 0)
+			elog(WARNING, "Error in checking connection, errno = %d", errno);
+		else
+			elog(WARNING, "Unexpected data on connection, cleaning.");
+
+		/*
+		 * The slot is freed here and must never be handed out.  If the
+		 * pool cannot be refilled the loop ends and we fall through to
+		 * the failure return instead of returning this stale pointer.
+		 */
+		destroy_slot(slot);
+		/* Decrement current max pool size */
+		(nodePool->size)--;
+		/* Ensure we are not below minimum size */
+		grow_pool(dbPool, node - 1);
+	}
+
+	/* report problem */
+	ereport(LOG,
+			(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+			 errmsg("connection pool is empty")));
+	return NULL;
+}
+
+
+/*
+ * release connection from specified pool and slot
+ *
+ * index is the 0-based position of the node pool inside dbPool.  With
+ * clean set, the slot goes back on the node pool's free list.  Otherwise
+ * the connection is closed, the pool size is reduced and grow_pool() is
+ * asked to restore the minimum size.
+ */
+static void
+release_connection(DatabasePool * dbPool, DataNodePoolSlot * slot, int index, bool clean)
+{
+	DataNodePool *target;
+
+	Assert(dbPool);
+	Assert(slot);
+	Assert(0 <= index && index < NumDataNodes);
+
+	/* Find referenced node pool */
+	target = dbPool->nodePools[index];
+	if (target == NULL)
+	{
+		/* report problem */
+		ereport(ERROR,
+				(errcode(ERRCODE_INTERNAL_ERROR),
+				 errmsg("database does not use node %d", (index + 1))));
+		return;
+	}
+
+	/* Discard: close the connection and top the pool up again */
+	if (!clean)
+	{
+		elog(DEBUG1, "Cleaning up connection from pool %s, closing", target->connstr);
+		destroy_slot(slot);
+		/* Decrement pool size */
+		(target->size)--;
+		/* Ensure we are not below minimum size */
+		grow_pool(dbPool, index);
+		return;
+	}
+
+	/* Return: push the slot onto the free list */
+	target->slot[target->freeSize] = slot;
+	target->freeSize++;
+}
+
+
+/*
+ * Increase database pool size
+ *
+ * index is the 0-based position of the node pool inside dbPool.  The node
+ * pool is created on first use, with a slot array of MaxPoolSize entries.
+ * Connections are then opened until the pool holds at least MinPoolSize
+ * of them and, if none of them is free, until one becomes available or
+ * MaxPoolSize is reached.
+ *
+ * A failed connection attempt is logged and stops the growth; the caller
+ * has to check freeSize to find out whether a connection is available.
+ */
+static void
+grow_pool(DatabasePool * dbPool, int index)
+{
+	DataNodePool *nodePool;
+
+	Assert(dbPool);
+	Assert(0 <= index && index < NumDataNodes);
+
+	/* Find referenced node pool */
+	nodePool = dbPool->nodePools[index];
+	if (!nodePool)
+	{
+		/* Allocate new DBNode Pool; palloc reports out-of-memory itself */
+		nodePool = (DataNodePool *) palloc(sizeof(DataNodePool));
+
+		/* initialize it */
+		nodePool->connstr = DataNodeConnStr(
+											connectionInfos[index].host,
+											connectionInfos[index].port,
+											dbPool->database,
+											connectionInfos[index].uname,
+											connectionInfos[index].password);
+
+		if (!nodePool->connstr)
+		{
+			pfree(nodePool);
+			ereport(ERROR,
+					(errcode(ERRCODE_OUT_OF_MEMORY),
+					 errmsg("out of memory")));
+		}
+
+		/* Zero-filled, so every slot pointer starts out NULL */
+		nodePool->slot = (DataNodePoolSlot **) palloc0(MaxPoolSize * sizeof(DataNodePoolSlot *));
+		nodePool->freeSize = 0;
+		nodePool->size = 0;
+
+		/* and insert into the array */
+		dbPool->nodePools[index] = nodePool;
+	}
+
+	while (nodePool->size < MinPoolSize || (nodePool->freeSize == 0 && nodePool->size < MaxPoolSize))
+	{
+		DataNodePoolSlot *slot;
+
+		/* Allocate new slot */
+		slot = (DataNodePoolSlot *) palloc(sizeof(DataNodePoolSlot));
+
+		/* Establish connection */
+		slot->conn = DataNodeConnect(nodePool->connstr);
+		if (!DataNodeConnected(slot->conn))
+		{
+			destroy_slot(slot);
+			ereport(LOG,
+					(errcode(ERRCODE_CONNECTION_FAILURE),
+					 errmsg("failed to connect to data node")));
+			break;
+		}
+
+		/* Insert at the end of the pool */
+		nodePool->slot[(nodePool->freeSize)++] = slot;
+
+		/* Increase count of pool size */
+		(nodePool->size)++;
+		elog(DEBUG1, "Pooler: increased pool size to %d for pool %s",
+			 nodePool->size,
+			 nodePool->connstr);
+	}
+}
+
+
+/*
+ * Destroy pool slot
+ *
+ * Passes the slot's connection to DataNodeClose() and frees the slot.
+ * The slot must not be used afterwards; callers adjust the pool counters
+ * themselves.
+ */
+static void
+destroy_slot(DataNodePoolSlot * slot)
+{
+	DataNodeClose(slot->conn);
+	pfree(slot);
+}
+
+
+/*
+ * Destroy node pool
+ *
+ * Closes every connection on the free list and frees the slot array, the
+ * connection string and the node pool itself.  The caller must not touch
+ * node_pool afterwards.
+ */
+static void
+destroy_node_pool(DataNodePool * node_pool)
+{
+	int			i;
+
+	/*
+	 * At this point all agents using connections from this pool should
+	 * already be closed.  If they are not, the data node connections
+	 * assigned to them remain open and keep consuming data node resources.
+	 * This is believed not to happen because a pool is only destroyed on
+	 * coordinator shutdown, but be careful when changing things.
+	 */
+	elog(DEBUG1, "About to destroy node pool %s, current size is %d, %d connections are in use",
+		 node_pool->connstr ? node_pool->connstr : "(null)",
+		 node_pool->size, node_pool->size - node_pool->freeSize);
+
+	if (node_pool->slot)
+	{
+		/* Only free connections are reachable from here */
+		for (i = 0; i < node_pool->freeSize; i++)
+			destroy_slot(node_pool->slot[i]);
+		pfree(node_pool->slot);
+	}
+
+	if (node_pool->connstr)
+		pfree(node_pool->connstr);
+
+	/* The struct itself was allocated by grow_pool and is ours to free */
+	pfree(node_pool);
+}
+
+
+/*
+ * Main handling loop
+ *
+ * Opens the pooler's listening socket and then serves forever: waits in
+ * select() on the listening socket and on every agent socket, dispatches
+ * agent input and accepts new agents.
+ *
+ * Returns only when the listening socket cannot be opened.  Otherwise the
+ * process leaves through exit(): with status 0 after a shutdown request
+ * (all agents and database pools are destroyed first), or with status 1
+ * when the postmaster is found dead.
+ */
+static void
+PoolerLoop(void)
+{
+	StringInfoData input_message;
+
+	server_fd = pool_listen(PoolerPort, UnixSocketDir);
+	if (server_fd == -1)
+	{
+		/* Without this message the pooler would vanish without a trace */
+		ereport(LOG,
+				(errcode(ERRCODE_CONNECTION_FAILURE),
+				 errmsg("pool manager could not listen on port %d", PoolerPort)));
+		return;
+	}
+	initStringInfo(&input_message);
+	for (;;)
+	{
+		int			nfds;
+		fd_set		rfds;
+		int			retval;
+		int			i;
+
+		/*
+		 * Emergency bailout if postmaster has died.  This is to avoid the
+		 * necessity for manual cleanup of all postmaster children.
+		 */
+		if (!PostmasterIsAlive(true))
+			exit(1);
+
+		/* watch for incoming connections */
+		FD_ZERO(&rfds);
+		FD_SET(server_fd, &rfds);
+
+		nfds = server_fd;
+
+		/* watch for incoming messages */
+		for (i = 0; i < agentCount; i++)
+		{
+			PoolAgent  *agent = poolAgents[i];
+			int			sockfd = Socket(agent->port);
+
+			FD_SET(sockfd, &rfds);
+
+			nfds = Max(nfds, sockfd);
+		}
+
+		/* wait for event */
+		retval = select(nfds + 1, &rfds, NULL, NULL, NULL);
+
+		/* A signal handler may have asked for shutdown while we waited */
+		if (shutdown_requested)
+		{
+			for (i = agentCount - 1; i >= 0; i--)
+			{
+				PoolAgent  *agent = poolAgents[i];
+
+				agent_destroy(agent);
+			}
+			/* Stop if a pool cannot be removed, rather than loop forever */
+			while (databasePools)
+				if (destroy_database_pool(databasePools->database) == 0)
+					break;
+			close(server_fd);
+			exit(0);
+		}
+		if (retval > 0)
+		{
+			/*
+			 * An agent may be removed from the array while processing, in
+			 * which case the last agent is moved into its place.  Scanning
+			 * downward means the moved agent has already been visited.
+			 */
+			for (i = agentCount - 1; i >= 0; i--)
+			{
+				PoolAgent  *agent = poolAgents[i];
+				int			sockfd = Socket(agent->port);
+
+				if (FD_ISSET(sockfd, &rfds))
+					agent_handle_input(agent, &input_message);
+			}
+			if (FD_ISSET(server_fd, &rfds))
+				agent_create();
+		}
+	}
+}
+
+
+/*
+ * Signal handler installed for SIGINT and SIGTERM.
+ *
+ * Only records the request; PoolerLoop notices the flag after its
+ * select() call returns and performs the actual shutdown.
+ */
+static void
+pooler_die(SIGNAL_ARGS)
+{
+	shutdown_requested = true;
+}
+
+
+/*
+ * Signal handler installed for SIGQUIT.
+ *
+ * Applies the BlockSig signal mask and terminates the process at once
+ * with exit status 2.  Agents and pools are not destroyed first.
+ */
+static void
+pooler_quickdie(SIGNAL_ARGS)
+{
+	PG_SETMASK(&BlockSig);
+	exit(2);
+}
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 3dbf36a6cf..0dd252cb62 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -34,6 +34,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  *
  * IDENTIFICATION
@@ -102,6 +103,13 @@
 #include "libpq/libpq.h"
 #include "libpq/pqsignal.h"
 #include "miscadmin.h"
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+/* COORD */
+#include "pgxc/locator.h"
+#include "pgxc/poolmgr.h"
+#include "access/gtm.h"
+#endif
 #include "pgstat.h"
 #include "postmaster/autovacuum.h"
 #include "postmaster/fork_process.h"
@@ -204,6 +212,9 @@ char	   *bonjour_name;
 
 /* PIDs of special child processes; 0 when not running */
 static pid_t StartupPID = 0,
+#ifdef PGXC /* PGXC_COORD */
+			PgPoolerPID = 0,
+#endif /* PGXC_COORD */
 			BgWriterPID = 0,
 			WalWriterPID = 0,
 			AutoVacPID = 0,
@@ -442,6 +453,12 @@ static void ShmemBackendArrayAdd(Backend *bn);
 static void ShmemBackendArrayRemove(Backend *bn);
 #endif   /* EXEC_BACKEND */
 
+#ifdef PGXC /* PGXC_COORD */
+bool isPGXCCoordinator = false;
+bool isPGXCDataNode = false;
+#define StartPoolManager()		StartChildProcess(PoolerProcess)
+#endif
+
 #define StartupDataBase()		StartChildProcess(StartupProcess)
 #define StartBackgroundWriter() StartChildProcess(BgWriterProcess)
 #define StartWalWriter()		StartChildProcess(WalWriterProcess)
@@ -461,6 +478,9 @@ PostmasterMain(int argc, char *argv[])
 	int			status;
 	char	   *userDoption = NULL;
 	int			i;
+#ifdef PGXC /* PGXC_COORD */
+	MemoryContext 		oldcontext;
+#endif
 
 	MyProcPid = PostmasterPid = getpid();
 
@@ -506,7 +526,11 @@ PostmasterMain(int argc, char *argv[])
 	 * tcop/postgres.c (the option sets should not conflict) and with the
 	 * common help() function in main/main.c.
 	 */
+#ifdef PGXC
+	while ((opt = getopt(argc, argv, "A:B:Cc:D:d:EeFf:h:ijk:lN:nOo:Pp:r:S:sTt:W:X-:")) != -1)
+#else
 	while ((opt = getopt(argc, argv, "A:B:c:D:d:EeFf:h:ijk:lN:nOo:Pp:r:S:sTt:W:-:")) != -1)
+#endif
 	{
 		switch (opt)
 		{
@@ -517,6 +541,11 @@ PostmasterMain(int argc, char *argv[])
 			case 'B':
 				SetConfigOption("shared_buffers", optarg, PGC_POSTMASTER, PGC_S_ARGV);
 				break;
+#ifdef PGXC
+			case 'C':
+				isPGXCCoordinator = true;
+				break;
+#endif 
 
 			case 'D':
 				userDoption = optarg;
@@ -638,6 +667,11 @@ PostmasterMain(int argc, char *argv[])
 				SetConfigOption("post_auth_delay", optarg, PGC_POSTMASTER, PGC_S_ARGV);
 				break;
 
+#ifdef PGXC
+			case 'X':
+				isPGXCDataNode = true;
+				break;
+#endif 
 			case 'c':
 			case '-':
 				{
@@ -673,6 +707,14 @@ PostmasterMain(int argc, char *argv[])
 		}
 	}
 
+#ifdef PGXC
+	if (!IS_PGXC_COORDINATOR && !IS_PGXC_DATANODE)
+	{
+		write_stderr("%s: PG-XC: must start as either a Coordinator (-C) or Data Node (-X)\n",
+					 progname);
+		ExitPostmaster(1);
+	}
+#endif
 	/*
 	 * Postmaster accepts no non-option switch arguments.
 	 */
@@ -1037,6 +1079,20 @@ PostmasterMain(int argc, char *argv[])
 	Assert(StartupPID != 0);
 	pmState = PM_STARTUP;
 
+#ifdef PGXC /* PGXC_COORD */
+	if (IS_PGXC_COORDINATOR)
+	{
+		oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+	
+		/*
+		 * Initialize the Data Node connection pool
+		 */
+		PgPoolerPID = StartPoolManager();
+
+		MemoryContextSwitchTo(oldcontext);
+	}
+#endif
+
 	status = ServerLoop();
 
 	/*
@@ -1393,6 +1449,11 @@ ServerLoop(void)
 		if (PgStatPID == 0 && pmState == PM_RUN)
 			PgStatPID = pgstat_start();
 
+#ifdef PGXC /* PGXC_COORD */
+		/* If we have lost the pooler, try to start a new one */
+		if (IS_PGXC_COORDINATOR && PgPoolerPID == 0 && pmState == PM_RUN)
+			PgPoolerPID = StartPoolManager();
+#endif
 		/*
 		 * Touch the socket and lock file every 58 minutes, to ensure that
 		 * they are not removed by overzealous /tmp-cleaning tasks.  We assume
@@ -1990,6 +2051,10 @@ SIGHUP_handler(SIGNAL_ARGS)
 		SignalChildren(SIGHUP);
 		if (StartupPID != 0)
 			signal_child(StartupPID, SIGHUP);
+#ifdef PGXC /* PGXC_COORD */
+		if (IS_PGXC_COORDINATOR && PgPoolerPID != 0)
+			signal_child(PgPoolerPID, SIGHUP);
+#endif
 		if (BgWriterPID != 0)
 			signal_child(BgWriterPID, SIGHUP);
 		if (WalWriterPID != 0)
@@ -2062,6 +2127,11 @@ pmdie(SIGNAL_ARGS)
 				/* and the walwriter too */
 				if (WalWriterPID != 0)
 					signal_child(WalWriterPID, SIGTERM);
+#ifdef PGXC /* PGXC_COORD */
+				/* and the pool manager too */
+				if (IS_PGXC_COORDINATOR && PgPoolerPID != 0)
+					signal_child(PgPoolerPID, SIGTERM);
+#endif
 				pmState = PM_WAIT_BACKUP;
 			}
 
@@ -2108,6 +2178,11 @@ pmdie(SIGNAL_ARGS)
 				/* and the walwriter too */
 				if (WalWriterPID != 0)
 					signal_child(WalWriterPID, SIGTERM);
+#ifdef PGXC /* PGXC_COORD */
+				/* and the pool manager too */
+				if (IS_PGXC_COORDINATOR && PgPoolerPID != 0)
+					signal_child(PgPoolerPID, SIGTERM);
+#endif
 				pmState = PM_WAIT_BACKENDS;
 			}
 
@@ -2131,6 +2206,10 @@ pmdie(SIGNAL_ARGS)
 			SignalChildren(SIGQUIT);
 			if (StartupPID != 0)
 				signal_child(StartupPID, SIGQUIT);
+#ifdef PGXC /* PGXC_COORD */
+			if (IS_PGXC_COORDINATOR && PgPoolerPID != 0)
+				signal_child(PgPoolerPID, SIGQUIT);
+#endif
 			if (BgWriterPID != 0)
 				signal_child(BgWriterPID, SIGQUIT);
 			if (WalWriterPID != 0)
@@ -2266,6 +2345,10 @@ reaper(SIGNAL_ARGS)
 				PgArchPID = pgarch_start();
 			if (PgStatPID == 0)
 				PgStatPID = pgstat_start();
+#ifdef PGXC /* PGXC_COORD */
+			if (IS_PGXC_COORDINATOR && PgPoolerPID == 0)
+				PgPoolerPID = StartPoolManager();
+#endif
 
 			/* at this point we are really open for business */
 			ereport(LOG,
@@ -2403,6 +2486,21 @@ reaper(SIGNAL_ARGS)
 			continue;
 		}
 
+#ifdef PGXC /* PGXC_COORD */
+		/* 
+		 * Was it the pool manager?  TODO decide how to handle 
+		 * Probably we should restart the system
+		 */
+		if (IS_PGXC_COORDINATOR && pid == PgPoolerPID)
+		{
+			PgPoolerPID = 0;
+			if (!EXIT_STATUS_0(exitstatus))
+				HandleChildCrash(pid, exitstatus,
+								 _("pool manager process"));
+			continue;
+		}
+#endif
+
 		/*
 		 * Else do standard backend child cleanup.
 		 */
@@ -2594,6 +2692,23 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
 		signal_child(AutoVacPID, (SendStop ? SIGSTOP : SIGQUIT));
 	}
 
+#ifdef PGXC /* PGXC_COORD */
+	/* Take care of the pool manager too */
+	if (IS_PGXC_COORDINATOR)
+	{
+		if (pid == PgPoolerPID)
+			PgPoolerPID = 0;
+		else if (PgPoolerPID != 0 && !FatalError)
+		{
+			ereport(DEBUG2,
+				(errmsg_internal("sending %s to process %d",
+								 (SendStop ? "SIGSTOP" : "SIGQUIT"),
+								 (int) PgPoolerPID)));
+			signal_child(PgPoolerPID, (SendStop ? SIGSTOP : SIGQUIT));
+		}
+	}
+#endif
+
 	/*
 	 * Force a power-cycle of the pgarch process too.  (This isn't absolutely
 	 * necessary, but it seems like a good idea for robustness, and it
@@ -2724,6 +2839,9 @@ PostmasterStateMachine(void)
 		 */
 		if (CountChildren() == 0 &&
 			StartupPID == 0 &&
+#ifdef PGXC /* PGXC_COORD */
+			PgPoolerPID == 0 &&
+#endif
 			(BgWriterPID == 0 || !FatalError) &&
 			WalWriterPID == 0 &&
 			AutoVacPID == 0)
@@ -2798,6 +2916,9 @@ PostmasterStateMachine(void)
 			PgArchPID == 0 && PgStatPID == 0)
 		{
 			/* These other guys should be dead already */
+#ifdef PGXC /* PGXC_COORD */
+			Assert(PgPoolerPID == 0);
+#endif
 			Assert(StartupPID == 0);
 			Assert(BgWriterPID == 0);
 			Assert(WalWriterPID == 0);
@@ -2942,6 +3063,9 @@ BackendStartup(Port *port)
 {
 	Backend    *bn;				/* for backend cleanup */
 	pid_t		pid;
+#ifdef PGXC /* PGXC_COORD */
+	PoolHandle *pool_handle;
+#endif 
 
 	/*
 	 * Create backend data structure.  Better before the fork() so we can
@@ -2977,12 +3101,31 @@ BackendStartup(Port *port)
 	else
 		bn->child_slot = 0;
 
+#ifdef PGXC /* PGXC_COORD */
+	if (IS_PGXC_COORDINATOR)
+	{
+		pool_handle = GetPoolManagerHandle();
+		if (pool_handle == NULL)
+		{
+			ereport(ERROR,
+				(errcode(ERRCODE_IO_ERROR),
+				 errmsg("Can not connect to pool manager")));
+			return STATUS_ERROR;
+		}
+	}
+#endif 
+
+
 #ifdef EXEC_BACKEND
 	pid = backend_forkexec(port);
 #else							/* !EXEC_BACKEND */
 	pid = fork_process();
 	if (pid == 0)				/* child */
 	{
+		//// FOR DEBUG
+		printf("The session started: %d\n", getpid());
+		//sleep(60);
+		//// FOR DEBUG
 		free(bn);
 
 		/*
@@ -3005,11 +3148,25 @@ BackendStartup(Port *port)
 		/* Perform additional initialization and client authentication */
 		BackendInitialize(port);
 
+#ifdef PGXC /* PGXC_COORD */
+		if (IS_PGXC_COORDINATOR)
+		{
+			/* User is authenticated and dbname is known at this point */
+			PoolManagerConnect(pool_handle, port->database_name, GetAllNodes());
+			InitGTM();
+		}
+#endif 
+
 		/* And run the backend */
 		proc_exit(BackendRun(port));
 	}
 #endif   /* EXEC_BACKEND */
 
+#ifdef PGXC /* PGXC_COORD */
+	if (IS_PGXC_COORDINATOR)
+		PoolManagerCloseHandle(pool_handle);
+#endif 
+
 	if (pid < 0)
 	{
 		/* in parent, fork failed */
@@ -4236,6 +4393,12 @@ StartChildProcess(AuxProcType type)
 		errno = save_errno;
 		switch (type)
 		{
+#ifdef PGXC /* PGXC_COORD */
+			case PoolerProcess:
+				ereport(LOG,
+						(errmsg("could not fork pool manager process: %m")));
+				break;
+#endif
 			case StartupProcess:
 				ereport(LOG,
 						(errmsg("could not fork startup process: %m")));
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index e71b95c826..8ce8be820e 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -20,6 +20,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  *
  * IDENTIFICATION
@@ -38,6 +39,12 @@
 #include "miscadmin.h"
 #include "storage/procarray.h"
 #include "utils/snapmgr.h"
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+#include "access/gtm.h"
+/* PGXC_DATANODE */
+#include "postmaster/autovacuum.h"
+#endif
 
 
 /* Our shared memory area */
@@ -90,6 +97,27 @@ static void DisplayXidCache(void);
 #define xc_slow_answer_inc()		((void) 0)
 #endif   /* XIDCACHE_DEBUG */
 
+#ifdef PGXC  /* PGXC_DATANODE */
+/*
+ * Origin of the global snapshot values cached in this file (gxmin, gxmax,
+ * gxcnt, gxip).  The current origin is kept in snapshot_source.
+ *
+ * NOTE(review): SNAPSHOT_LOCAL is not assigned anywhere in the part of the
+ * file reviewed here -- confirm whether it is still used.
+ */
+typedef enum
+{
+	SNAPSHOT_UNDEFINED,   /* Coordinator has not sent snapshot or not yet connected */
+	SNAPSHOT_LOCAL,       /* Coordinator has instructed data node to build up snapshot from the local procarray */
+	SNAPSHOT_COORDINATOR, /* Coordinator has sent snapshot data */
+	SNAPSHOT_DIRECT       /* Data Node obtained directly from GTM */
+} SnapshotSource;
+
+void SetGlobalSnapshotData(int xmin, int xmax, int xcnt, int *xip);
+void UnsetGlobalSnapshotData(void);
+static bool GetSnapshotDataDataNode(Snapshot snapshot);
+static bool GetSnapshotDataCoordinator(Snapshot snapshot);
+/* Global snapshot data */
+static SnapshotSource snapshot_source = SNAPSHOT_UNDEFINED;
+static int gxmin = InvalidTransactionId;
+static int gxmax = InvalidTransactionId;
+static int gxcnt = 0;
+static int *gxip = NULL;
+#endif
+
 
 /*
  * Report shared-memory space needed by CreateSharedProcArray.
@@ -682,6 +710,46 @@ GetSnapshotData(Snapshot snapshot)
 	int			count = 0;
 	int			subcount = 0;
 
+
+#ifdef PGXC  /* PGXC_DATANODE */
+	/*
+	 * The typical case is that the coordinator passes the snapshot down to
+	 * the data nodes, while it obtains the snapshot itself from GTM.
+	 * The data nodes may however connect directly to GTM themselves to obtain
+	 * XID and snapshot information for autovacuum worker processes.
+ 	 */
+	if (IS_PGXC_DATANODE)
+	{
+		if (GetSnapshotDataDataNode(snapshot))
+			return snapshot;
+		/* else fallthrough */
+	} else if (IS_PGXC_COORDINATOR)
+	{
+		if (GetSnapshotDataCoordinator(snapshot))
+			return snapshot;
+		/* else fallthrough */
+	}
+
+	/* If we have no snapshot, we will use a local one.
+	 * If we are in normal mode, we output a warning though.
+	 * We currently fallback and use a local one at initdb time,
+	 * as well as when a new connection occurs.
+	 * IsPostmasterEnvironment - checks for initdb
+	 * IsNormalProcessingMode() - checks for new connections
+	 */
+	 if (IS_PGXC_DATANODE && snapshot_source == SNAPSHOT_UNDEFINED 
+	 		&& IsPostmasterEnvironment && IsNormalProcessingMode())
+	 {
+	 	elog(WARNING, "Do not have a GTM snapshot available");
+	 }
+#endif
+      
+	/*
+	 * Fall back to the standard routine: calculate the snapshot from the
+	 * local proc array if no global snapshot was obtained above
+	 */
+
+
 	Assert(snapshot != NULL);
 
 	/*
@@ -828,6 +896,9 @@ GetSnapshotData(Snapshot snapshot)
 
 	snapshot->curcid = GetCurrentCommandId(false);
 
+#ifdef PGXC
+	elog(DEBUG1, "Local snapshot is built, xmin: %d, xmax: %d, xcnt: %d, RecentGlobalXmin: %d", xmin, xmax, count, globalxmin);
+#endif 
 	/*
 	 * This is a new snapshot, so set both refcounts are zero, and mark it as
 	 * not copied in persistent memory.
@@ -1400,3 +1471,262 @@ DisplayXidCache(void)
 }
 
 #endif   /* XIDCACHE_DEBUG */
+
+
+#ifdef PGXC  
+/*
+ * Remember a snapshot handed down by the coordinator.
+ *
+ * xmin, xmax and xcnt are stored as given.  xip is adopted, not copied: the
+ * pointer is kept in gxip and later released with free(), so the caller must
+ * pass a malloc'ed array (or NULL) and must not free it afterwards.  An array
+ * kept from an earlier call is freed first.  The snapshot source becomes
+ * SNAPSHOT_COORDINATOR.
+ */
+void
+SetGlobalSnapshotData(int xmin, int xmax, int xcnt, int *xip)
+{
+	/* drop the array of the previous snapshot, then adopt the new one */
+	if (gxip != NULL)
+		free(gxip);
+	gxip = xip;
+
+	gxcnt = xcnt;
+	gxmax = xmax;
+	gxmin = xmin;
+	snapshot_source = SNAPSHOT_COORDINATOR;
+
+	elog (DEBUG1, "global snapshot info: gxmin: %d, gxmax: %d, gxcnt: %d", gxmin, gxmax, gxcnt);
+}
+
+/*
+ * Forget any stored global snapshot.
+ *
+ * Frees the cached xip array, resets the cached xmin/xmax/xcnt values and
+ * sets the snapshot source back to SNAPSHOT_UNDEFINED.  After this call
+ * GetSnapshotDataDataNode() no longer hands out a stored snapshot until new
+ * data has been set or fetched.
+ */
+void
+UnsetGlobalSnapshotData(void)
+{
+	if (gxip != NULL)
+	{
+		free(gxip);
+		gxip = NULL;
+	}
+
+	gxcnt = 0;
+	gxmax = InvalidTransactionId;
+	gxmin = InvalidTransactionId;
+	snapshot_source = SNAPSHOT_UNDEFINED;
+
+	elog (DEBUG1, "unset snapshot info");
+}
+
+/*
+ * Get snapshot data for data node
+ *
+ * The snapshot is usually the one passed down from the coordinator (see
+ * SetGlobalSnapshotData).  Autovacuum workers, and backends for which
+ * GetForceXidFromGTM() is true, fetch a fresh snapshot from GTM instead and
+ * cache it in gxmin/gxmax/gxcnt/gxip.
+ *
+ * Returns true if "snapshot" was filled from the cached global snapshot, so
+ * the caller can return it immediately.  Returns false if no usable global
+ * snapshot is available and the caller has to build one itself.
+ *
+ * Raises ERROR if GTM does not deliver a snapshot or memory runs out.
+ */
+static bool
+GetSnapshotDataDataNode(Snapshot snapshot)
+{
+	Assert(IS_PGXC_DATANODE);
+
+	if (IsAutoVacuumWorkerProcess() || GetForceXidFromGTM())
+	{
+		GTM_Snapshot gtm_snapshot;
+		bool		canbe_grouped = (!FirstSnapshotSet) || (!IsXactIsoLevelSerializable);
+		TransactionId cur_xid = GetCurrentTransactionId();
+		int			new_xcnt;
+		int		   *new_xip = NULL;
+
+		/* XIDs are unsigned, so print with %u */
+		elog(DEBUG1, "Getting snapshot for autovacuum. Current XID = %u", cur_xid);
+		gtm_snapshot = GetSnapshotGTM(cur_xid, canbe_grouped);
+
+		if (!gtm_snapshot)
+			ereport(ERROR,
+				(errcode(ERRCODE_CONNECTION_FAILURE),
+				errmsg("GTM error, could not obtain snapshot")));
+
+		/*
+		 * Build the private copy of the xid array before touching any of the
+		 * cached globals, so that an out-of-memory error cannot leave them
+		 * half updated (gxcnt > 0 with gxip == NULL).
+		 */
+		new_xcnt = gtm_snapshot->sn_xcnt;
+		if (new_xcnt > 0)
+		{
+			new_xip = malloc(new_xcnt * sizeof(*new_xip));
+			if (new_xip == NULL)
+				ereport(ERROR,
+						(errcode(ERRCODE_OUT_OF_MEMORY),
+						 errmsg("out of memory")));
+			memcpy(new_xip, gtm_snapshot->sn_xip, new_xcnt * sizeof(*new_xip));
+		}
+
+		/* everything is in place: replace the cached snapshot */
+		if (gxip)
+			free(gxip);
+		gxip = new_xip;
+		gxcnt = new_xcnt;
+		gxmin = gtm_snapshot->sn_xmin;
+		gxmax = gtm_snapshot->sn_xmax;
+		RecentGlobalXmin = gtm_snapshot->sn_recent_global_xmin;
+		snapshot_source = SNAPSHOT_DIRECT;
+
+		elog(DEBUG1, "for autovacuum from GTM: xmin = %d, xmax = %d, xcnt = %d, RecGlobXmin = %d",
+			gxmin, gxmax, gxcnt, RecentGlobalXmin);
+	}
+
+	if ((snapshot_source == SNAPSHOT_COORDINATOR || snapshot_source == SNAPSHOT_DIRECT)
+				&& TransactionIdIsValid(gxmin))
+	{
+		/*
+		 * The global snapshot may list more running transactions than this
+		 * node has proc slots, so size the array for whichever is larger.
+		 */
+		int			needed = Max(procArray->maxProcs, gxcnt);
+
+		snapshot->xmin = gxmin;
+		snapshot->xmax = gxmax;
+		snapshot->xcnt = gxcnt;
+
+		/*
+		 * The xip/subxip arrays are allocated once per snapshot struct and
+		 * then reused.  (This relies on the fact that all callers pass static
+		 * SnapshotData structs.)  max_xcnt records the capacity of xip, the
+		 * same way GetSnapshotDataCoordinator() does.
+		 */
+		if (snapshot->xip == NULL)
+		{
+			/*
+			 * First call for this snapshot
+			 */
+			snapshot->xip = (TransactionId *)
+				malloc(needed * sizeof(TransactionId));
+			if (snapshot->xip == NULL)
+				ereport(ERROR,
+						(errcode(ERRCODE_OUT_OF_MEMORY),
+						 errmsg("out of memory")));
+			snapshot->max_xcnt = needed;
+
+			Assert(snapshot->subxip == NULL);
+			snapshot->subxip = (TransactionId *)
+				malloc(procArray->maxProcs * PGPROC_MAX_CACHED_SUBXIDS * sizeof(TransactionId));
+			if (snapshot->subxip == NULL)
+				ereport(ERROR,
+						(errcode(ERRCODE_OUT_OF_MEMORY),
+						 errmsg("out of memory")));
+		}
+		else if (snapshot->max_xcnt < gxcnt)
+		{
+			/*
+			 * Existing array is (or may be) too small for this snapshot.
+			 * Grow through a temporary so the old array is not lost if
+			 * realloc fails.
+			 */
+			TransactionId *bigger = (TransactionId *)
+				realloc(snapshot->xip, needed * sizeof(TransactionId));
+
+			if (bigger == NULL)
+				ereport(ERROR,
+						(errcode(ERRCODE_OUT_OF_MEMORY),
+						 errmsg("out of memory")));
+			snapshot->xip = bigger;
+			snapshot->max_xcnt = needed;
+		}
+
+		/* gxip is NULL when the snapshot lists no running transactions */
+		if (gxcnt > 0)
+			memcpy(snapshot->xip, gxip, gxcnt * sizeof(TransactionId));
+		snapshot->curcid = GetCurrentCommandId(false);
+
+		if (!TransactionIdIsValid(MyProc->xmin))
+			MyProc->xmin = TransactionXmin = gxmin;
+
+		/*
+		 * RecentXmin is updated from the global xmin here.  An earlier note
+		 * on this assignment mentioned unresolved issues with it.
+		 *
+		 * !!TODO
+		 */
+		RecentXmin = gxmin;
+
+		/* PGXCTODO - set this until we handle subtransactions. */
+		snapshot->subxcnt = 0;
+
+		/*
+		 * This is a new snapshot, so set both refcounts to zero, and mark it
+		 * as not copied in persistent memory.
+		 */
+		snapshot->active_count = 0;
+		snapshot->regd_count = 0;
+		snapshot->copied = false;
+
+		return true;
+	}
+	return false;
+}
+
+/*
+ * Get snapshot data for coordinator
+ * It will later be passed down to data nodes
+ *
+ * Fetches a snapshot from GTM and copies it into "snapshot", allocating or
+ * growing snapshot->xip as needed (capacity is tracked in max_xcnt).
+ *
+ * Returns true when "snapshot" has been filled, i.e. the caller can return
+ * it immediately.  A failure to obtain the snapshot from GTM, or to allocate
+ * memory, is reported with ereport(ERROR) rather than by returning false.
+ */
+static bool
+GetSnapshotDataCoordinator(Snapshot snapshot)
+{
+	bool		canbe_grouped;
+	GTM_Snapshot gtm_snapshot;
+	int			needed;
+
+	Assert (IS_PGXC_COORDINATOR);
+
+	canbe_grouped = (!FirstSnapshotSet) || (!IsXactIsoLevelSerializable);
+	gtm_snapshot = GetSnapshotGTM(GetCurrentGlobalTransactionId(), canbe_grouped);
+
+	if (!gtm_snapshot)
+		ereport(ERROR,
+			(errcode(ERRCODE_CONNECTION_FAILURE),
+			errmsg("GTM error, could not obtain snapshot")));
+
+	snapshot->xmin = gtm_snapshot->sn_xmin;
+	snapshot->xmax = gtm_snapshot->sn_xmax;
+	snapshot->recent_global_xmin = gtm_snapshot->sn_recent_global_xmin;
+	snapshot->xcnt = gtm_snapshot->sn_xcnt;
+	elog(DEBUG1, "from GTM: xmin = %d, xmax = %d, xcnt = %d, RecGlobXmin = %d",
+		snapshot->xmin, snapshot->xmax, snapshot->xcnt, snapshot->recent_global_xmin);
+
+	/*
+	 * The global snapshot may list more running transactions than this node
+	 * has proc slots, so size the array for whichever is larger.
+	 */
+	needed = Max(procArray->maxProcs, gtm_snapshot->sn_xcnt);
+
+	/*
+	 * The xip/subxip arrays are allocated once per snapshot struct and then
+	 * reused.  (This relies on the fact that all callers pass static
+	 * SnapshotData structs.)
+	 */
+	if (snapshot->xip == NULL)
+	{
+		/*
+		 * First call for this snapshot
+		 */
+		snapshot->xip = (TransactionId *)
+			malloc(needed * sizeof(TransactionId));
+		if (snapshot->xip == NULL)
+			ereport(ERROR,
+					(errcode(ERRCODE_OUT_OF_MEMORY),
+					 errmsg("out of memory")));
+		snapshot->max_xcnt = needed;
+
+		/*
+		 * FIXME
+		 *
+		 * We really don't support subtransaction in PGXC right now, but
+		 * when we would, we should fix the allocation below
+		 */
+		Assert(snapshot->subxip == NULL);
+		snapshot->subxip = (TransactionId *)
+			malloc(procArray->maxProcs * PGPROC_MAX_CACHED_SUBXIDS * sizeof(TransactionId));
+
+		if (snapshot->subxip == NULL)
+			ereport(ERROR,
+					(errcode(ERRCODE_OUT_OF_MEMORY),
+					 errmsg("out of memory")));
+	}
+	else if (snapshot->max_xcnt < gtm_snapshot->sn_xcnt)
+	{
+		/*
+		 * Grow through a temporary: assigning realloc's result straight to
+		 * snapshot->xip would lose the old array on failure and leave
+		 * xip == NULL next to a non-NULL subxip, which the first-call branch
+		 * above does not expect.
+		 */
+		TransactionId *bigger = (TransactionId *)
+			realloc(snapshot->xip, needed * sizeof(TransactionId));
+
+		if (bigger == NULL)
+			ereport(ERROR,
+					(errcode(ERRCODE_OUT_OF_MEMORY),
+					 errmsg("out of memory")));
+		snapshot->xip = bigger;
+		snapshot->max_xcnt = needed;
+	}
+
+	/* nothing to copy (and possibly no source array) for an empty snapshot */
+	if (gtm_snapshot->sn_xcnt > 0)
+		memcpy(snapshot->xip, gtm_snapshot->sn_xip,
+			   gtm_snapshot->sn_xcnt * sizeof(TransactionId));
+	snapshot->curcid = GetCurrentCommandId(false);
+
+	if (!TransactionIdIsValid(MyProc->xmin))
+		MyProc->xmin = TransactionXmin = snapshot->xmin;
+
+	/*
+	 * We should update RecentXmin here. But we have recently seen some
+	 * issues with that - so skipping it for the time being.
+	 *
+	 * !!TODO
+	 */
+
+	/* PGXCTODO - set this until we handle subtransactions. */
+	snapshot->subxcnt = 0;
+
+	/*
+	 * This is a new snapshot, so set both refcounts to zero, and mark it
+	 * as not copied in persistent memory.
+	 */
+	snapshot->active_count = 0;
+	snapshot->regd_count = 0;
+	snapshot->copied = false;
+
+	return true;
+}
+#endif /* PGXC */
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 43e912f5cf..34b63041d1 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -5,6 +5,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  *
  * IDENTIFICATION
@@ -71,7 +72,16 @@
 #include "utils/snapmgr.h"
 #include "mb/pg_wchar.h"
 
-
+#ifdef PGXC
+#include "storage/procarray.h"
+#include "pgxc/pgxc.h"
+#include "access/gtm.h"
+/* PGXC_COORD */
+#include "pgxc/planner.h"
+#include "pgxc/datanode.h"
+/* PGXC_DATANODE */
+#include "access/transam.h" 
+#endif 
 extern int	optind;
 extern char *optarg;
 
@@ -185,6 +195,27 @@ static void SigHupHandler(SIGNAL_ARGS);
 static void log_disconnections(int code, Datum arg);
 
 
+#ifdef PGXC /* PGXC_DATANODE */
+static void pgxc_transaction_stmt (Node *parsetree);
+static List * pgxc_execute_direct (Node *parsetree, List *querytree_list, CommandDest dest, bool snapshot_set, bool *exec_on_coord);
+
+/* ----------------------------------------------------------------
+ *		PG-XC routines
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * Exit callback for a data node backend (signature as required by
+ * on_proc_exit; neither argument is used).
+ *
+ * Closes the GTM connection, but only when the current process is an
+ * autovacuum worker.
+ *
+ * NOTE(review): the only registration visible in this file is in
+ * PostgresMain() -- confirm that autovacuum workers actually reach it, and
+ * whether backends that contacted GTM directly should close it as well.
+ */
+static void
+DataNodeShutdown (int code, Datum arg)
+{
+	/* nothing to release unless this is an autovacuum worker */
+	if (!IsAutoVacuumWorkerProcess())
+		return;
+
+	CloseGTM();
+}
+#endif
+
 /* ----------------------------------------------------------------
  *		routines to obtain user input
  * ----------------------------------------------------------------
@@ -398,6 +429,11 @@ SocketBackend(StringInfo inBuf)
 						(errcode(ERRCODE_PROTOCOL_VIOLATION),
 						 errmsg("invalid frontend message type %d", qtype)));
 			break;
+#ifdef PGXC /* PGXC_DATANODE */
+		case 'g':
+		case 's':
+			break;
+#endif 
 
 		default:
 
@@ -780,7 +816,6 @@ exec_simple_query(const char *query_string)
 	bool		isTopLevel;
 	char		msec_str[32];
 
-
 	/*
 	 * Report query to various monitoring facilities.
 	 */
@@ -863,6 +898,22 @@ exec_simple_query(const char *query_string)
 		Portal		portal;
 		DestReceiver *receiver;
 		int16		format;
+#ifdef PGXC 
+		Query_Plan  *query_plan;
+		Query_Step  *query_step;
+		bool 		exec_on_coord;
+
+
+		/* 
+		 * By default we do not want data nodes to contact GTM directly,
+		 * it should get this information passed down to it.
+		 */
+		if (IS_PGXC_DATANODE)
+			SetForceXidFromGTM(false);
+
+		exec_on_coord = true;
+		query_plan = NULL;
+#endif
 
 		/*
 		 * Get the command name for use in status display (it also becomes the
@@ -917,15 +968,53 @@ exec_simple_query(const char *query_string)
 		querytree_list = pg_analyze_and_rewrite(parsetree, query_string,
 												NULL, 0);
 
-		plantree_list = pg_plan_queries(querytree_list, 0, NULL);
+#ifdef PGXC /* PGXC_COORD */
+		if (IS_PGXC_COORDINATOR)
+		{
+			if (IsA(parsetree, TransactionStmt))
+				pgxc_transaction_stmt(parsetree);
+
+			else if (IsA(parsetree, ExecDirectStmt))
+				querytree_list = pgxc_execute_direct(parsetree, querytree_list, dest, snapshot_set, &exec_on_coord);
+
+			else 
+			{
+				query_plan = GetQueryPlan(parsetree, query_string, querytree_list);
+			
+				exec_on_coord = query_plan->exec_loc_type & EXEC_ON_COORD;
+			}
+
+			/* First execute on the coordinator, if involved (DDL),  then data nodes */
+		}
+
+		if ((IS_PGXC_COORDINATOR && exec_on_coord) || IS_PGXC_DATANODE)
+#endif
+			plantree_list = pg_plan_queries(querytree_list, 0, NULL);
 
 		/* Done with the snapshot used for parsing/planning */
+#ifdef PGXC
+		/* In PG-XC, hold on to it a bit longer */
+#else
 		if (snapshot_set)
 			PopActiveSnapshot();
+#endif
 
 		/* If we got a cancel signal in analysis or planning, quit */
 		CHECK_FOR_INTERRUPTS();
 
+#ifdef PGXC 
+		/* PGXC_DATANODE */
+		/* Force getting Xid from GTM if not autovacuum, but a vacuum */
+		if (IS_PGXC_DATANODE && IsA(parsetree, VacuumStmt) && IsPostmasterEnvironment)
+			SetForceXidFromGTM(true);
+
+		/* PGXC_COORD */
+		/* Data nodes always go through the Portal code below */
+		/* Skip the Portal stuff on coordinator if command only executes on data nodes */
+		if ((IS_PGXC_COORDINATOR && exec_on_coord) || IS_PGXC_DATANODE)
+		{
+#endif
+
 		/*
 		 * Create unnamed portal to run the query or queries in. If there
 		 * already is one, silently drop it.
@@ -999,6 +1088,33 @@ exec_simple_query(const char *query_string)
 
 		PortalDrop(portal, false);
 
+#ifdef PGXC 
+		}
+
+		/* PGXC_COORD */
+		/* If the coordinator ran ok, now run on the data nodes if planned */
+		if (IS_PGXC_COORDINATOR)
+		{
+			if (query_plan && (query_plan->exec_loc_type & EXEC_ON_DATA_NODES))
+			{
+				query_step = linitial(query_plan->query_step_list);
+
+				DataNodeExec(query_step->sql_statement, 
+						query_step->nodelist, 
+						dest, 
+						snapshot_set ? GetActiveSnapshot() : GetTransactionSnapshot(),
+						query_plan->force_autocommit,
+						query_step->simple_aggregates,
+						IsA(parsetree, SelectStmt));
+			}
+	
+			FreeQueryPlan(query_plan);
+		}
+
+		if (snapshot_set)
+			PopActiveSnapshot();
+#endif /* PGXC_COORD */
+
 		if (IsA(parsetree, TransactionStmt))
 		{
 			/*
@@ -1029,6 +1145,11 @@ exec_simple_query(const char *query_string)
 			 */
 			CommandCounterIncrement();
 		}
+#ifdef PGXC /* PGXC_COORD */
+		/* In case of PGXC handling client already received a response */
+		if ((IS_PGXC_COORDINATOR && exec_on_coord) || IS_PGXC_DATANODE)
+		{
+#endif
 
 		/*
 		 * Tell client that we're done with this query.  Note we emit exactly
@@ -1037,6 +1158,9 @@ exec_simple_query(const char *query_string)
 		 * aborted by error will not send an EndCommand report at all.)
 		 */
 		EndCommand(completionTag, dest);
+#ifdef PGXC /* PGXC_COORD */
+		}
+#endif
 	}							/* end loop over parsetrees */
 
 	/*
@@ -2868,6 +2992,14 @@ PostgresMain(int argc, char *argv[], const char *username)
 	sigjmp_buf	local_sigjmp_buf;
 	volatile bool send_ready_for_query = true;
 
+#ifdef PGXC /* PGXC_DATANODE */
+	/* Snapshot info */
+	int 			xmin;
+	int 			xmax;
+	int				xcnt;
+	int 			*xip;
+#endif
+
 #define PendingConfigOption(name,val) \
 	(guc_names = lappend(guc_names, pstrdup(name)), \
 	 guc_values = lappend(guc_values, pstrdup(val)))
@@ -2948,7 +3080,11 @@ PostgresMain(int argc, char *argv[], const char *username)
 	 * postmaster/postmaster.c (the option sets should not conflict) and with
 	 * the common help() function in main/main.c.
 	 */
+#ifdef PGXC
+	while ((flag = getopt(argc, argv, "A:B:Cc:D:d:EeFf:h:ijk:lN:nOo:Pp:r:S:sTt:v:W:Xy:-:")) != -1)
+#else
 	while ((flag = getopt(argc, argv, "A:B:c:D:d:EeFf:h:ijk:lN:nOo:Pp:r:S:sTt:v:W:y:-:")) != -1)
+#endif
 	{
 		switch (flag)
 		{
@@ -2960,6 +3096,12 @@ PostgresMain(int argc, char *argv[], const char *username)
 				SetConfigOption("shared_buffers", optarg, ctx, gucsource);
 				break;
 
+#ifdef PGXC
+			case 'C':
+				isPGXCCoordinator = true;
+				break;
+#endif 
+
 			case 'D':
 				if (secure)
 					userDoption = optarg;
@@ -3082,7 +3224,11 @@ PostgresMain(int argc, char *argv[], const char *username)
 				SetConfigOption("post_auth_delay", optarg, ctx, gucsource);
 				break;
 
-
+#ifdef PGXC
+			case 'X':
+				isPGXCDataNode = true;
+				break;
+#endif 
 			case 'y':
 
 				/*
@@ -3140,6 +3286,24 @@ PostgresMain(int argc, char *argv[], const char *username)
 		}
 	}
 
+#ifdef PGXC
+	/*
+	 * Make sure we specified the mode if Coordinator or Data Node.
+	 * Allow for the exception of initdb by checking config option
+	 */
+	if (!IS_PGXC_COORDINATOR && !IS_PGXC_DATANODE && IsUnderPostmaster)
+	{
+		ereport(FATAL,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+			 errmsg("PG-XC: must start as either a Coordinator (-C) or Data Node (-X)\n")));
+	}
+	if (!IsPostmasterEnvironment)
+	{
+		/* Treat it as a data node for initdb to work properly */
+		isPGXCDataNode = true;
+	}
+#endif
+
 	/*
 	 * Process any additional GUC variable settings passed in startup packet.
 	 * These are handled exactly like command-line variables.
@@ -3511,6 +3675,19 @@ PostgresMain(int argc, char *argv[], const char *username)
 	if (!ignore_till_sync)
 		send_ready_for_query = true;	/* initially, or after error */
 
+#ifdef PGXC /* PGXC_COORD */
+	if (IS_PGXC_COORDINATOR)
+	{
+		InitMultinodeExecutor();
+		/* If we exit, first try and clean connections and send to pool */
+		on_proc_exit (DataNodeCleanAndRelease, 0);
+	}
+	if (IS_PGXC_DATANODE)
+	{
+		/* If we exit, first try and clean connection to GTM */
+		on_proc_exit (DataNodeShutdown, 0);
+	}
+#endif
 	/*
 	 * Non-error queries loop here.
 	 */
@@ -3560,6 +3737,15 @@ PostgresMain(int argc, char *argv[], const char *username)
 			}
 
 			ReadyForQuery(whereToSendOutput);
+#ifdef PGXC
+			/* 
+			 * Helps us catch any problems where we did not send down a snapshot 
+			 * when it was expected. 
+			 */
+			if (IS_PGXC_DATANODE)
+				UnsetGlobalSnapshotData();
+#endif
+
 			send_ready_for_query = false;
 		}
 
@@ -3832,6 +4018,42 @@ PostgresMain(int argc, char *argv[], const char *username)
 				 * is still sending data.
 				 */
 				break;
+#ifdef PGXC /* PGXC_DATANODE */
+			case 'g':			/* gxid */
+				{
+					/* Set the GXID we were passed down */
+					TransactionId gxid = (TransactionId) pq_getmsgint(&input_message, 4);
+					elog(DEBUG1, "Received new gxid %u", gxid);
+					SetNextTransactionId(gxid);
+					pq_getmsgend(&input_message);
+				}
+				break;
+
+			case 's':			/* snapshot */
+				/* Set the snapshot we were passed down */
+				xmin = pq_getmsgint(&input_message, 4);
+				xmax = pq_getmsgint(&input_message, 4);
+				RecentGlobalXmin = pq_getmsgint(&input_message, 4);
+				xcnt = pq_getmsgint(&input_message, 4);
+				if (xcnt > 0)
+				{
+					int i;
+					xip = malloc(xcnt * 4);
+					if (xip == NULL)
+					{
+						ereport(ERROR,
+								(errcode(ERRCODE_OUT_OF_MEMORY),
+								 errmsg("out of memory")));
+					}
+					for (i = 0; i < xcnt; i++)
+					       xip[i] = pq_getmsgint(&input_message, 4);
+				}
+				else
+					xip = NULL;
+				pq_getmsgend(&input_message);
+				SetGlobalSnapshotData(xmin, xmax, xcnt, xip);
+				break;
+#endif /* PGXC */
 
 			default:
 				ereport(FATAL,
@@ -4023,3 +4245,117 @@ log_disconnections(int code, Datum arg)
 					port->user_name, port->database_name, port->remote_host,
 				  port->remote_port[0] ? " port=" : "", port->remote_port)));
 }
+
+
+#ifdef PGXC
+/*
+ * pgxc_transaction_stmt
+ *		Coordinator-side handling of BEGIN / COMMIT / ROLLBACK: call the
+ *		matching DataNode* routine; every other statement is left alone.
+ */
+void
+pgxc_transaction_stmt (Node *parsetree)
+{
+	TransactionStmt *xact_stmt;
+
+	Assert(IS_PGXC_COORDINATOR);
+
+	/* Nothing to do unless this really is a transaction statement */
+	if (!IsA(parsetree, TransactionStmt))
+		return;
+
+	xact_stmt = (TransactionStmt *) parsetree;
+
+	switch (xact_stmt->kind)
+	{
+		case TRANS_STMT_BEGIN:
+			/* No BEGIN goes out here; it is sent on demand as data nodes are added */
+			DataNodeBegin();
+			break;
+
+		case TRANS_STMT_COMMIT:
+			DataNodeCommit(DestNone);
+			break;
+
+		case TRANS_STMT_ROLLBACK:
+			DataNodeRollback(DestNone);
+			break;
+
+		default:
+			/* Remaining statement kinds are ignored in this prototype */
+			break;
+	}
+}
+
+
+/*
+ * pgxc_execute_direct
+ *		Handle EXECUTE DIRECT on the coordinator.
+ * The inner query is sent to every data node listed in the statement.  If
+ * the statement also targets the coordinator, and DataNodeExec() did not
+ * return nonzero, the inner query is parsed and analyzed here and that query
+ * tree list is returned; otherwise querytree_list comes back unchanged.
+ * *exec_on_coord is set to tell the caller which of the two happened.
+ */
+List *
+pgxc_execute_direct (Node *parsetree, List *querytree_list, CommandDest dest, bool snapshot_set, bool *exec_on_coord)
+{
+	ExecDirectStmt *execdirect;
+	List	   *node_list = NIL;
+	List	   *parsetree_list;
+	ListCell   *node_cell;
+	bool		on_coord;
+
+	Assert(IS_PGXC_COORDINATOR);
+	Assert(IsA(parsetree, ExecDirectStmt));
+
+	/* Only look inside the node once its type has been checked */
+	execdirect = (ExecDirectStmt *) parsetree;
+	on_coord = execdirect->coordinator;
+
+	/* Convert the parser's list of Value nodes into a plain integer list */
+	foreach (node_cell, execdirect->nodes)
+		node_list = lappend_int(node_list, intVal(lfirst(node_cell)));
+
+	if (node_list != NIL)
+	{
+		/* A nonzero result from the data nodes cancels coordinator execution */
+		if (DataNodeExec(execdirect->query,
+						 node_list,
+						 dest,
+						 snapshot_set ? GetActiveSnapshot() : GetTransactionSnapshot(),
+						 FALSE,
+						 FALSE,
+						 FALSE) != 0)
+			on_coord = false;
+	}
+
+	if (on_coord)
+	{
+		/*
+		 * Parse the inner statement.  The wrapper trees are not released
+		 * here, since the message context is deleted later, and no context
+		 * switch is needed because the current context is already the
+		 * MessageContext.  The inner command is deliberately neither logged
+		 * nor displayed.
+		 */
+		parsetree_list = pg_parse_query(execdirect->query);
+
+		/*
+		 * Complex commands (those that expand to several parse trees) are
+		 * not supported within EXECUTE DIRECT.
+		 */
+		if (list_length(parsetree_list) != 1)
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("Can not execute %s with EXECUTE DIRECT",
+							execdirect->query)));
+
+		/* Analyze and rewrite the single inner parse tree */
+		querytree_list = pg_analyze_and_rewrite((Node *) linitial(parsetree_list),
+												execdirect->query, NULL, 0);
+	}
+	*exec_on_coord = on_coord;
+	return querytree_list;
+}
+#endif /* PGXC */
diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c
index f51f90f86b..28041c6305 100644
--- a/src/backend/tcop/utility.c
+++ b/src/backend/tcop/utility.c
@@ -2069,6 +2069,10 @@ CreateCommandTag(Node *parsetree)
 				}
 			}
 			break;
+		
+		case T_ExecDirectStmt:
+			tag = "EXECUTE DIRECT";
+			break;
 
 		default:
 			elog(WARNING, "unrecognized node type: %d",
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index 775865d569..47ee10e682 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -5,6 +5,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  *
  * IDENTIFICATION
@@ -33,6 +34,9 @@
 #include "access/heapam.h"
 #include "access/reloptions.h"
 #include "access/sysattr.h"
+#ifdef PGXC
+#include "access/transam.h"
+#endif
 #include "access/xact.h"
 #include "catalog/catalog.h"
 #include "catalog/index.h"
@@ -54,6 +58,9 @@
 #include "optimizer/planmain.h"
 #include "optimizer/prep.h"
 #include "optimizer/var.h"
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+#endif
 #include "rewrite/rewriteDefine.h"
 #include "storage/fd.h"
 #include "storage/lmgr.h"
@@ -856,6 +863,10 @@ RelationBuildDesc(Oid targetRelId, Relation oldrelation)
 	else
 		relation->trigdesc = NULL;
 
+#ifdef PGXC
+	if (IS_PGXC_COORDINATOR && relation->rd_id >= FirstNormalObjectId)
+		RelationBuildLocator(relation);
+#endif
 	/*
 	 * if it's an index, initialize index-related information
 	 */
diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c
index 922c4a626f..5b70df1924 100644
--- a/src/backend/utils/cache/syscache.c
+++ b/src/backend/utils/cache/syscache.c
@@ -5,6 +5,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  *
  * IDENTIFICATION
@@ -49,6 +50,9 @@
 #include "catalog/pg_ts_template.h"
 #include "catalog/pg_type.h"
 #include "catalog/pg_user_mapping.h"
+#ifdef PGXC
+#include "catalog/pgxc_class.h"
+#endif
 #include "utils/rel.h"
 #include "utils/syscache.h"
 
@@ -524,6 +528,20 @@ static const struct cachedesc cacheinfo[] = {
 		},
 		64
 	},
+#ifdef PGXC
+	{PgxcClassRelationId,	/* PGXCCLASSRELID */
+		PgxcClassPgxcRelIdIndexId,
+		Anum_pgxc_class_pcrelid,
+		1,
+		{
+			ObjectIdAttributeNumber,
+			0,
+			0,
+			0
+		},
+		1024
+	},
+#endif
 	{ProcedureRelationId,		/* PROCNAMEARGSNSP */
 		ProcedureNameArgsNspIndexId,
 		0,
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index 7bdfb67204..7063f6f5f6 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -50,7 +50,6 @@ ProcessingMode Mode = InitProcessing;
 /* Note: we rely on this to initialize as zeroes */
 static char socketLockFile[MAXPGPATH];
 
-
 /* ----------------------------------------------------------------
  *		ignoring system indexes support stuff
  *
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 210bd6ba6a..c9f0a63418 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -7,6 +7,7 @@
  *
  *
  * Copyright (c) 2000-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  * Written by Peter Eisentraut <[email protected]>.
  *
  * IDENTIFICATION
@@ -27,6 +28,9 @@
 #endif
 
 #include "access/gin.h"
+#ifdef PGXC
+#include "access/gtm.h"
+#endif
 #include "access/transam.h"
 #include "access/twophase.h"
 #include "access/xact.h"
@@ -50,6 +54,11 @@
 #include "parser/parse_type.h"
 #include "parser/scansup.h"
 #include "pgstat.h"
+#ifdef PGXC
+#include "pgxc/locator.h"
+#include "pgxc/planner.h"
+#include "pgxc/poolmgr.h"
+#endif
 #include "postmaster/autovacuum.h"
 #include "postmaster/bgwriter.h"
 #include "postmaster/postmaster.h"
@@ -532,6 +541,12 @@ const char *const config_group_names[] =
 	gettext_noop("Customized Options"),
 	/* DEVELOPER_OPTIONS */
 	gettext_noop("Developer Options"),
+#ifdef PGXC
+	/* DATA_NODES */
+	gettext_noop("Data Nodes and Connection Pooling"),
+	/* GTM */
+	gettext_noop("GTM Connection"),
+#endif
 	/* help_config wants this array to be null-terminated */
 	NULL
 };
@@ -1220,7 +1235,38 @@ static struct config_bool ConfigureNamesBool[] =
 		&IgnoreSystemIndexes,
 		false, NULL, NULL
 	},
-
+#ifdef PGXC
+	{
+		{"persistent_datanode_connections", PGC_BACKEND, DEVELOPER_OPTIONS,
+			gettext_noop("Session never releases acquired connections."),
+			NULL,
+			GUC_NOT_IN_SAMPLE
+		},
+		&PersistentConnections,
+		false, NULL, NULL
+	},
+	{
+		{"strict_statement_checking", PGC_USERSET, DEVELOPER_OPTIONS,
+			gettext_noop("Forbid statements that are not safe for the cluster"),
+			NULL
+		},
+		&StrictStatementChecking,
+		true, NULL, NULL
+	},
+	{
+		/*
+		 * This is temporary work-around until we allow for a merge-sort of
+		 * ORDER BY.
+		 */
+		{"strict_select_checking", PGC_USERSET, DEVELOPER_OPTIONS,
+			gettext_noop("Forbid if SELECT has ORDER BY"),
+			gettext_noop("and is not safe for the cluster"),
+			GUC_NOT_IN_SAMPLE	
+		},
+		&StrictSelectChecking,
+		false, NULL, NULL
+	},
+#endif
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL
@@ -1255,7 +1301,7 @@ static struct config_int ConfigureNamesInt[] =
 			gettext_noop("This applies to table columns that have not had a "
 				"column-specific target set via ALTER TABLE SET STATISTICS.")
 		},
-		&default_statistics_target,
+		&default_statistics_target, 
 		100, 1, 10000, NULL, NULL
 	},
 	{
@@ -1504,7 +1550,11 @@ static struct config_int ConfigureNamesInt[] =
 			NULL
 		},
 		&max_prepared_xacts,
+#ifdef PGXC
+		10, 0, INT_MAX / 4, NULL, NULL
+#else
 		0, 0, INT_MAX / 4, NULL, NULL
+#endif
 	},
 
 #ifdef LOCK_DEBUG
@@ -1951,7 +2001,63 @@ static struct config_int ConfigureNamesInt[] =
 		&pgstat_track_activity_query_size,
 		1024, 100, 102400, NULL, NULL
 	},
+#ifdef PGXC
+	{
+		{"num_data_nodes", PGC_POSTMASTER, DATA_NODES,
+			gettext_noop("Number of data nodes."),
+			NULL
+		},
+		&NumDataNodes,
+		2, 1, 65535, NULL, NULL	
+	},	
 
+	{
+		{"min_pool_size", PGC_POSTMASTER, DATA_NODES,
+			gettext_noop("Initial pool size."),
+			gettext_noop("If number of active connections decreased below this value, "
+						 "new connections are established")
+		},
+		&MinPoolSize,
+		1, 1, 65535, NULL, NULL	
+	},	
+
+	{
+		{"max_pool_size", PGC_POSTMASTER, DATA_NODES,
+			gettext_noop("Max pool size."),
+			gettext_noop("If number of active connections reaches this value, "
+						 "other connection requests will be refused")
+		},
+		&MaxPoolSize,
+		100, 1, 65535, NULL, NULL	
+	},	
+
+	{
+		{"pooler_port", PGC_POSTMASTER, DATA_NODES,
+			gettext_noop("Port of the Pool Manager."),
+			NULL
+		},
+		&PoolerPort,
+		6667, 1, 65535, NULL, NULL	
+	},	
+
+	{
+		{"gtm_port", PGC_POSTMASTER, GTM,
+			gettext_noop("Port of GTM."),
+			NULL
+		},
+		&GtmPort,
+		6666, 1, 65535, NULL, NULL	
+	},	
+
+	{
+		{"gtm_coordinator_id", PGC_POSTMASTER, GTM,
+			gettext_noop("The Coordinator Identifier."),
+			NULL
+		},
+		&GtmCoordinatorId,
+		1, 1, INT_MAX, NULL, NULL	
+	},	
+#endif
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL
@@ -2502,6 +2608,65 @@ static struct config_string ConfigureNamesString[] =
 		"pg_catalog.simple", assignTSCurrentConfig, NULL
 	},
 
+#ifdef PGXC
+	{
+		{"preferred_data_nodes", PGC_POSTMASTER, DATA_NODES,
+			gettext_noop("Preferred data nodes."),
+			gettext_noop("A list of data nodes to read from replicated tables")
+		},
+		&PreferredDataNodes,
+		"", NULL, NULL	
+	},	
+
+	{
+		{"data_node_hosts", PGC_POSTMASTER, DATA_NODES,
+			gettext_noop("Host names or addresses of data nodes."),
+			gettext_noop("Comma separated list or single value, "
+						 "if all data nodes on the same host")
+		},
+		&DataNodeHosts,
+		"localhost", NULL, NULL
+	},
+
+	{
+		{"data_node_ports", PGC_POSTMASTER, DATA_NODES,
+			gettext_noop("Port numbers of data nodes."),
+			gettext_noop("Comma separated list or single value, "
+						 "if all data nodes listen on the same port")
+		},
+		&DataNodePorts,
+		"15432,25432", NULL, NULL
+	},
+
+	{							/* user name(s) used for the data nodes */
+		{"data_node_users", PGC_POSTMASTER, DATA_NODES,
+			gettext_noop("User names of data nodes."),
+			gettext_noop("Comma separated list or single value, "
+						 "if user names are the same on all data nodes")
+		},
+		&DataNodeUsers,
+		"postgres", NULL, NULL
+	},
+
+	{
+		{"data_node_passwords", PGC_POSTMASTER, DATA_NODES,
+			gettext_noop("Passwords of data nodes."),
+			gettext_noop("Comma separated list or single value, "
+						 "if passwords are the same on all data nodes")
+		},
+		&DataNodePwds,
+		"postgres", NULL, NULL
+	},
+
+	{
+		{"gtm_host", PGC_POSTMASTER, GTM,
+			gettext_noop("Host name or address of GTM"),
+			NULL
+		},
+		&GtmHost,
+		"localhost", NULL, NULL
+	},
+#endif
 #ifdef USE_SSL
 	{
 		{"ssl_ciphers", PGC_POSTMASTER, CONN_AUTH_SECURITY,
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 3f7b43f0cc..e46670cd91 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -106,7 +106,7 @@
 #shared_buffers = 32MB			# min 128kB
 					# (change requires restart)
 #temp_buffers = 8MB			# min 800kB
-#max_prepared_transactions = 0		# zero disables the feature
+#max_prepared_transactions = 10		# zero disables the feature
 					# (change requires restart)
 # Note:  Increasing max_prepared_transactions costs ~600 bytes of shared memory
 # per transaction slot, plus lock space (see max_locks_per_transaction).
@@ -490,9 +490,61 @@
 
 #transform_null_equals = off
 
+#------------------------------------------------------------------------------
+# DATA NODES AND CONNECTION POOLING
+#------------------------------------------------------------------------------
+
+#pooler_port = 6667			# Pool Manager TCP port
+					# (change requires restart)
+#num_data_nodes = 2			# Number of Data Nodes
+					# (change requires restart)
+#preferred_data_nodes = ''		# List of preferred Data Nodes to read from 
+					# replicated tables. If empty use all the data nodes
+					# (change requires restart)
+#min_pool_size = 1			# Initial pool size
+					# (change requires restart)
+#max_pool_size = 100			# Maximum pool size
+					# (change requires restart)
+#persistent_datanode_connections = off	# Set persistent connection mode for pooler
+					# if set to on, connections taken for the coordinator
+					# are not put back into the pool
+#data_node_hosts = 'localhost'		# Host names or addresses of data nodes
+					# (change requires restart)
+#data_node_ports = '15432,25432'	# Port numbers of data nodes
+					# (change requires restart)
+#data_node_users = 'postgres'		# User names of data nodes
+					# (change requires restart)
+#data_node_passwords = 'postgres'	# Passwords of data nodes
+					# (change requires restart)
+# Note: each data_node_... value should be either a single value, if the
+# respective parameter is the same on all nodes, or a comma-separated list with
+# no fewer entries than there are nodes, where each entry is the value for the
+# node with the respective number between 1 and num_data_nodes. If the list is
+# longer than num_data_nodes, the extra values are ignored.
 
 #------------------------------------------------------------------------------
+# GTM CONNECTION
+#------------------------------------------------------------------------------
+
+#gtm_host = 'localhost'			# Host name or address of GTM
+					# (change requires restart)
+#gtm_port = 6666			# Port of GTM
+					# (change requires restart)
+#gtm_coordinator_id = 1			# Coordinator identifier
+					# (change requires restart)
+
+#------------------------------------------------------------------------------
+# OTHER PG-XC OPTIONS
+#------------------------------------------------------------------------------
+#strict_statement_checking = on		# Forbid PG-XC-unsafe SQL
+					# Enabling is useful for development
+#strict_select_checking = off		# Temporary; be strict about allowing 
+					# multi-node ORDER BY
+
+
+#------------------------------------------------------------------------------
 # CUSTOMIZED OPTIONS
 #------------------------------------------------------------------------------
 
 #custom_variable_classes = ''		# list of custom variable class names
+
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 3fc9b3880e..0677b09660 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -40,6 +40,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  * Portions taken from FreeBSD.
  *
  * $PostgreSQL: pgsql/src/bin/initdb/initdb.c,v 1.172 2009/06/11 14:49:07 momjian Exp $
@@ -62,6 +63,7 @@
 #include "getopt_long.h"
 #include "miscadmin.h"
 
+#include "postgres.h"
 
 /*
  * these values are passed in by makefile defines
@@ -3179,14 +3181,34 @@ main(int argc, char *argv[])
 	strcpy(bin_dir, argv[0]);
 	get_parent_directory(bin_dir);
 
-	printf(_("\nSuccess. You can now start the database server using:\n\n"
-			 "    %s%s%spostgres%s -D %s%s%s\n"
+
+#ifdef PGXC
+	printf(_("\nSuccess.\n You can now start the database server of the Postgres-XC coordinator using:\n\n"
+			 "    %s%s%spostgres%s -C -D %s%s%s\n"
 			 "or\n"
-			 "    %s%s%spg_ctl%s -D %s%s%s -l logfile start\n\n"),
+			 "    %s%s%spg_ctl%s start -D %s%s%s -S coordinator -l logfile\n\n"
+			 " You can now start the database server of the Postgres-XC datanode using:\n\n"
+			 "    %s%s%spostgres%s -X -D %s%s%s\n"
+			 "or \n"
+			 "    %s%s%spg_ctl%s start -D %s%s%s -S datanode -l logfile\n\n"),
+	   QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
+		   QUOTE_PATH, pg_data_native, QUOTE_PATH,
+	   QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
+		   QUOTE_PATH, pg_data_native, QUOTE_PATH,
 	   QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
 		   QUOTE_PATH, pg_data_native, QUOTE_PATH,
 	   QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
 		   QUOTE_PATH, pg_data_native, QUOTE_PATH);
+#else							/* !PGXC: keep the stock PostgreSQL message */
+	printf(_("\nSuccess. You can now start the database server using:\n\n"
+			 "    %s%s%spostgres%s -D %s%s%s\n"
+			 "or\n"
+			 "    %s%s%spg_ctl%s -D %s%s%s -l logfile start\n\n"),
+	   QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
+		   QUOTE_PATH, pg_data_native, QUOTE_PATH,
+	   QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
+		   QUOTE_PATH, pg_data_native, QUOTE_PATH);
+#endif
 
 	return 0;
 }
diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c
index 40ede2c1a8..3e06bd4132 100644
--- a/src/bin/pg_ctl/pg_ctl.c
+++ b/src/bin/pg_ctl/pg_ctl.c
@@ -3,6 +3,7 @@
  * pg_ctl --- start/stops/restarts the PostgreSQL server
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * $PostgreSQL: pgsql/src/bin/pg_ctl/pg_ctl.c,v 1.111 2009/06/11 14:49:07 momjian Exp $
  *
@@ -58,8 +59,8 @@ typedef enum
 {
 	NO_COMMAND = 0,
 	START_COMMAND,
-	STOP_COMMAND,
 	RESTART_COMMAND,
+	STOP_COMMAND,
 	RELOAD_COMMAND,
 	STATUS_COMMAND,
 	KILL_COMMAND,
@@ -88,6 +89,9 @@ static char *register_username = NULL;
 static char *register_password = NULL;
 static char *argv0 = NULL;
 static bool allow_core_files = false;
+#ifdef PGXC
+static char *pgxcCommand = NULL;
+#endif
 
 static void
 write_stderr(const char *fmt,...)
@@ -357,12 +361,23 @@ start_postmaster(void)
 	 * everything to a shell to process them.
 	 */
 	if (log_file != NULL)
+#ifdef PGXC
+		snprintf(cmd, MAXPGPATH, SYSTEMQUOTE "\"%s\" %s %s%s < \"%s\" >> \"%s\" 2>&1 &" SYSTEMQUOTE,
+				postgres_path, pgxcCommand, pgdata_opt, post_opts,
+				DEVNULL, log_file);
+#else
 		snprintf(cmd, MAXPGPATH, SYSTEMQUOTE "\"%s\" %s%s < \"%s\" >> \"%s\" 2>&1 &" SYSTEMQUOTE,
 				 postgres_path, pgdata_opt, post_opts,
 				 DEVNULL, log_file);
+#endif
 	else
+#ifdef PGXC
+		snprintf(cmd, MAXPGPATH, SYSTEMQUOTE "\"%s\" %s %s%s < \"%s\" 2>&1 &" SYSTEMQUOTE,
+				postgres_path, pgxcCommand, pgdata_opt, post_opts, DEVNULL);
+#else
 		snprintf(cmd, MAXPGPATH, SYSTEMQUOTE "\"%s\" %s%s < \"%s\" 2>&1 &" SYSTEMQUOTE,
 				 postgres_path, pgdata_opt, post_opts, DEVNULL);
+#endif
 
 	return system(cmd);
 #else							/* WIN32 */
@@ -1520,16 +1535,22 @@ do_help(void)
 	printf(_("%s is a utility to start, stop, restart, reload configuration files,\n"
 			 "report the status of a PostgreSQL server, or signal a PostgreSQL process.\n\n"), progname);
 	printf(_("Usage:\n"));
+#ifdef PGXC
+	printf(_("  %s start   [-w] [-t SECS] [-S NODE-TYPE] [-D DATADIR] [-s] [-l FILENAME] [-o \"OPTIONS\"]\n"), progname);
+	printf(_("  %s restart [-w] [-t SECS] [-S NODE-TYPE] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n"
+		 	 "                 [-o \"OPTIONS\"]\n"), progname);
+#else
 	printf(_("  %s start   [-w] [-t SECS] [-D DATADIR] [-s] [-l FILENAME] [-o \"OPTIONS\"]\n"), progname);
-	printf(_("  %s stop    [-W] [-t SECS] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n"), progname);
 	printf(_("  %s restart [-w] [-t SECS] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n"
-			 "                 [-o \"OPTIONS\"]\n"), progname);
+		 	 "                 [-o \"OPTIONS\"]\n"), progname);
+#endif
+	printf(_("  %s stop    [-W] [-t SECS] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n"), progname);
 	printf(_("  %s reload  [-D DATADIR] [-s]\n"), progname);
 	printf(_("  %s status  [-D DATADIR]\n"), progname);
 	printf(_("  %s kill    SIGNALNAME PID\n"), progname);
 #if defined(WIN32) || defined(__CYGWIN__)
 	printf(_("  %s register   [-N SERVICENAME] [-U USERNAME] [-P PASSWORD] [-D DATADIR]\n"
-		 "                    [-w] [-t SECS] [-o \"OPTIONS\"]\n"), progname);
+		 	 "                    [-w] [-t SECS] [-o \"OPTIONS\"]\n"), progname);
 	printf(_("  %s unregister [-N SERVICENAME]\n"), progname);
 #endif
 
@@ -1537,6 +1558,9 @@ do_help(void)
 	printf(_("  -D, --pgdata DATADIR   location of the database storage area\n"));
 	printf(_("  -s, --silent           only print errors, no informational messages\n"));
 	printf(_("  -t SECS                seconds to wait when using -w option\n"));
+#ifdef PGXC
+	printf(_("  -S NODE-TYPE           can be \"coordinator\" or \"datanode\" (Postgres-XC)\n"));
+#endif
 	printf(_("  -w                     wait until operation completes\n"));
 	printf(_("  -W                     do not wait until operation completes\n"));
 	printf(_("  --help                 show this help, then exit\n"));
@@ -1715,7 +1739,11 @@ main(int argc, char **argv)
 	/* process command-line options */
 	while (optind < argc)
 	{
+#ifdef PGXC
+		while ((c = getopt_long(argc, argv, "cD:l:m:N:o:p:P:S:st:U:wW", long_options, &option_index)) != -1)
+#else
 		while ((c = getopt_long(argc, argv, "cD:l:m:N:o:p:P:st:U:wW", long_options, &option_index)) != -1)
+#endif
 		{
 			switch (c)
 			{
@@ -1759,6 +1787,23 @@ main(int argc, char **argv)
 				case 'P':
 					register_password = xstrdup(optarg);
 					break;
+#ifdef PGXC
+				case 'S':
+					/* -S NODE-TYPE selects the server switch passed on start/restart */
+					if (strcmp(optarg, "coordinator") == 0)
+						pgxcCommand = xstrdup("-C");
+					else if (strcmp(optarg, "datanode") == 0)
+						pgxcCommand = xstrdup("-X");
+					else
+					{
+						write_stderr(_("%s: unrecognized node type \"%s\"\n"),
+									 progname, optarg);
+						do_advice();
+						exit(1);
+					}
+					/* must not fall through into -s, which would set silent_mode */
+					break;
+#endif
 				case 's':
 					silent_mode = true;
 					break;
@@ -1808,13 +1843,12 @@ main(int argc, char **argv)
 				do_advice();
 				exit(1);
 			}
-
 			if (strcmp(argv[optind], "start") == 0)
 				ctl_command = START_COMMAND;
-			else if (strcmp(argv[optind], "stop") == 0)
-				ctl_command = STOP_COMMAND;
 			else if (strcmp(argv[optind], "restart") == 0)
 				ctl_command = RESTART_COMMAND;
+			else if (strcmp(argv[optind], "stop") == 0)
+				ctl_command = STOP_COMMAND;
 			else if (strcmp(argv[optind], "reload") == 0)
 				ctl_command = RELOAD_COMMAND;
 			else if (strcmp(argv[optind], "status") == 0)
@@ -1856,6 +1890,18 @@ main(int argc, char **argv)
 		exit(1);
 	}
 
+#ifdef PGXC
+	/* stop command does not need to have coordinator or datanode options */
+	if ((ctl_command == START_COMMAND || ctl_command == RESTART_COMMAND)
+		&& !pgxcCommand)
+	{
+		write_stderr(_("%s: coordinator or datanode option not specified (-S)\n"),
+					progname);
+		do_advice();
+		exit(1);
+	}
+#endif
+
 	/* Note we put any -D switch into the env var above */
 	pg_data = getenv("PGDATA");
 	if (pg_data)
@@ -1912,12 +1958,12 @@ main(int argc, char **argv)
 		case START_COMMAND:
 			do_start();
 			break;
-		case STOP_COMMAND:
-			do_stop();
-			break;
 		case RESTART_COMMAND:
 			do_restart();
 			break;
+		case STOP_COMMAND:
+			do_stop();
+			break;
 		case RELOAD_COMMAND:
 			do_reload();
 			break;
diff --git a/src/gtm/Makefile b/src/gtm/Makefile
new file mode 100644
index 0000000000..51c55e0dd5
--- /dev/null
+++ b/src/gtm/Makefile
@@ -0,0 +1,45 @@
+#-------------------------------------------------------------------------
+#
+# Makefile for src/gtm
+# GTM and GTM proxy
+#
+# Runs the requested target in each directory listed in WANTED_DIRS and
+# installs the gtm, gtm_ctl and gtm_proxy programs into $(bindir).
+#
+# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+#-------------------------------------------------------------------------
+
+PGFILEDESC = "gtm - Global Transaction Manager for Postgres-XC"
+subdir = src/gtm
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+
+WANTED_DIRS=common path libpq client main proxy gtm_ctl
+
+# None of these targets names a file.
+.PHONY: all clobber clean distclean maintainer-clean install installdirs uninstall
+
+# Visit the directories in WANTED_DIRS order and stop at the first failure.
+all clobber clean:
+	@for dir in $(WANTED_DIRS); do \
+		$(MAKE) -C $$dir $@ || exit 1; \
+	done
+
+distclean: clean
+
+maintainer-clean: distclean
+
+# Create the target directory first; install does not do it on its own.
+installdirs:
+	$(mkinstalldirs) '$(DESTDIR)$(bindir)'
+
+install: all installdirs
+	$(INSTALL_PROGRAM) ./main/gtm$(X) '$(DESTDIR)$(bindir)/gtm$(X)'
+	$(INSTALL_PROGRAM) ./gtm_ctl/gtm_ctl$(X) '$(DESTDIR)$(bindir)/gtm_ctl$(X)'
+	$(INSTALL_PROGRAM) ./proxy/gtm_proxy$(X) '$(DESTDIR)$(bindir)/gtm_proxy$(X)'
+
+# Quote the paths as install does, so a prefix containing spaces still works.
+uninstall:
+	rm -f '$(DESTDIR)$(bindir)/gtm$(X)'
+	rm -f '$(DESTDIR)$(bindir)/gtm_ctl$(X)'
+	rm -f '$(DESTDIR)$(bindir)/gtm_proxy$(X)'
diff --git a/src/gtm/Makefile.global b/src/gtm/Makefile.global
new file mode 100644
index 0000000000..f130bdbd7f
--- /dev/null
+++ b/src/gtm/Makefile.global
@@ -0,0 +1,116 @@
+
+##########################################################################
+#
+# Meta configuration
+# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+# The targets below are declared phony; installdirs recipes are not echoed.
+.PHONY: all install install-strip installdirs uninstall clean distclean check installcheck
+.SILENT: installdirs
+
+# make `all' the default target
+all:
+
+# Delete target files if the command fails after it has
+# started to update the file.
+.DELETE_ON_ERROR:
+
+# PostgreSQL version number (NOTE(review): presumably GTM's own version -- confirm)
+VERSION = 1.0Beta
+MAJORVERSION = 1.0
+# top_build_dir (not top_builddir) is never assigned here; the includer is expected to set it
+top_srcdir=$(top_build_dir)
+
+enable_shared   = yes
+
+##########################################################################
+#
+# Programs and flags
+
+# Compilers
+
+CPP = gcc -E
+CPPFLAGS =  -D_GNU_SOURCE 
+
+override CPPFLAGS := -I$(top_srcdir)/include $(CPPFLAGS)
+
+CC = gcc
+GCC = yes
+SUN_STUDIO_CC = no
+CFLAGS = $(DEBUGFLAGS) -Wall -Wmissing-prototypes -Wpointer-arith -Wdeclaration-after-statement -Wendif-labels -fno-strict-aliasing -fwrapv
+
+# Kind-of compilers
+
+BISON = bison
+BISONFLAGS =  $(YFLAGS)
+FLEX = /usr/bin/flex
+FLEXFLAGS =  $(LFLAGS)
+DTRACE = 
+DTRACEFLAGS = 
+ZIC = 
+
+# Linking
+
+AR = ar
+DLLTOOL = 
+DLLWRAP = 
+LIBS = -lz -lreadline -lcrypt -ldl -lm -lpthread
+LDAP_LIBS_FE = 
+LDAP_LIBS_BE = 
+OSSP_UUID_LIBS = 
+LD = /usr/bin/ld
+with_gnu_ld = yes
+ld_R_works = 
+LDFLAGS =   -Wl,--as-needed
+LDFLAGS_SL = 
+LDREL = -r
+LDOUT = -o
+RANLIB = ranlib
+WINDRES = 
+X = 
+
+# Perl 
+
+# quoted for pathname with spaces
+PERL			= "/usr/bin/perl"
+perl_archlibexp		= 
+perl_privlibexp		= 
+perl_useshrplib		= 
+perl_embed_ldflags	= 
+
+# Miscellaneous
+
+AWK	= gawk
+LN_S	= ln -s
+MSGFMT  = 
+MSGMERGE = 
+PYTHON	= 
+TAR	= /bin/tar
+XGETTEXT = 
+
+GZIP	= gzip
+BZIP2	= bzip2
+
+PL_TESTDB = pl_regression
+CONTRIB_TESTDB = contrib_regression
+
+
+
+##########################################################################
+#
+# Additional platform-specific settings
+#
+
+# Name of the "template"
+PORTNAME= linux
+
+
+# Set up rpath if enabled.  By default it will point to our libdir,
+# but individual Makefiles can force other rpath paths if needed.
+rpathdir = $(libdir)
+# rpath is defined in Makefile.port (included below); enable_rpath is not set in this file.
+ifeq ($(enable_rpath), yes)
+LDFLAGS += $(rpath)
+endif
+# top_build_dir is never assigned in this file -- NOTE(review): confirm every includer sets it.
+include $(top_build_dir)/gtm/Makefile.port
+
diff --git a/src/gtm/Makefile.port b/src/gtm/Makefile.port
new file mode 100644
index 0000000000..611c8b7766
--- /dev/null
+++ b/src/gtm/Makefile.port
@@ -0,0 +1,16 @@
+# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+# Port-specific settings; src/gtm/Makefile.global includes this file as its last step.
+AROPT = crs
+export_dynamic = -Wl,-E
+rpath = -Wl,-rpath,'$(rpathdir)'
+allow_nonpic_in_shlib = yes
+DLSUFFIX = .so
+# Position-independent code flag: -fPIC when host_cpu contains "sparc", otherwise -fpic.
+ifeq "$(findstring sparc,$(host_cpu))" "sparc"
+CFLAGS_SL = -fPIC
+else
+CFLAGS_SL = -fpic
+endif
+# Link one object file into a shared object of the same base name (CFLAGS_SL is not passed here).
+%.so: %.o
+	$(CC) $(CFLAGS) -shared -o $@ $<
diff --git a/src/gtm/Makefile.shlib b/src/gtm/Makefile.shlib
new file mode 100644
index 0000000000..83aca3896b
--- /dev/null
+++ b/src/gtm/Makefile.shlib
@@ -0,0 +1,556 @@
+#-------------------------------------------------------------------------
+#
+# Makefile.shlib
+#    Common rules for building shared libraries
+#
+# Copyright (c) 1998, Regents of the University of California
+# Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+#
+# IDENTIFICATION
+#    $PostgreSQL: pgsql/src/Makefile.shlib,v 1.119 2008/12/11 07:34:07 petere Exp $
+#
+#-------------------------------------------------------------------------
+
+# This file should be included by any Postgres module Makefile that
+# wants to build a shared library (if possible for the current
+# platform). A static library is also built from the same object
+# files. Only one library can be built per makefile.
+#
+# Before including this file, the module Makefile must define these
+# variables:
+#
+# NAME                  Name of library to build (no suffix nor "lib" prefix)
+# OBJS                  List of object files to include in library
+# SHLIB_LINK            If shared library relies on other libraries,
+#                       additional stuff to put in its link command
+# SHLIB_EXPORTS         (optional) Name of file containing list of symbols to
+#                       export
+#
+# When building a shared library, the following version information
+# must also be set.  It should be omitted when building a dynamically
+# loadable module.
+#
+# SO_MAJOR_VERSION      Major version number to use for shared library
+# SO_MINOR_VERSION      Minor version number to use for shared library
+# (If you want a patchlevel, include it in SO_MINOR_VERSION, e.g., "6.2".)
+#
+# Optional flags when building DLL's (only applicable to win32 and cygwin
+# platforms).
+# DLLTOOL_DEFFLAGS      Additional flags when creating the dll .def file
+# DLLTOOL_LIBFLAGS      Additional flags when creating the lib<module>.a file
+# DLLWRAP_FLAGS         Additional flags to dllwrap
+#
+# The module Makefile must also include
+# $(top_builddir)/src/Makefile.global before including this file.
+# (Makefile.global sets PORTNAME and other needed symbols.)
+#
+# This makefile provides the following (phony) targets:
+#
+# all-lib               build the static and shared (if applicable) libraries
+# install-lib           install the libraries into $(libdir)
+# installdirs-lib       create installation directory $(libdir)
+# uninstall-lib         remove the libraries from $(libdir)
+# clean-lib             delete the static and shared libraries from the build dir
+# maintainer-clean-lib  delete .def files built for win32
+#
+# Since `all-lib' is the first rule in this file you probably want to
+# have the `all' target before including this file. In the most simple
+# case it would look like this:
+#
+#     all: all-lib
+#
+# Similarly, the install rule might look like
+#
+#     install: install-lib
+#
+# plus any additional things you want to install. Et cetera.
+#
+# Got that?  Look at src/interfaces/libpq/Makefile for an example.
+#
+# While the linker allows creation of most shared libraries,
+# -Bsymbolic requires resolution of all symbols, making the
+# compiler a better choice for shared library creation on ELF platforms.
+# With the linker, -Bsymbolic requires the crt1.o startup object file.
+# bjm 2001-02-10
+
+
+COMPILER = $(CC) $(CFLAGS)
+LINK.static = $(AR) $(AROPT)
+
+
+
+# Insert -L from LDFLAGS after any -L already present in SHLIB_LINK; whatever in SHLIB_LINK is not a -L switch follows both groups
+SHLIB_LINK := $(filter -L%, $(SHLIB_LINK)) $(filter -L%, $(LDFLAGS)) $(filter-out -L%, $(SHLIB_LINK))
+
+# Need a -L-free version of LDFLAGS to use in combination with SHLIB_LINK (which now carries the -L switches of LDFLAGS, see above)
+LDFLAGS_NO_L = $(filter-out -L%, $(LDFLAGS))
+
+ifdef SO_MAJOR_VERSION
+# Default library naming convention used by the majority of platforms: fully versioned, major-only and unversioned names
+ifeq ($(enable_shared), yes)
+shlib		= lib$(NAME)$(DLSUFFIX).$(SO_MAJOR_VERSION).$(SO_MINOR_VERSION)
+shlib_major	= lib$(NAME)$(DLSUFFIX).$(SO_MAJOR_VERSION)
+shlib_bare	= lib$(NAME)$(DLSUFFIX)
+endif
+# Testing the soname variable is a reliable way to determine whether a
+# linkable library is being built: it is assigned only in this SO_MAJOR_VERSION branch.
+soname		= $(shlib_major)
+else
+# Naming convention for dynamically loadable modules: $(NAME) plus $(DLSUFFIX), with no "lib" prefix and no version
+ifeq ($(enable_shared), yes)
+shlib		= $(NAME)$(DLSUFFIX)
+endif
+endif # SO_MAJOR_VERSION
+stlib		= lib$(NAME).a
+
+ifndef soname
+# additional flags for backend modules (prepended, so they come before everything already in SHLIB_LINK)
+SHLIB_LINK := $(BE_DLLLIBS) $(SHLIB_LINK)
+endif # not soname
+
+# For each platform we support shared libraries on, the matching
+# $(PORTNAME) section below sets shlib to the name of the library (if the
+# default above is not right), sets LINK.shared to the command that links
+# the library, and adjusts SHLIB_LINK if necessary.
+
+# Try to keep the sections in some kind of order, folks...  (The "override" lines that follow append even when the variable was set on the make command line.)
+
+override CFLAGS += $(CFLAGS_SL)
+ifdef SO_MAJOR_VERSION
+# libraries ought to use this to refer to versioned gettext domain names
+override CPPFLAGS += -DSO_MAJOR_VERSION=$(SO_MAJOR_VERSION)
+endif # SO_MAJOR_VERSION
+
+ifeq ($(PORTNAME), aix)
+  ifdef SO_MAJOR_VERSION
+    shlib		= lib$(NAME)$(DLSUFFIX).$(SO_MAJOR_VERSION)
+  endif
+  haslibarule   = yes
+  exports_file		= lib$(NAME).exp
+endif # PORTNAME == aix
+
+ifeq ($(PORTNAME), darwin)
+  ifdef soname
+    # linkable library: .dylib suffix, with the version numbers placed before the suffix
+    DLSUFFIX		= .dylib
+    ifneq ($(SO_MAJOR_VERSION), 0)
+      version_link	= -compatibility_version $(SO_MAJOR_VERSION) -current_version $(SO_MAJOR_VERSION).$(SO_MINOR_VERSION)
+    endif
+    LINK.shared		= $(COMPILER) -dynamiclib -install_name $(libdir)/lib$(NAME).$(SO_MAJOR_VERSION)$(DLSUFFIX) $(version_link) $(exported_symbols_list) -multiply_defined suppress
+    shlib		= lib$(NAME).$(SO_MAJOR_VERSION).$(SO_MINOR_VERSION)$(DLSUFFIX)
+    shlib_major		= lib$(NAME).$(SO_MAJOR_VERSION)$(DLSUFFIX)
+  else
+    # loadable module: .so suffix, linked with -bundle
+    DLSUFFIX		= .so
+    LINK.shared		= $(COMPILER) -bundle -multiply_defined suppress
+  endif
+  BUILD.exports		= $(AWK) '/^[^\#]/ {printf "_%s\n",$$1}' $< >$@
+  exports_file		= $(SHLIB_EXPORTS:%.txt=%.list)
+  ifneq (,$(exports_file))
+    exported_symbols_list = -exported_symbols_list $(exports_file)
+  endif
+endif # PORTNAME == darwin
+
+ifeq ($(PORTNAME), openbsd)
+  ifdef ELF_SYSTEM
+    LINK.shared		= $(COMPILER) -shared
+    ifdef soname
+      LINK.shared	+= -Wl,-x,-soname,$(soname)
+    endif
+    SHLIB_LINK		+= -lc
+  else
+    LINK.shared		= $(LD) -x -Bshareable -Bforcearchive
+  endif
+endif # PORTNAME == openbsd
+
+ifeq ($(PORTNAME), bsdi)
+  ifeq ($(DLSUFFIX), .so)
+    LINK.shared		= $(COMPILER) -shared
+    ifdef soname
+      LINK.shared	+= -Wl,-x,-soname,$(soname)
+    endif
+    SHLIB_LINK		+= -lc
+  endif
+  ifeq ($(DLSUFFIX), .o)
+    LINK.shared		= shlicc -O $(LDREL)
+  endif
+endif # PORTNAME == bsdi
+
+ifeq ($(PORTNAME), freebsd)
+  ifdef ELF_SYSTEM
+    ifdef SO_MAJOR_VERSION
+      shlib		= lib$(NAME)$(DLSUFFIX).$(SO_MAJOR_VERSION)
+    endif
+    LINK.shared		= $(COMPILER) -shared
+    ifdef soname
+      LINK.shared	+= -Wl,-x,-soname,$(soname)
+    endif
+  else
+    ifdef SO_MAJOR_VERSION
+      shlib		= lib$(NAME)$(DLSUFFIX).$(SO_MAJOR_VERSION).$(SO_MINOR_VERSION)
+    endif
+    LINK.shared		= $(LD) -x -Bshareable -Bforcearchive
+  endif
+endif # PORTNAME == freebsd
+
+ifeq ($(PORTNAME), netbsd)
+  ifdef ELF_SYSTEM
+    LINK.shared		= $(COMPILER) -shared
+    ifdef soname
+      LINK.shared	+= -Wl,-x,-soname,$(soname)
+    endif
+  else
+    LINK.shared		= $(LD) -x -Bshareable -Bforcearchive
+  endif
+endif # PORTNAME == netbsd
+
+ifeq ($(PORTNAME), hpux)
+  ifdef SO_MAJOR_VERSION
+    shlib			= lib$(NAME)$(DLSUFFIX).$(SO_MAJOR_VERSION)
+  endif
+  ifeq ($(with_gnu_ld), yes)
+    LINK.shared		= $(CC) $(LDFLAGS_NO_L) -shared
+    ifdef soname
+      LINK.shared	+= -Wl,-h -Wl,$(soname)
+    endif
+  else
+    # can't use the CC-syntax rpath pattern here, so rpath is emptied and +b is passed to $(LD) below instead
+    rpath =
+    LINK.shared		= $(LD) -b
+    ifdef soname
+      LINK.shared	+= +h $(soname)
+    endif
+    ifeq ($(enable_rpath), yes)
+      LINK.shared	+= +b '$(rpathdir)'
+    endif
+    # On HPUX platforms, gcc is usually configured to search for libraries
+    # in /usr/local/lib, but ld won't do so.  Add an explicit -L switch so
+    # ld can find the same libraries gcc does.  Make sure it goes after any
+    # -L switches provided explicitly.
+    ifeq ($(GCC), yes)
+      SHLIB_LINK := $(filter -L%, $(SHLIB_LINK)) -L/usr/local/lib $(filter-out -L%, $(SHLIB_LINK))
+    endif
+  endif
+  # do this last so above filtering doesn't pull out -L switches in LDFLAGS (the backquoted command is left for the shell to run at link time)
+  ifeq ($(GCC), yes)
+    SHLIB_LINK		+= `$(CC) $(LDFLAGS) -print-libgcc-file-name`
+  endif
+endif # PORTNAME == hpux
+
+ifeq ($(PORTNAME), irix)
+  ifdef SO_MAJOR_VERSION
+    shlib		= lib$(NAME)$(DLSUFFIX).$(SO_MAJOR_VERSION)
+  endif
+  LINK.shared		= $(COMPILER) -shared
+  ifdef soname
+    LINK.shared		+= -Wl,-set_version,sgi$(SO_MAJOR_VERSION).$(SO_MINOR_VERSION)
+  endif
+endif # PORTNAME == irix
+
+ifeq ($(PORTNAME), linux)
+  LINK.shared		= $(COMPILER) -shared
+  ifdef soname
+    LINK.shared		+= -Wl,-soname,$(soname)
+  endif
+  BUILD.exports		= ( echo '{ global:'; $(AWK) '/^[^\#]/ {printf "%s;\n",$$1}' $<; echo ' local: *; };' ) >$@
+  exports_file		= $(SHLIB_EXPORTS:%.txt=%.list)
+  ifneq (,$(exports_file))
+    LINK.shared		+= -Wl,--version-script=$(exports_file)
+  endif
+endif # PORTNAME == linux
+
+ifeq ($(PORTNAME), solaris)
+  ifeq ($(GCC), yes)
+    LINK.shared		= $(COMPILER) -shared
+  else
+    LINK.shared		= $(COMPILER) -G
+  endif
+  ifdef soname
+    ifeq ($(with_gnu_ld), yes)
+      LINK.shared	+= -Wl,-soname,$(soname)
+    else
+      LINK.shared	+= -h $(soname)
+    endif
+  endif
+endif # PORTNAME == solaris
+
+ifeq ($(PORTNAME), sunos4)
+  LINK.shared		= $(LD) -assert pure-text -Bdynamic
+endif # PORTNAME == sunos4
+ 
+ifeq ($(PORTNAME), osf)
+  LINK.shared		= $(LD) -shared -expect_unresolved '*'
+endif # PORTNAME == osf
+
+ifeq ($(PORTNAME), sco)
+  # The link is driven through $(CC): -shared under GCC, -G otherwise.
+  ifeq ($(GCC), yes)
+    LINK.shared		= $(CC) -shared
+  else
+    LINK.shared		= $(CC) -G
+  endif
+  # Appended for both compilers.
+  LINK.shared		+= -Wl,-z,text
+  # A soname is recorded only when a linkable library is being built.
+  ifdef soname
+    LINK.shared		+= -Wl,-h,$(soname)
+  endif
+endif # PORTNAME == sco
+
+# SVR4: shared objects are produced directly by $(LD).
+ifeq ($(PORTNAME), svr4)
+  LINK.shared		= $(LD) -G
+endif
+
+# Univel: also linked by $(LD), with -z text added.
+ifeq ($(PORTNAME), univel)
+  LINK.shared		= $(LD) -G -z text
+endif
+
+# UnixWare: linked through $(CC); -G unless the compiler is GCC, which
+# takes -shared.  -Wl,-z,text is appended in both cases.
+ifeq ($(PORTNAME), unixware)
+  ifneq ($(GCC), yes)
+    LINK.shared		= $(CC) -G
+  else
+    LINK.shared		= $(CC) -shared
+  endif
+  LINK.shared		+= -Wl,-z,text
+  ifdef soname
+    LINK.shared		+= -Wl,-h,$(soname)
+  endif
+endif
+
+# Cygwin: versioned libraries are named cyg$(NAME) without version digits.
+# haslibarule switches off the generic $(stlib) rule in the BUILD section.
+ifeq ($(PORTNAME), cygwin)
+  haslibarule   = yes
+  ifdef SO_MAJOR_VERSION
+    shlib		= cyg$(NAME)$(DLSUFFIX)
+  endif
+endif
+
+# Win32: versioned libraries are named lib$(NAME) without version digits.
+# haslibarule switches off the generic $(stlib) rule in the BUILD section.
+ifeq ($(PORTNAME), win32)
+  haslibarule   = yes
+  ifdef SO_MAJOR_VERSION
+    shlib		= lib$(NAME)$(DLSUFFIX)
+  endif
+endif
+
+# With rpath enabled, the port's rpath switch goes at the end of SHLIB_LINK.
+ifeq ($(enable_rpath), yes)
+SHLIB_LINK += $(rpath)
+endif
+
+
+
+##
+## BUILD
+##
+
+.PHONY: all-lib all-static-lib all-shared-lib
+
+all-lib: all-shared-lib
+ifdef soname
+# no static library when building a dynamically loadable module, so all-static-lib is added only when soname is set
+all-lib: all-static-lib
+endif # soname
+
+all-static-lib: $(stlib)
+
+all-shared-lib: $(shlib)
+
+ifndef haslibarule
+$(stlib): $(OBJS)
+	$(LINK.static) $@ $^
+	$(RANLIB) $@
+endif #haslibarule
+
+ifeq ($(enable_shared), yes)
+
+ifeq (,$(filter cygwin win32,$(PORTNAME)))
+ifneq ($(PORTNAME), aix)
+
+# Normal case (PORTNAME is none of aix, cygwin, win32): link $(OBJS) with $(LINK.shared), then create the version-less symlinks
+$(shlib): $(OBJS)
+	$(LINK.shared) $(LDFLAGS_SL) $(OBJS) $(SHLIB_LINK) -o $@
+ifdef shlib_major
+# If we're using major and minor versions, then make a symlink to major-version-only.
+ifneq ($(shlib), $(shlib_major))
+	rm -f $(shlib_major)
+	$(LN_S) $(shlib) $(shlib_major)
+endif
+# Make sure we have a link to a name without any version numbers
+ifneq ($(shlib), $(shlib_bare))
+	rm -f $(shlib_bare)
+	$(LN_S) $(shlib) $(shlib_bare)
+endif
+endif # shlib_major
+
+# Where possible, restrict the symbols exported by the library to just the
+# official list, so as to avoid unintentional ABI changes.  On recent Darwin
+# this also quiets multiply-defined-symbol warnings in programs that use
+# libpgport along with libpq.  (Only platforms that set BUILD.exports above take part.)
+ifneq (,$(SHLIB_EXPORTS))
+ifdef BUILD.exports
+$(shlib): $(SHLIB_EXPORTS:%.txt=%.list)
+
+$(SHLIB_EXPORTS:%.txt=%.list): %.list: %.txt
+	$(BUILD.exports)
+endif # BUILD.exports
+endif # SHLIB_EXPORTS
+
+else # PORTNAME == aix
+
+# AIX case: archive the objects, derive the export list from the archive, link $(shlib) from it, then rebuild $(stlib) so that it contains $(shlib).  NOTE(review): one recipe is attached to two targets, so a parallel make may run it twice -- confirm.
+$(shlib) $(stlib): $(OBJS)
+	$(LINK.static) $(stlib) $^
+	$(RANLIB) $(stlib)
+	$(MKLDEXPORT) $(stlib) >$(exports_file)
+	$(COMPILER) $(LDFLAGS_NO_L) $(LDFLAGS_SL) -o $(shlib) $(stlib) -Wl,-bE:$(exports_file) $(SHLIB_LINK)
+	rm -f $(stlib)
+	$(AR) $(AROPT) $(stlib) $(shlib)
+
+endif # PORTNAME == aix
+
+else # PORTNAME == cygwin || PORTNAME == win32
+
+# Cygwin or Win32 case: the DLL is made by $(DLLWRAP) from a .def file, and $(stlib) is the import library $(DLLTOOL) generates for that DLL
+
+# If SHLIB_EXPORTS is set, the rules below will build a .def file from
+# that.  Else we build a temporary one here.
+ifeq (,$(SHLIB_EXPORTS))
+DLL_DEFFILE = lib$(NAME)dll.def
+exports_file = $(DLL_DEFFILE)
+
+$(exports_file): $(OBJS)
+	$(DLLTOOL) --export-all $(DLLTOOL_DEFFLAGS) --output-def $@ $^
+else
+DLL_DEFFILE = $(srcdir)/lib$(NAME)dll.def
+endif # SHLIB_EXPORTS empty
+
+$(shlib): $(OBJS) $(DLL_DEFFILE)
+	$(DLLWRAP) $(LDFLAGS_SL) -o $@ --dllname $(shlib) $(DLLWRAP_FLAGS) --def $(DLL_DEFFILE) $(OBJS) $(SHLIB_LINK)
+
+$(stlib): $(shlib) $(DLL_DEFFILE)
+	$(DLLTOOL) --dllname $(shlib) $(DLLTOOL_LIBFLAGS) --def $(DLL_DEFFILE) --output-lib $@
+
+endif # PORTNAME == cygwin || PORTNAME == win32
+
+endif # enable_shared
+
+
+# We need several not-quite-identical variants of .DEF files to build
+# DLLs for Windows.  These are made from the single source file named by
+# $(SHLIB_EXPORTS).  Since we can't assume that Windows boxes will have
+# sed, the .DEF files are always built (by distprep) and included in
+# distribution tarballs.  Each rule writes a comment/LIBRARY/EXPORTS header and then appends the sed-rewritten export list, dropping lines that start with '#'.
+
+ifneq (,$(SHLIB_EXPORTS))
+distprep: $(srcdir)/lib$(NAME)dll.def $(srcdir)/lib$(NAME)ddll.def $(srcdir)/blib$(NAME)dll.def
+
+UC_NAME = $(shell echo $(NAME) | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
+
+$(srcdir)/lib$(NAME)dll.def: $(SHLIB_EXPORTS)
+	echo '; DEF file for MS VC++' >$@
+	echo 'LIBRARY LIB$(UC_NAME)' >>$@
+	echo 'EXPORTS' >>$@
+	sed -e '/^#/d' -e 's/^\(.* \)\([0-9][0-9]*\)/    \1@ \2/' $< >>$@
+
+$(srcdir)/lib$(NAME)ddll.def: $(SHLIB_EXPORTS)
+	echo '; DEF file for MS VC++' >$@
+	echo 'LIBRARY LIB$(UC_NAME)D' >>$@
+	echo 'EXPORTS' >>$@
+	sed -e '/^#/d' -e 's/^\(.* \)\([0-9][0-9]*\)/    \1@ \2/' $< >>$@
+
+$(srcdir)/blib$(NAME)dll.def: $(SHLIB_EXPORTS)
+	echo '; DEF file for Borland C++ Builder' >$@
+	echo 'LIBRARY BLIB$(UC_NAME)' >>$@
+	echo 'EXPORTS' >>$@
+	sed -e '/^#/d' -e 's/^\(.* \)\([0-9][0-9]*\)/    _\1@ \2/' $< >>$@
+	echo >>$@
+	echo '; Aliases for MS compatible names' >> $@
+	sed -e '/^#/d' -e 's/^\(.* \)\([0-9][0-9]*\)/    \1= _\1/' $< | sed 's/ *$$//' >>$@
+endif # SHLIB_EXPORTS
+
+
+##
+## INSTALL
+##
+
+.PHONY: install-lib install-lib-static install-lib-shared installdirs-lib
+install-lib: install-lib-shared
+ifdef soname
+install-lib: install-lib-static
+endif # soname
+
+install-lib-static: $(stlib) installdirs-lib
+	$(INSTALL_STLIB) $< '$(DESTDIR)$(libdir)/$(stlib)'
+ifeq ($(PORTNAME), darwin)
+	cd '$(DESTDIR)$(libdir)' && \
+	ranlib $(stlib)
+endif # PORTNAME == darwin
+
+ifeq ($(enable_shared), yes)
+install-lib-shared: $(shlib) installdirs-lib
+ifdef soname
+# we don't install $(shlib) on AIX, so there this target only makes sure its prerequisites exist
+# (see http://archives.postgresql.org/message-id/52EF20B2E3209443BC37736D00C3C1380A6E79FE@EXADV1.host.magwien.gv.at)
+ifneq ($(PORTNAME), aix)
+	$(INSTALL_SHLIB) $< '$(DESTDIR)$(libdir)/$(shlib)'
+ifneq ($(PORTNAME), cygwin)
+ifneq ($(PORTNAME), win32)
+ifneq ($(shlib), $(shlib_major))
+	cd '$(DESTDIR)$(libdir)' && \
+	rm -f $(shlib_major) && \
+	$(LN_S) $(shlib) $(shlib_major)
+endif
+ifneq ($(shlib), $(shlib_bare))
+	cd '$(DESTDIR)$(libdir)' && \
+	rm -f $(shlib_bare) && \
+	$(LN_S) $(shlib) $(shlib_bare)
+endif
+endif # not win32
+endif # not cygwin
+endif # not aix
+else # no soname
+	$(INSTALL_SHLIB) $< '$(DESTDIR)$(pkglibdir)/$(shlib)'
+endif # soname
+else # not enable_shared
+ifndef soname
+install-lib-shared:
+	@echo "*****"; \
+	 echo "* Module $(NAME) was not installed due to lack of shared library support."; \
+	 echo "*****"
+endif # no soname
+endif # enable_shared
+
+
+installdirs-lib:
+ifdef soname
+	$(mkinstalldirs) '$(DESTDIR)$(libdir)'
+else
+	$(mkinstalldirs) '$(DESTDIR)$(pkglibdir)'
+endif # soname
+
+
+##
+## UNINSTALL
+##
+
+.PHONY: uninstall-lib
+uninstall-lib:
+ifdef soname
+	rm -f '$(DESTDIR)$(libdir)/$(stlib)'
+ifeq ($(enable_shared), yes)
+	rm -f '$(DESTDIR)$(libdir)/$(shlib_bare)' \
+	  '$(DESTDIR)$(libdir)/$(shlib_major)' \
+	  '$(DESTDIR)$(libdir)/$(shlib)'
+endif # enable_shared
+else # no soname
+	rm -f '$(DESTDIR)$(pkglibdir)/$(shlib)'
+endif # no soname
+
+
+##
+## CLEAN
+##
+
+# Remove every library name this file can produce in the build directory,
+# plus the generated exports file.  Names that are empty on the current
+# platform simply drop out of the rm command line.
+.PHONY: clean-lib
+clean-lib:
+	rm -f $(shlib) $(shlib_bare) $(shlib_major) $(stlib) $(exports_file)
+
+# The .def files are produced by distprep from $(SHLIB_EXPORTS), so they are
+# removed only by maintainer-clean-lib.  The target is declared phony, like
+# the other *-lib targets, so that a stray file of the same name cannot
+# mask it.
+ifneq (,$(SHLIB_EXPORTS))
+.PHONY: maintainer-clean-lib
+maintainer-clean-lib:
+	rm -f $(srcdir)/lib$(NAME)dll.def $(srcdir)/lib$(NAME)ddll.def $(srcdir)/blib$(NAME)dll.def
+endif # SHLIB_EXPORTS
diff --git a/src/gtm/README b/src/gtm/README
new file mode 100644
index 0000000000..77cff3695b
--- /dev/null
+++ b/src/gtm/README
@@ -0,0 +1,61 @@
+
+Global Transaction Manager (GTM)
+--------------------------------
+
+1. Source code layout:
+----------------------
+
+The server side code is located in the "include", "common" and
+"main" directories. The "include" directory hosts all the header
+files, some of which are also shared by the client.
+
+The "common" directory contains the infrastructure pieces for the
+server such as error reporting, memory management, locking etc.
+Most of the server side logic including message processing,
+transaction management, thread and connection management is hosted
+in the "main" directory.
+
+The client side code is put in the "client" directory including all
+client side infrastructure and test programs.
+
+
+2. Building GTM Server and Clients:
+-----------------------------------
+
+Go to the top level directory (where this README is located) and run
+the make command to build the sources.
+
+$ make
+
+This would build the GTM server in the "main" directory and client
+libraries in the "client" directory.
+
+You may want to change the following two defines in main/main.c:
+
+#define GTM_DEFAULT_HOSTNAME    "localhost"
+#define GTM_DEFAULT_PORT        6666
+
+
+3. Running the GTM Server:
+---------------------------
+
+You can run the GTM server by running the following command from the
+top level directory.
+
+$ ./main/gtm
+
+The server will start listening on port 6666 for incoming connections.
+
+
+4. Building test clients:
+-------------------------
+
+Go to the "client/test" directory and run make to build the test clients.
+
+$ cd client/test
+$ make
+
+This would build various test clients, statically linking to the libgtmclient.a
+library in the client directory. You may need to change the connect string
+appropriately to connect to the GTM server.
+
diff --git a/src/gtm/client/Makefile b/src/gtm/client/Makefile
new file mode 100644
index 0000000000..216adf2207
--- /dev/null
+++ b/src/gtm/client/Makefile
@@ -0,0 +1,26 @@
+# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+
+# Builds the GTM client library through the shared-library rules: with the
+# NAME and version numbers below those rules derive libgtmclient.a and the
+# libgtmclient.so* names that the clean target removes.
+
+top_build_dir=../..
+include $(top_build_dir)/gtm/Makefile.global
+
+
+# Library identity consumed by Makefile.shlib.
+NAME=gtmclient
+SO_MAJOR_VERSION= 1
+SO_MINOR_VERSION= 0
+
+OBJS=fe-misc.o fe-connect.o pqexpbuffer.o ip.o strlcpy.o gtm_client.o fe-protocol.o
+# NOTE(review): these assignments replace the LDFLAGS and LIBS values that
+# Makefile.global set.  With top_build_dir=../.. the -L paths point two
+# levels up from this directory, not at the GTM tree's own "common"
+# directory -- confirm that this is intended.
+LDFLAGS=-L$(top_build_dir)/common -L$(top_build_dir)/libpq
+
+LIBS=-lpthread
+
+# None of these goals names a file; declaring them phony keeps a stray file
+# called "all" or "clean" from turning the goal into a no-op.
+.PHONY: all clean distclean maintainer-clean
+
+all:all-lib
+
+# NOTE(review): Makefile.global is read from $(top_build_dir)/gtm but
+# Makefile.shlib from $(top_build_dir) itself -- confirm which copy is meant.
+include $(top_build_dir)/Makefile.shlib
+
+clean:
+	rm -f $(OBJS)
+	rm -f libgtmclient.a libgtmclient.so libgtmclient.so.1 libgtmclient.so.1.0
+
+distclean: clean
+
+maintainer-clean: distclean
diff --git a/src/gtm/client/fe-connect.c b/src/gtm/client/fe-connect.c
new file mode 100644
index 0000000000..29d8fe4cc5
--- /dev/null
+++ b/src/gtm/client/fe-connect.c
@@ -0,0 +1,1287 @@
+/*-------------------------------------------------------------------------
+ *
+ * fe-connect.c
+ *	  functions related to setting up a connection to the backend
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/src/interfaces/libpq/fe-connect.c,v 1.371 2008/12/15 10:28:21 mha Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "gtm/gtm_c.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <ctype.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "gtm/libpq-fe.h"
+#include "gtm/libpq-int.h"
+
+#include <sys/socket.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+
+#include "gtm/gtm_ip.h"
+#include "gtm/gtm_msg.h"
+
+/* fall back options if they are not specified by arguments or defined
+   by environment variables */
+#define DefaultHost		"localhost"
+
+/* ----------
+ * Definition of the conninfo parameters and their fallback resources.
+ *
+ * GTMPQconninfoOptions[] is a constant static array that we use to initialize
+ * a dynamically allocated working copy.  All the "val" fields in
+ * GTMPQconninfoOptions[] *must* be NULL.	In a working copy, non-null "val"
+ * fields point to malloc'd strings that should be freed when the working
+ * array is freed (see GTMPQconninfoFree).
+ *
+ * Each entry is an option keyword followed by its "val".  connectOptions1()
+ * looks the keywords up by name and copies their values into the GTM_Conn;
+ * the notes beside the entries say how this file uses each value.
+ * ----------
+ */
+static const GTMPQconninfoOption GTMPQconninfoOptions[] = {
+	{"connect_timeout", NULL},	/* atoi()'d by connectGTMComplete() and added to time(NULL) */
+	{"host", NULL},				/* name resolved by connectGTMStart() when hostaddr is not set */
+	{"hostaddr", NULL},			/* resolved with AI_NUMERICHOST; takes precedence over host */
+	{"port", NULL},				/* atoi()'d by connectGTMStart() */
+	{"coordinator_id", NULL},	/* copied as a string into conn->coordinator_id */
+	{"proxy", NULL},			/* atoi()'d into conn->is_proxy; 0 when absent */
+	/* Terminating entry --- MUST BE LAST */
+	{NULL, NULL}
+};
+
+static bool connectOptions1(GTM_Conn *conn, const char *conninfo);
+static int	connectGTMStart(GTM_Conn *conn);
+static int	connectGTMComplete(GTM_Conn *conn);
+static GTM_Conn *makeEmptyGTM_Conn(void);
+static void freeGTM_Conn(GTM_Conn *conn);
+static void closeGTM_Conn(GTM_Conn *conn);
+static GTMPQconninfoOption *conninfo_parse(const char *conninfo,
+			   PQExpBuffer errorMessage, bool use_defaults);
+static char *conninfo_getval(GTMPQconninfoOption *connOptions,
+				const char *keyword);
+
+static int pqPacketSend(GTM_Conn *conn, char packet_type,
+			 const void *buf, size_t buf_len);
+
+/*
+ *		PQconnectGTM
+ *
+ * Blocking connect: start the connection with PQconnectGTMStart() and, if
+ * that did not already fail, wait in connectGTMComplete() until it is
+ * finished.
+ *
+ * Returns whatever PQconnectGTMStart() returned, which can be NULL.  Check
+ * the status field of a non-NULL result to find out whether the connection
+ * was established.
+ */
+GTM_Conn *
+PQconnectGTM(const char *conninfo)
+{
+	GTM_Conn	   *result;
+
+	result = PQconnectGTMStart(conninfo);
+	if (result == NULL || result->status == CONNECTION_BAD)
+		return result;
+
+	/* a failure here is recorded in result->status, so the code is unused */
+	(void) connectGTMComplete(result);
+
+	return result;
+}
+
+/*
+ *		PQconnectGTMStart
+ *
+ * Begin a connection without waiting for it to complete.
+ *
+ * A NULL result means the GTM_Conn itself could not be allocated, and there
+ * is nothing further to do with this connection.  Otherwise inspect the
+ * status field of the result:
+ *
+ *	CONNECTION_BAD	an error occurred.  Call GTMPQfinish on the result
+ *					(perhaps after looking at the error message); other
+ *					fields of the structure may not be valid.
+ *	anything else	this stage succeeded.  Keep calling GTMPQconnectPoll,
+ *					using select(2) to find out when that is necessary.
+ *
+ * See GTMPQconnectPoll for more info.
+ */
+GTM_Conn *
+PQconnectGTMStart(const char *conninfo)
+{
+	GTM_Conn	   *newconn = makeEmptyGTM_Conn();
+
+	if (newconn == NULL)
+		return NULL;
+
+	/*
+	 * Parse the conninfo string.  When that fails the conn is handed back
+	 * untouched, carrying whatever connectOptions1() recorded in it.
+	 */
+	if (connectOptions1(newconn, conninfo))
+	{
+		/*
+		 * Start connecting; force the status in case connectGTMStart()
+		 * reported failure without setting it.
+		 */
+		if (!connectGTMStart(newconn))
+			newconn->status = CONNECTION_BAD;
+	}
+
+	return newconn;
+}
+
+/*
+ *		copyConnOption
+ *
+ * Store in *dest a malloc'd copy of the value conninfo_getval() finds for
+ * "keyword", or NULL when it finds none.
+ *
+ * Returns false only when a value was present and strdup() could not copy it.
+ */
+static bool
+copyConnOption(GTMPQconninfoOption *connOptions, const char *keyword,
+			   char **dest)
+{
+	char	   *val = conninfo_getval(connOptions, keyword);
+
+	*dest = val ? strdup(val) : NULL;
+
+	return (val == NULL || *dest != NULL);
+}
+
+/*
+ *		connectOptions1
+ *
+ * Internal subroutine to set up connection parameters given an already-
+ * created GTM_Conn and a conninfo string.
+ *
+ * Returns true if OK, false if trouble (in which case errorMessage is set
+ * and so is conn->status).  Trouble is either a conninfo string that
+ * conninfo_parse() rejects or a failure to copy one of the option values.
+ */
+static bool
+connectOptions1(GTM_Conn *conn, const char *conninfo)
+{
+	GTMPQconninfoOption *connOptions;
+	char	   *tmp;
+	bool		copied = true;
+
+	/*
+	 * Parse the conninfo string
+	 */
+	connOptions = conninfo_parse(conninfo, &conn->errorMessage, true);
+	if (connOptions == NULL)
+	{
+		conn->status = CONNECTION_BAD;
+		/* errorMessage is already set */
+		return false;
+	}
+
+	/*
+	 * Move option values into conn structure.  Every field is assigned even
+	 * after an earlier copy has failed, so none of them is left holding a
+	 * stale value; the failure is reported once, below.
+	 */
+	if (!copyConnOption(connOptions, "hostaddr", &conn->pghostaddr))
+		copied = false;
+	if (!copyConnOption(connOptions, "host", &conn->pghost))
+		copied = false;
+	if (!copyConnOption(connOptions, "port", &conn->pgport))
+		copied = false;
+	if (!copyConnOption(connOptions, "connect_timeout", &conn->connect_timeout))
+		copied = false;
+	if (!copyConnOption(connOptions, "coordinator_id", &conn->coordinator_id))
+		copied = false;
+
+	/* "proxy" is kept as a number; it is 0 when the option is absent */
+	tmp = conninfo_getval(connOptions, "proxy");
+	conn->is_proxy = tmp ? atoi(tmp) : 0;
+
+	/*
+	 * Free the option info - all is in conn now
+	 */
+	GTMPQconninfoFree(connOptions);
+
+	if (!copied)
+	{
+		/*
+		 * Without this check a failed strdup() would look exactly like an
+		 * option that was never given, and e.g. a lost "host" would quietly
+		 * turn into a connection to the default host.
+		 */
+		appendGTMPQExpBuffer(&conn->errorMessage, "out of memory\n");
+		conn->status = CONNECTION_BAD;
+		return false;
+	}
+
+	return true;
+}
+
+
+/* ----------
+ * connectNoDelay -
+ * Turn on TCP_NODELAY for conn->sock.
+ *
+ * Returns 1 on success and 0 on failure, in which case a message has been
+ * appended to conn->errorMessage.  Where TCP_NODELAY is not defined the
+ * function does nothing and returns 1.
+ * ----------
+ */
+static int
+connectNoDelay(GTM_Conn *conn)
+{
+#ifdef	TCP_NODELAY
+	int			enable = 1;
+	int			rc;
+
+	rc = setsockopt(conn->sock, IPPROTO_TCP, TCP_NODELAY,
+					(char *) &enable, sizeof(enable));
+	if (rc < 0)
+	{
+		appendGTMPQExpBuffer(&conn->errorMessage,
+			"could not set socket to TCP no delay mode: \n");
+		return 0;
+	}
+#endif
+
+	return 1;
+}
+
+
+/* ----------
+ * connectFailureMessage -
+ * create a friendly error message on connection failure.
+ *
+ * The message names the host (hostaddr if set, else host) and the port the
+ * connection was attempted on; either one is shown as "???" when the conn
+ * does not have it.  "errorno" is accepted but is not part of the message.
+ * ----------
+ */
+static void
+connectFailureMessage(GTM_Conn *conn, int errorno)
+{
+	const char *host;
+	const char *port;
+
+	(void) errorno;				/* not reported, see above */
+
+	if (conn->pghostaddr)
+		host = conn->pghostaddr;
+	else if (conn->pghost)
+		host = conn->pghost;
+	else
+		host = "???";
+
+	/* pgport is NULL when no port option was given; never pass that to %s */
+	port = conn->pgport ? conn->pgport : "???";
+
+	appendGTMPQExpBuffer(&conn->errorMessage,
+						 "could not connect to server: \n"
+						 "\tIs the server running on host \"%s\" and accepting\n"
+						 "\tTCP/IP connections on port %s?\n",
+						 host, port);
+}
+
+
+/* ----------
+ * connectGTMStart -
+ *		Begin the process of making a connection to the backend.
+ *
+ * Resolves the target address (hostaddr if given, else host, else
+ * "localhost") together with the configured port, stores the resulting
+ * address list in the conn and runs GTMPQconnectPoll() once.
+ *
+ * Returns 1 if successful, 0 if not.  On failure conn->status is
+ * CONNECTION_BAD, conn->sock has been closed if it was open, and for the
+ * failures detected here a message has been appended to conn->errorMessage.
+ * ----------
+ */
+static int
+connectGTMStart(GTM_Conn *conn)
+{
+	int			portnum;
+	char		portstr[128];
+	struct addrinfo *addrs = NULL;
+	struct addrinfo hint;
+	const char *node;
+	int			ret;
+
+	if (!conn)
+		return 0;
+
+	/* Ensure our buffers are empty */
+	conn->inStart = conn->inCursor = conn->inEnd = 0;
+	conn->outCount = 0;
+
+	/*
+	 * Determine the parameters to pass to gtm_getaddrinfo_all.
+	 */
+
+	/* Initialize hint structure */
+	MemSet(&hint, 0, sizeof(hint));
+	hint.ai_socktype = SOCK_STREAM;
+	hint.ai_family = AF_UNSPEC;
+
+	/*
+	 * Set up port number as a string.  No fallback port is defined in this
+	 * file, so a missing port is reported here rather than being read out
+	 * of an uninitialized variable.
+	 */
+	if (conn->pgport == NULL || conn->pgport[0] == '\0')
+	{
+		appendGTMPQExpBuffer(&conn->errorMessage,
+							 "no port specified for the GTM connection\n");
+		goto connect_errReturn;
+	}
+	portnum = atoi(conn->pgport);
+	snprintf(portstr, sizeof(portstr), "%d", portnum);
+
+	if (conn->pghostaddr != NULL && conn->pghostaddr[0] != '\0')
+	{
+		/* Using pghostaddr avoids a hostname lookup */
+		node = conn->pghostaddr;
+		hint.ai_flags = AI_NUMERICHOST;
+	}
+	else if (conn->pghost != NULL && conn->pghost[0] != '\0')
+	{
+		/* Using pghost, so we have to look-up the hostname */
+		node = conn->pghost;
+	}
+	else
+	{
+		/* Without Unix sockets, default to localhost instead */
+		node = "localhost";
+	}
+
+	/* Use gtm_getaddrinfo_all() to resolve the address */
+	ret = gtm_getaddrinfo_all(node, portstr, &hint, &addrs);
+	if (ret || !addrs)
+	{
+		/* node is always set above, so only the host-name message applies */
+		appendGTMPQExpBuffer(&conn->errorMessage,
+						  "could not translate host name \"%s\" to address: %s\n",
+						  node, gai_strerror(ret));
+		if (addrs)
+			gtm_freeaddrinfo_all(hint.ai_family, addrs);
+		goto connect_errReturn;
+	}
+
+	/*
+	 * Remember the address list and where we are in it, then mark the
+	 * connection as needing to be made.
+	 */
+	conn->addrlist = addrs;
+	conn->addr_cur = addrs;
+	conn->addrlist_family = hint.ai_family;
+	conn->status = CONNECTION_NEEDED;
+
+	/*
+	 * The code for processing CONNECTION_NEEDED state is in GTMPQconnectPoll(),
+	 * so that it can easily be re-executed if needed again during the
+	 * asynchronous startup process.  However, we must run it once here,
+	 * because callers expect a success return from this routine to mean that
+	 * we are in PGRES_POLLING_WRITING connection state.
+	 */
+	if (GTMPQconnectPoll(conn) == PGRES_POLLING_WRITING)
+		return 1;
+
+connect_errReturn:
+	if (conn->sock >= 0)
+	{
+		close(conn->sock);
+		conn->sock = -1;
+	}
+	conn->status = CONNECTION_BAD;
+	return 0;
+}
+
+
+/*
+ *		connectGTMComplete
+ *
+ * Drive a connection begun by PQconnectGTMStart to completion, blocking
+ * between polls until the socket is ready or the connect_timeout deadline
+ * has passed.
+ *
+ * Returns 1 on success, 0 on failure.  Every failure leaves conn->status at
+ * CONNECTION_BAD; on success the stored error messages are reset.
+ */
+static int
+connectGTMComplete(GTM_Conn *conn)
+{
+	GTMClientPollingStatusType pollstate;
+	time_t		deadline = ((time_t) -1);
+
+	if (conn == NULL || conn->status == CONNECTION_BAD)
+		return 0;
+
+	/*
+	 * Work out the deadline.  It stays at (time_t) -1 when connect_timeout
+	 * is absent or not a positive number.
+	 */
+	if (conn->connect_timeout != NULL)
+	{
+		int			secs = atoi(conn->connect_timeout);
+
+		/*
+		 * Rounding could cause the connection to fail, so a positive
+		 * timeout is never allowed to be shorter than 2 seconds.
+		 */
+		if (secs > 0)
+			deadline = time(NULL) + (secs < 2 ? 2 : secs);
+	}
+
+	/*
+	 * The initial state (just after PQconnectGTMStart) is to wait for the
+	 * socket to become writable.  After each wait the state machine is
+	 * advanced by GTMPQconnectPoll(), whose result says what to wait for
+	 * next.
+	 */
+	for (pollstate = PGRES_POLLING_WRITING;
+		 pollstate != PGRES_POLLING_OK;
+		 pollstate = GTMPQconnectPoll(conn))
+	{
+		int			want_read = (pollstate == PGRES_POLLING_READING);
+		int			want_write = (pollstate == PGRES_POLLING_WRITING);
+
+		/*
+		 * Any state other than reading or writing is a failure (the status
+		 * is forced in case GTMPQconnectPoll did not set it), and so is a
+		 * nonzero result from the wait itself.
+		 */
+		if ((!want_read && !want_write) ||
+			gtmpqWaitTimed(want_read, want_write, conn, deadline))
+		{
+			conn->status = CONNECTION_BAD;
+			return 0;
+		}
+	}
+
+	/* Reset stored error messages since we now have a working connection */
+	resetGTMPQExpBuffer(&conn->errorMessage);
+	return 1;					/* success! */
+}
+
+/* ----------------
+ *		GTMPQconnectPoll
+ *
+ * Poll an asynchronous connection, advancing conn->status step by step.
+ *
+ * Returns a GTMClientPollingStatusType telling the caller whether to wait
+ * (e.g. with select(2)) for the socket to become readable or writable before
+ * polling again, or that the attempt has finished (OK or FAILED).
+ *
+ * You must call GTMPQfinish whether or not this fails.
+ */
+GTMClientPollingStatusType
+GTMPQconnectPoll(GTM_Conn *conn)
+{
+	if (conn == NULL)
+		return PGRES_POLLING_FAILED;
+
+	/* Get the new data */
+	switch (conn->status)
+	{
+			/*
+			 * We really shouldn't have been polled in these two cases, but we
+			 * can handle it.
+			 */
+		case CONNECTION_BAD:
+			return PGRES_POLLING_FAILED;
+		case CONNECTION_OK:
+			return PGRES_POLLING_OK;
+
+			/* These are reading states */
+		case CONNECTION_AWAITING_RESPONSE:
+		case CONNECTION_AUTH_OK:
+			{
+				/* Load waiting data */
+				int			n = gtmpqReadData(conn);
+
+				if (n < 0)
+					goto error_return;
+				if (n == 0)
+					return PGRES_POLLING_READING;
+
+				break;
+			}
+
+			/* These are writing states, so we just proceed. */
+		case CONNECTION_STARTED:
+		case CONNECTION_MADE:
+			break;
+
+		case CONNECTION_NEEDED:
+			break;
+
+		default:
+			appendGTMPQExpBuffer(&conn->errorMessage,
+											"invalid connection state, "
+								 "probably indicative of memory corruption\n"
+											);
+			goto error_return;
+	}
+
+
+keep_going:						/* We will come back to here until there is
+								 * nothing left to do. */
+	switch (conn->status)
+	{
+		case CONNECTION_NEEDED:
+			{
+				/*
+				 * Try to initiate a connection to one of the addresses
+				 * returned by gtm_getaddrinfo_all().  conn->addr_cur is the
+				 * next one to try. We fail when we run out of addresses
+				 * (reporting the error returned for the *last* alternative,
+				 * which may not be what users expect :-().
+				 */
+				while (conn->addr_cur != NULL)
+				{
+					struct addrinfo *addr_cur = conn->addr_cur;
+
+					/* Remember current address for possible error msg */
+					memcpy(&conn->raddr.addr, addr_cur->ai_addr,
+						   addr_cur->ai_addrlen);
+					conn->raddr.salen = addr_cur->ai_addrlen;
+
+					/* Open a socket */
+					conn->sock = socket(addr_cur->ai_family, SOCK_STREAM, 0);
+					if (conn->sock < 0)
+					{
+						/*
+						 * ignore socket() failure if we have more addresses
+						 * to try
+						 */
+						if (addr_cur->ai_next != NULL)
+						{
+							conn->addr_cur = addr_cur->ai_next;
+							continue;
+						}
+						appendGTMPQExpBuffer(&conn->errorMessage,
+							  "could not create socket: \n");
+						break;
+					}
+
+					/*
+					 * Select socket options: no delay of outgoing data for
+					 * TCP sockets, nonblock mode, close-on-exec. Fail if any
+					 * of this fails.
+					 */
+					if (!IS_AF_UNIX(addr_cur->ai_family))
+					{
+						if (!connectNoDelay(conn))
+						{
+							close(conn->sock);
+							conn->sock = -1;
+							conn->addr_cur = addr_cur->ai_next;
+							continue;
+						}
+					}
+
+					/*
+					 * Start/make connection.  This should not block, since we
+					 * are in nonblock mode.  If it does, well, too bad.
+					 */
+					if (connect(conn->sock, addr_cur->ai_addr,
+								addr_cur->ai_addrlen) < 0)
+					{
+						if (SOCK_ERRNO == EINPROGRESS ||
+							SOCK_ERRNO == EWOULDBLOCK ||
+							SOCK_ERRNO == EINTR ||
+							SOCK_ERRNO == 0)
+						{
+							/*
+							 * This is fine - we're in non-blocking mode, and
+							 * the connection is in progress.  Tell caller to
+							 * wait for write-ready on socket.
+							 */
+							conn->status = CONNECTION_STARTED;
+							return PGRES_POLLING_WRITING;
+						}
+						/* otherwise, trouble */
+					}
+					else
+					{
+						/*
+						 * Hm, we're connected already --- seems the "nonblock
+						 * connection" wasn't.  Advance the state machine and
+						 * go do the next stuff.
+						 */
+						conn->status = CONNECTION_STARTED;
+						goto keep_going;
+					}
+
+					/*
+					 * This connection failed --- set up error report, then
+					 * close socket (do it this way in case close() affects
+					 * the value of errno...).	We will ignore the connect()
+					 * failure and keep going if there are more addresses.
+					 */
+					connectFailureMessage(conn, SOCK_ERRNO);
+					if (conn->sock >= 0)
+					{
+						close(conn->sock);
+						conn->sock = -1;
+					}
+
+					/*
+					 * Try the next address, if any.
+					 */
+					conn->addr_cur = addr_cur->ai_next;
+				}				/* loop over addresses */
+
+				/*
+				 * Ooops, no more addresses.  An appropriate error message is
+				 * already set up, so just set the right status.
+				 */
+				goto error_return;
+			}
+
+		case CONNECTION_STARTED:
+			{
+				int			optval;
+				socklen_t	optlen = sizeof(optval);	/* type getsockopt() expects */
+
+				/*
+				 * Write ready, since we've made it here, so the connection
+				 * has been made ... or has failed.
+				 */
+
+				/*
+				 * Now check (using getsockopt) that there is not an error
+				 * state waiting for us on the socket.
+				 */
+
+				if (getsockopt(conn->sock, SOL_SOCKET, SO_ERROR,
+							   (char *) &optval, &optlen) == -1)
+				{
+					appendGTMPQExpBuffer(&conn->errorMessage,
+					libpq_gettext("could not get socket error status: \n"));
+					goto error_return;
+				}
+				else if (optval != 0)
+				{
+					/*
+					 * When using a nonblocking connect, we will typically see
+					 * connect failures at this point, so provide a friendly
+					 * error message.
+					 */
+					connectFailureMessage(conn, optval);
+
+					/*
+					 * If more addresses remain, keep trying, just as in the
+					 * case where connect() returned failure immediately.
+					 */
+					if (conn->addr_cur->ai_next != NULL)
+					{
+						if (conn->sock >= 0)
+						{
+							close(conn->sock);
+							conn->sock = -1;
+						}
+						conn->addr_cur = conn->addr_cur->ai_next;
+						conn->status = CONNECTION_NEEDED;
+						goto keep_going;
+					}
+					goto error_return;
+				}
+
+				/* Fill in the client address */
+				conn->laddr.salen = sizeof(conn->laddr.addr);
+				if (getsockname(conn->sock,
+								(struct sockaddr *) & conn->laddr.addr,
+								&conn->laddr.salen) < 0)
+				{
+					appendGTMPQExpBuffer(&conn->errorMessage,
+									  "could not get client address from socket:\n");
+					goto error_return;
+				}
+
+				/*
+				 * Make sure we can write before advancing to next step.
+				 */
+				conn->status = CONNECTION_MADE;
+				return PGRES_POLLING_WRITING;
+			}
+
+		case CONNECTION_MADE:
+			{
+				GTM_StartupPacket sp;
+
+				/*
+				 * Build a startup packet. We tell the GTM server/proxy our
+				 * coordinator ID and whether we are a proxy or not.
+				 *
+				 * When the connection is made from the proxy, we let the GTM
+				 * server know about it so that some special headers are
+				 * handled correctly by the server.
+				 */
+				sp.sp_cid = atoi(conn->coordinator_id);
+				sp.sp_isproxy = conn->is_proxy;
+
+				/*
+				 * Send the startup packet.
+				 *
+				 * Theoretically, this could block, but it really shouldn't
+				 * since we only got here if the socket is write-ready.
+				 */
+				if (pqPacketSend(conn, 'A', &sp,
+								 sizeof (GTM_StartupPacket)) != STATUS_OK)
+				{
+					appendGTMPQExpBuffer(&conn->errorMessage,
+						"could not send startup packet: \n");
+					goto error_return;
+				}
+
+				conn->status = CONNECTION_AWAITING_RESPONSE;
+				return PGRES_POLLING_READING;
+			}
+
+			/*
+			 * Handle authentication exchange: wait for postmaster messages
+			 * and respond as necessary.
+			 */
+		case CONNECTION_AWAITING_RESPONSE:
+			{
+				char		beresp;
+
+				/*
+				 * Scan the message from current point (note that if we find
+				 * the message is incomplete, we will return without advancing
+				 * inStart, and resume here next time).
+				 */
+				conn->inCursor = conn->inStart;
+
+				/* Read type byte */
+				if (gtmpqGetc(&beresp, conn))
+				{
+					/* We'll come back when there is more data */
+					return PGRES_POLLING_READING;
+				}
+
+				/*
+				 * Validate message type: we expect only an authentication
+				 * request or an error here.  Anything else probably means
+				 * it's not GTM on the other end at all.
+				 */
+				if (!(beresp == 'R' || beresp == 'E'))
+				{
+					appendGTMPQExpBuffer(&conn->errorMessage,
+									  "expected authentication request from "
+												"server, but received %c\n",
+									  beresp);
+					goto error_return;
+				}
+
+
+				/* Handle errors. */
+				if (beresp == 'E')
+				{
+					if (gtmpqGets_append(&conn->errorMessage, conn))
+					{
+						/* We'll come back when there is more data */
+						return PGRES_POLLING_READING;
+					}
+					/* OK, we read the message; mark data consumed */
+					conn->inStart = conn->inCursor;
+					goto error_return;
+				}
+
+				{
+					/* Dummy 4-byte body; if incomplete, retry from inStart */
+					int			tmp_int;
+
+					if (gtmpqGetInt(&tmp_int, 4, conn))
+						return PGRES_POLLING_READING;
+				}
+
+				/*
+				 * OK, we successfully read the message; mark data consumed
+				 */
+				conn->inStart = conn->inCursor;
+
+				/* We are done with authentication exchange */
+				conn->status = CONNECTION_AUTH_OK;
+
+				/* Look to see if we have more data yet. */
+				goto keep_going;
+			}
+
+		case CONNECTION_AUTH_OK:
+			{
+				/* We can release the address list now. */
+				gtm_freeaddrinfo_all(conn->addrlist_family, conn->addrlist);
+				conn->addrlist = NULL;
+				conn->addr_cur = NULL;
+
+				/* Otherwise, we are open for business! */
+				conn->status = CONNECTION_OK;
+				return PGRES_POLLING_OK;
+			}
+
+
+		default:
+			appendGTMPQExpBuffer(&conn->errorMessage,
+											"invalid connection state %d, "
+								 "probably indicative of memory corruption\n"
+											,
+							  (int) conn->status);
+			goto error_return;
+	}
+
+	/* Unreachable */
+
+error_return:
+
+	/*
+	 * We used to close the socket at this point, but that makes it awkward
+	 * for those above us if they wish to remove this socket from their own
+	 * records (an fd_set for example).  We'll just have this socket closed
+	 * when GTMPQfinish is called (which is compulsory even after an error, since
+	 * the connection structure must be freed).
+	 */
+	conn->status = CONNECTION_BAD;
+	return PGRES_POLLING_FAILED;
+}
+
+
+/*
+ * makeEmptyGTM_Conn
+ *	 - create a zeroed GTM_Conn with I/O buffers; returns NULL if out of memory
+ */
+static GTM_Conn *
+makeEmptyGTM_Conn(void)
+{
+	GTM_Conn	   *conn;
+
+	conn = (GTM_Conn *) malloc(sizeof(GTM_Conn));
+	if (conn == NULL)
+		return conn;
+
+	/* Zero all pointers and booleans */
+	MemSet(conn, 0, sizeof(GTM_Conn));
+
+	conn->status = CONNECTION_BAD;
+	conn->sock = -1;			/* no socket yet; 0 would be a valid descriptor */
+
+	/*
+	 * We try to send at least 8K at a time, which is the usual size of pipe
+	 * buffers on Unix systems.  That way, when we are sending a large amount
+	 * of data, we avoid incurring extra kernel context swaps for partial
+	 * bufferloads.  The output buffer is initially made 16K in size, and we
+	 * try to dump it after accumulating 8K.
+	 *
+	 * Likewise, the input buffer is enlarged anytime it has less than 8K
+	 * free, so we initially allocate twice that.
+	 */
+	conn->inBufSize = 16 * 1024;
+	conn->inBuffer = (char *) malloc(conn->inBufSize);
+	conn->outBufSize = 16 * 1024;
+	conn->outBuffer = (char *) malloc(conn->outBufSize);
+	initGTMPQExpBuffer(&conn->errorMessage);
+	initGTMPQExpBuffer(&conn->workBuffer);
+
+	if (conn->inBuffer == NULL ||
+		conn->outBuffer == NULL ||
+		PQExpBufferBroken(&conn->errorMessage) ||
+		PQExpBufferBroken(&conn->workBuffer))
+	{
+		/* out of memory already :-( */
+		freeGTM_Conn(conn);
+		conn = NULL;
+	}
+
+	return conn;
+}
+
+/*
+ * freeGTM_Conn
+ *	 - release everything owned by an idle (closed) GTM_Conn
+ *
+ * Only data held for the whole life of the GTM_Conn is released here:
+ * the connection parameters, the I/O buffers, the message buffers and the
+ * structure itself.  Transient per-connection state is the business of
+ * closeGTM_Conn(); the two routines must not overlap.
+ */
+static void
+freeGTM_Conn(GTM_Conn *conn)
+{
+	/* Connection parameters; free(NULL) is a no-op, so no guards needed. */
+	free(conn->pghost);
+	free(conn->pghostaddr);
+	free(conn->pgport);
+	free(conn->connect_timeout);
+
+	/* Raw I/O buffers. */
+	free(conn->inBuffer);
+	free(conn->outBuffer);
+
+	/* Expansible message buffers. */
+	termGTMPQExpBuffer(&conn->errorMessage);
+	termGTMPQExpBuffer(&conn->workBuffer);
+
+	/* Finally the structure itself. */
+	free(conn);
+}
+
+/*
+ * closeGTM_Conn
+ *	 - properly close a connection to the backend
+ *
+ * This should reset or release all transient state, but NOT the connection
+ * parameters.  On exit, the GTM_Conn should be in condition to start a fresh
+ * connection with the same parameters.
+ */
+static void
+closeGTM_Conn(GTM_Conn *conn)
+{
+	/*
+	 * Note that the protocol doesn't allow us to send Terminate messages
+	 * during the startup phase, hence the CONNECTION_OK test.
+	 */
+	if (conn->sock >= 0 && conn->status == CONNECTION_OK)
+	{
+		/*
+		 * Try to send the 'X' "close connection" message.  Any error is
+		 * ignored: the socket is closed below regardless.
+		 *
+		 * force_len is passed as true so the message carries a length
+		 * word -- presumably the receiver reads one generically (confirm).
+		 */
+		gtmpqPutMsgStart('X', true, conn);
+		gtmpqPutMsgEnd(conn);
+		gtmpqFlush(conn);
+	}
+
+	/*
+	 * Close the connection, reset all transient state, flush I/O buffers.
+	 */
+	if (conn->sock >= 0)
+		close(conn->sock);
+	conn->sock = -1;
+	conn->status = CONNECTION_BAD;		/* Well, not really _bad_ - just
+										 * absent */
+	gtm_freeaddrinfo_all(conn->addrlist_family, conn->addrlist);
+	conn->addrlist = NULL;
+	conn->addr_cur = NULL;
+	conn->inStart = conn->inCursor = conn->inEnd = 0;	/* drop unread input */
+	conn->outCount = 0;			/* drop unsent output */
+}
+
+/*
+ * GTMPQfinish: close the connection (if any) and free the GTM_Conn, which
+ * must not be used again afterwards.  A NULL conn is silently accepted.
+ */
+void
+GTMPQfinish(GTM_Conn *conn)
+{
+	if (conn == NULL)
+		return;
+
+	closeGTM_Conn(conn);
+	freeGTM_Conn(conn);
+}
+
+/*
+ * pqPacketSend() -- build one complete message and push it to the server.
+ *
+ * pack_type: the single-byte message type code; zero means the message is
+ * sent without a type byte.
+ *
+ * buf, buf_len: message payload.  The length covers only what is in buf;
+ * the type byte and the length word are added here.
+ *
+ * RETURNS: STATUS_OK if the message was queued and flushed, STATUS_ERROR as
+ * soon as any step fails.
+ * SIDE_EFFECTS: may block.
+ *
+ * Note: every message sent through this routine carries a length word.
+ */
+static int
+pqPacketSend(GTM_Conn *conn, char pack_type,
+			 const void *buf, size_t buf_len)
+{
+	int			failed;
+
+	/* Start the message; a length word is always requested. */
+	failed = gtmpqPutMsgStart(pack_type, true, conn);
+
+	/* Body and trailer -- each step is skipped once an earlier one failed. */
+	if (!failed)
+		failed = gtmpqPutnchar(buf, buf_len, conn);
+	if (!failed)
+		failed = gtmpqPutMsgEnd(conn);
+
+	/* Flush to ensure the server gets it. */
+	if (!failed)
+		failed = gtmpqFlush(conn);
+
+	/* Any non-zero step result maps to STATUS_ERROR. */
+	return failed ? STATUS_ERROR : STATUS_OK;
+}
+
+
+/*
+ *		GTMPQconninfoParse
+ *
+ * Parse a connection info string and return the resulting array of
+ * connection options, or NULL on failure.  Only options that appear in the
+ * string are filled in; no default values are applied.
+ *
+ * If errmsg isn't NULL, *errmsg is set to NULL on success, or to a malloc'd
+ * message on failure (the caller must free it).  In out-of-memory conditions
+ * both *errmsg and the result could be NULL.
+ *
+ * The returned array must be released with GTMPQconninfoFree().
+ */
+GTMPQconninfoOption *
+GTMPQconninfoParse(const char *conninfo, char **errmsg)
+{
+	PQExpBufferData msgBuf;
+	GTMPQconninfoOption *parsed;
+
+	if (errmsg != NULL)
+		*errmsg = NULL;			/* default */
+
+	initGTMPQExpBuffer(&msgBuf);
+	if (PQExpBufferBroken(&msgBuf))
+		return NULL;			/* out of memory already :-( */
+
+	parsed = conninfo_parse(conninfo, &msgBuf, false);
+	if (parsed != NULL || errmsg == NULL)
+		termGTMPQExpBuffer(&msgBuf);	/* message not wanted or not needed */
+	else
+		*errmsg = msgBuf.data;	/* buffer contents handed to the caller */
+	return parsed;
+}
+
+/*
+ * Conninfo parser routine
+ *
+ * If successful, a malloc'd GTMPQconninfoOption array is returned.
+ * If not successful, NULL is returned and an error message is
+ * left in errorMessage.
+ * NOTE: use_defaults is accepted but never consulted in this function;
+ * only options given explicitly in conninfo are filled in.
+ */
+static GTMPQconninfoOption *
+conninfo_parse(const char *conninfo, PQExpBuffer errorMessage,
+			   bool use_defaults)
+{
+	char	   *pname;
+	char	   *pval;
+	char	   *buf;
+	char	   *cp;
+	char	   *cp2;
+	GTMPQconninfoOption *options;
+	GTMPQconninfoOption *option;
+
+	/* Make a working copy of GTMPQconninfoOptions */
+	options = malloc(sizeof(GTMPQconninfoOptions));
+	if (options == NULL)
+	{
+		printfGTMPQExpBuffer(errorMessage,
+						  libpq_gettext("out of memory\n"));
+		return NULL;
+	}
+	memcpy(options, GTMPQconninfoOptions, sizeof(GTMPQconninfoOptions));
+
+	/* Need a modifiable copy of the input string */
+	if ((buf = strdup(conninfo)) == NULL)
+	{
+		printfGTMPQExpBuffer(errorMessage,
+						  libpq_gettext("out of memory\n"));
+		GTMPQconninfoFree(options);
+		return NULL;
+	}
+	cp = buf;
+
+	while (*cp)
+	{
+		/* Skip blanks before the parameter name */
+		if (isspace((unsigned char) *cp))
+		{
+			cp++;
+			continue;
+		}
+
+		/* Get the parameter name (terminated in place at '=' or blank) */
+		pname = cp;
+		while (*cp)
+		{
+			if (*cp == '=')
+				break;
+			if (isspace((unsigned char) *cp))
+			{
+				*cp++ = '\0';
+				while (*cp)
+				{
+					if (!isspace((unsigned char) *cp))
+						break;
+					cp++;
+				}
+				break;
+			}
+			cp++;
+		}
+
+		/* Check that there is a following '=' */
+		if (*cp != '=')
+		{
+			printfGTMPQExpBuffer(errorMessage,
+							  libpq_gettext("missing \"=\" after \"%s\" in connection info string\n"),
+							  pname);
+			GTMPQconninfoFree(options);
+			free(buf);
+			return NULL;
+		}
+		*cp++ = '\0';
+
+		/* Skip blanks after the '=' */
+		while (*cp)
+		{
+			if (!isspace((unsigned char) *cp))
+				break;
+			cp++;
+		}
+
+		/* Get the parameter value; cp2 writes the unescaped copy in place */
+		pval = cp;
+
+		if (*cp != '\'')
+		{
+			cp2 = pval;
+			while (*cp)
+			{
+				if (isspace((unsigned char) *cp))
+				{
+					*cp++ = '\0';
+					break;
+				}
+				if (*cp == '\\')
+				{
+					cp++;
+					if (*cp != '\0')
+						*cp2++ = *cp++;
+				}
+				else
+					*cp2++ = *cp++;
+			}
+			*cp2 = '\0';
+		}
+		else
+		{
+			cp2 = pval;
+			cp++;
+			for (;;)
+			{
+				if (*cp == '\0')
+				{
+					printfGTMPQExpBuffer(errorMessage,
+									  libpq_gettext("unterminated quoted string in connection info string\n"));
+					GTMPQconninfoFree(options);
+					free(buf);
+					return NULL;
+				}
+				if (*cp == '\\')
+				{
+					cp++;
+					if (*cp != '\0')
+						*cp2++ = *cp++;
+					continue;
+				}
+				if (*cp == '\'')
+				{
+					*cp2 = '\0';
+					cp++;
+					break;
+				}
+				*cp2++ = *cp++;
+			}
+		}
+
+		/*
+		 * Now we have the name and the value. Search for the param record.
+		 */
+		for (option = options; option->keyword != NULL; option++)
+		{
+			if (strcmp(option->keyword, pname) == 0)
+				break;
+		}
+		if (option->keyword == NULL)
+		{
+			printfGTMPQExpBuffer(errorMessage,
+						 libpq_gettext("invalid connection option \"%s\"\n"),
+							  pname);
+			GTMPQconninfoFree(options);
+			free(buf);
+			return NULL;
+		}
+
+		/*
+		 * Store the value (a repeated keyword replaces the earlier value)
+		 */
+		if (option->val)
+			free(option->val);
+		option->val = strdup(pval);
+		if (!option->val)
+		{
+			printfGTMPQExpBuffer(errorMessage,
+							  libpq_gettext("out of memory\n"));
+			GTMPQconninfoFree(options);
+			free(buf);
+			return NULL;
+		}
+	}
+
+	/* Done with the modifiable input string */
+	free(buf);
+
+	return options;
+}
+
+
+/* conninfo_getval: return the stored value for keyword, or NULL if none. */
+static char *
+conninfo_getval(GTMPQconninfoOption *connOptions,
+				const char *keyword)
+{
+	GTMPQconninfoOption *opt = connOptions;
+
+	/* The option array is terminated by an entry with a NULL keyword. */
+	while (opt->keyword != NULL && strcmp(opt->keyword, keyword) != 0)
+		opt++;
+
+	/* Either the matching entry, or the terminator (unknown keyword). */
+	return (opt->keyword != NULL) ? opt->val : NULL;
+}
+
+
+/* GTMPQconninfoFree: release an option array and every value it owns. */
+void
+GTMPQconninfoFree(GTMPQconninfoOption *connOptions)
+{
+	GTMPQconninfoOption *opt;
+
+	if (connOptions != NULL)
+	{
+		/* free(NULL) is a no-op, so unset values need no special case */
+		for (opt = connOptions; opt->keyword != NULL; opt++)
+			free(opt->val);
+
+		free(connOptions);
+	}
+}
+
+/* GTMPQhost: return the host name recorded in conn (NULL if conn is NULL). */
+char *
+GTMPQhost(const GTM_Conn *conn)
+{
+	/* A NULL connection is tolerated rather than dereferenced. */
+	return (conn != NULL) ? conn->pghost : NULL;
+}
+
+/* GTMPQport: return the port string recorded in conn (NULL if conn is NULL). */
+char *
+GTMPQport(const GTM_Conn *conn)
+{
+	/* A NULL connection is tolerated rather than dereferenced. */
+	return (conn != NULL) ? conn->pgport : NULL;
+}
+
+/* GTMPQstatus: return conn's current status; a NULL conn reports BAD. */
+ConnStatusType
+GTMPQstatus(const GTM_Conn *conn)
+{
+	/* A NULL connection is tolerated rather than dereferenced. */
+	return (conn != NULL) ? conn->status : CONNECTION_BAD;
+}
+
+/* GTMPQerrorMessage: conn's error text, or a fixed message for a NULL conn. */
+char *
+GTMPQerrorMessage(const GTM_Conn *conn)
+{
+	if (conn != NULL)
+		return conn->errorMessage.data;
+	return libpq_gettext("connection pointer is NULL\n");
+}
+
+/* GTMPQsocket: return conn's socket descriptor, or -1 if conn is NULL. */
+int
+GTMPQsocket(const GTM_Conn *conn)
+{
+	/* A NULL connection is tolerated rather than dereferenced. */
+	return (conn != NULL) ? conn->sock : -1;
+}
+
+void
+GTMPQtrace(GTM_Conn *conn, FILE *debug_port)
+{
+	if (conn == NULL)
+		return;					/* nothing to trace on */
+	GTMPQuntrace(conn);			/* flush and detach any previous trace stream */
+	conn->Pfdebug = debug_port;	/* trace output now goes to debug_port */
+}
+
+/* GTMPQuntrace: flush the trace stream (if any) and stop tracing on conn. */
+void
+GTMPQuntrace(GTM_Conn *conn)
+{
+	/* Nothing to do without a connection or without an active trace. */
+	if (conn == NULL || conn->Pfdebug == NULL)
+		return;
+
+	fflush(conn->Pfdebug);		/* the stream is flushed but not closed here */
+	conn->Pfdebug = NULL;
+}
diff --git a/src/gtm/client/fe-misc.c b/src/gtm/client/fe-misc.c
new file mode 100644
index 0000000000..66172400a5
--- /dev/null
+++ b/src/gtm/client/fe-misc.c
@@ -0,0 +1,1035 @@
+/*-------------------------------------------------------------------------
+ *
+ *	 FILE
+ *		fe-misc.c
+ *
+ *	 DESCRIPTION
+ *		 miscellaneous useful functions
+ *
+ * The communication routines here are analogous to the ones in
+ * backend/libpq/pqcomm.c and backend/libpq/pqcomprim.c, but operate
+ * in the considerably different environment of the frontend libpq.
+ * In particular, we work with a bare nonblock-mode socket, rather than
+ * a stdio stream, so that we can avoid unwanted blocking of the application.
+ *
+ * XXX: MOVE DEBUG PRINTOUT TO HIGHER LEVEL.  As is, block and restart
+ * will cause repeat printouts.
+ *
+ * We must speak the same transmitted data representations as the backend
+ * routines.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/src/interfaces/libpq/fe-misc.c,v 1.137 2008/12/11 07:34:09 petere Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "gtm/gtm_c.h"
+
+#include <signal.h>
+#include <time.h>
+
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#include <unistd.h>
+#include <sys/time.h>
+
+#include <poll.h>
+#include <sys/poll.h>
+#include <sys/select.h>
+
+#include "gtm/libpq-fe.h"
+#include "gtm/libpq-int.h"
+
+
+static int	gtmpqPutMsgBytes(const void *buf, size_t len, GTM_Conn *conn);
+static int	gtmpqSendSome(GTM_Conn *conn, int len);
+static int gtmpqSocketCheck(GTM_Conn *conn, int forRead, int forWrite,
+			  time_t end_time);
+static int	gtmpqSocketPoll(int sock, int forRead, int forWrite, time_t end_time);
+
+
+/*
+ * gtmpqGetc: fetch the next unread byte of the input buffer into *result
+ *
+ *	Returns 0 on success, EOF on failure.  For the Get routines EOF only
+ *	means the buffer does not hold enough data yet; it is not necessarily
+ *	a hard error.
+ */
+int
+gtmpqGetc(char *result, GTM_Conn *conn)
+{
+	/* Buffer exhausted: report "not enough data" and leave *result alone. */
+	if (conn->inCursor >= conn->inEnd)
+		return EOF;
+
+	*result = conn->inBuffer[conn->inCursor];
+	conn->inCursor++;
+	if (conn->Pfdebug)
+		fprintf(conn->Pfdebug, "From backend> %c\n", *result);
+	return 0;
+}
+
+
+/*
+ * gtmpqPutc: append the single byte c to the message under construction
+ */
+int
+gtmpqPutc(char c, GTM_Conn *conn)
+{
+	int			rc = gtmpqPutMsgBytes(&c, 1, conn);
+
+	/* Trace only bytes that were actually queued. */
+	if (rc == 0 && conn->Pfdebug)
+		fprintf(conn->Pfdebug, "To backend> %c\n", c);
+
+	return rc ? EOF : 0;
+}
+
+
+/*
+ * gtmpqGets[_append]:
+ * get a null-terminated string from the connection into an expansible
+ * PQExpBuffer, replacing (resetbuffer) or extending its contents.  Returns
+ * EOF, consuming nothing, until the terminator has arrived.  On out-of-memory
+ * the string is still consumed, but excess characters are silently dropped.
+ */
+static int
+gtmpqGets_internal(PQExpBuffer buf, GTM_Conn *conn, bool resetbuffer)
+{
+	char	   *start = conn->inBuffer + conn->inCursor;
+	int			avail = conn->inEnd - conn->inCursor;
+	int			len = 0;
+
+	/* Scan the unread bytes for the terminating null */
+	while (len < avail && start[len] != '\0')
+		len++;
+
+	/* No terminator yet: leave the cursor alone and ask for more data */
+	if (len >= avail)
+		return EOF;
+
+	/* Replace rather than extend the buffer when asked to */
+	if (resetbuffer)
+		resetGTMPQExpBuffer(buf);
+
+	appendBinaryGTMPQExpBuffer(buf, start, len);
+
+	/* Step over the string and its terminator */
+	conn->inCursor += len + 1;
+
+	if (conn->Pfdebug)
+		fprintf(conn->Pfdebug, "From backend> \"%s\"\n",
+				buf->data);
+
+	return 0;
+}
+
+int
+gtmpqGets(PQExpBuffer buf, GTM_Conn *conn)
+{
+	return gtmpqGets_internal(buf, conn, true);	/* true: buf is reset first */
+}
+
+int
+gtmpqGets_append(PQExpBuffer buf, GTM_Conn *conn)
+{
+	return gtmpqGets_internal(buf, conn, false);	/* false: append to buf */
+}
+
+
+/*
+ * gtmpqPuts: append string s, including its terminating null, to the message
+ */
+int
+gtmpqPuts(const char *s, GTM_Conn *conn)
+{
+	size_t		nbytes = strlen(s) + 1;	/* the terminator is sent too */
+
+	if (gtmpqPutMsgBytes(s, nbytes, conn) != 0)
+		return EOF;
+	if (conn->Pfdebug)
+		fprintf(conn->Pfdebug, "To backend> \"%s\"\n", s);
+	return 0;
+}
+
+/*
+ * gtmpqGetnchar:
+ *	copy exactly len bytes into s (no null terminator); EOF if not yet available
+ */
+int
+gtmpqGetnchar(char *s, size_t len, GTM_Conn *conn)
+{
+	int			avail = conn->inEnd - conn->inCursor;
+
+	/* len is unsigned, so only the upper bound needs checking */
+	if (avail < 0 || len > (size_t) avail)
+		return EOF;
+
+	memcpy(s, conn->inBuffer + conn->inCursor, len);
+	conn->inCursor += len;
+
+	if (conn->Pfdebug)
+		fprintf(conn->Pfdebug, "From backend (%lu)> %.*s\n",
+				(unsigned long) len, (int) len, s);
+
+	return 0;
+}
+
+/*
+ * gtmpqPutnchar:
+ *	append exactly len bytes from s to the message under construction
+ */
+int
+gtmpqPutnchar(const char *s, size_t len, GTM_Conn *conn)
+{
+	int			rc = gtmpqPutMsgBytes(s, len, conn);
+
+	/* Trace only data that was actually queued */
+	if (rc == 0 && conn->Pfdebug)
+		fprintf(conn->Pfdebug, "To backend> %.*s\n", (int) len, s);
+
+	return (rc == 0) ? 0 : EOF;
+}
+
+/*
+ * gtmpqGetInt: read a 2- or 4-byte network-order integer into *result (host
+ * order).  Returns 0, or EOF if too few bytes are buffered or bytes is bad.
+ */
+int
+gtmpqGetInt(int *result, size_t bytes, GTM_Conn *conn)
+{
+	uint16		tmp2;
+	uint32		tmp4;
+
+	switch (bytes)
+	{
+		case 2:
+			if (conn->inCursor + 2 > conn->inEnd)
+				return EOF;
+			memcpy(&tmp2, conn->inBuffer + conn->inCursor, 2);
+			conn->inCursor += 2;
+			*result = (int) ntohs(tmp2);
+			break;
+		case 4:
+			if (conn->inCursor + 4 > conn->inEnd)
+				return EOF;
+			memcpy(&tmp4, conn->inBuffer + conn->inCursor, 4);
+			conn->inCursor += 4;
+			*result = (int) ntohl(tmp4);
+			break;
+		default:
+			if (conn->Pfdebug)	/* tracing may be off; never pass NULL to fprintf */
+				fprintf(conn->Pfdebug, "Integer size of (%lu) bytes not supported", (unsigned long) bytes);
+			return EOF;
+	}
+
+	if (conn->Pfdebug)
+		fprintf(conn->Pfdebug, "From backend (#%lu)> %d\n", (unsigned long) bytes, *result);
+
+	return 0;
+}
+
+/*
+ * gtmpqPutInt: append value to the current message as a 2- or 4-byte integer
+ * in network byte order.  Returns 0 on success, EOF on failure.
+ */
+int
+gtmpqPutInt(int value, size_t bytes, GTM_Conn *conn)
+{
+	uint16		tmp2;
+	uint32		tmp4;
+
+	switch (bytes)
+	{
+		case 2:
+			tmp2 = htons((uint16) value);
+			if (gtmpqPutMsgBytes((const char *) &tmp2, 2, conn))
+				return EOF;
+			break;
+		case 4:
+			tmp4 = htonl((uint32) value);
+			if (gtmpqPutMsgBytes((const char *) &tmp4, 4, conn))
+				return EOF;
+			break;
+		default:
+			if (conn->Pfdebug)	/* tracing may be off; never pass NULL to fprintf */
+				fprintf(conn->Pfdebug, "Integer size of (%lu) bytes not supported", (unsigned long) bytes);
+			return EOF;
+	}
+
+	if (conn->Pfdebug)
+		fprintf(conn->Pfdebug, "To backend (%lu#)> %d\n", (unsigned long) bytes, value);
+
+	return 0;
+}
+
+/*
+ * gtmpqCheckOutBufferSpace: make sure conn's output buffer can hold
+ * bytes_needed bytes in total (caller must include already-stored data in the
+ * value!), enlarging it with realloc if necessary.
+ * Returns 0 on success; EOF, with conn->errorMessage set, if it cannot grow.
+ */
+int
+gtmpqCheckOutBufferSpace(size_t bytes_needed, GTM_Conn *conn)
+{
+	int			newsize = conn->outBufSize;
+	char	   *newbuf;
+
+	if (bytes_needed <= (size_t) newsize)
+		return 0;
+
+	/*
+	 * If we need to enlarge the buffer, we first try to double it in size; if
+	 * that doesn't work, enlarge in multiples of 8K.  This avoids thrashing
+	 * the malloc pool by repeated small enlargements.
+	 *
+	 * Note: tests for newsize > 0 are to catch integer overflow.
+	 */
+	do
+	{
+		newsize *= 2;
+	} while (newsize > 0 && bytes_needed > (size_t) newsize);
+
+	if (newsize > 0 && bytes_needed <= (size_t) newsize)
+	{
+		newbuf = realloc(conn->outBuffer, newsize);
+		if (newbuf)
+		{
+			/* realloc succeeded; the buffer may have moved */
+			conn->outBuffer = newbuf;
+			conn->outBufSize = newsize;
+			return 0;
+		}
+	}
+
+	newsize = conn->outBufSize;	/* doubling failed: retry in 8K steps */
+	do
+	{
+		newsize += 8192;
+	} while (newsize > 0 && bytes_needed > (size_t) newsize);
+
+	if (newsize > 0 && bytes_needed <= (size_t) newsize)
+	{
+		newbuf = realloc(conn->outBuffer, newsize);
+		if (newbuf)
+		{
+			/* realloc succeeded; the buffer may have moved */
+			conn->outBuffer = newbuf;
+			conn->outBufSize = newsize;
+			return 0;
+		}
+	}
+
+	/* no usable size found or realloc failed. Probably out of memory */
+	printfGTMPQExpBuffer(&conn->errorMessage,
+					  "cannot allocate memory for output buffer\n");
+	return EOF;
+}
+
+/*
+ * gtmpqCheckInBufferSpace: make sure conn's input buffer can hold
+ * bytes_needed bytes in total (caller must include already-stored data in the
+ * value!), enlarging it with realloc if necessary.
+ * Returns 0 on success; EOF, with conn->errorMessage set, if it cannot grow.
+ */
+int
+gtmpqCheckInBufferSpace(size_t bytes_needed, GTM_Conn *conn)
+{
+	int			newsize = conn->inBufSize;
+	char	   *newbuf;
+
+	if (bytes_needed <= (size_t) newsize)
+		return 0;
+
+	/*
+	 * If we need to enlarge the buffer, we first try to double it in size; if
+	 * that doesn't work, enlarge in multiples of 8K.  This avoids thrashing
+	 * the malloc pool by repeated small enlargements.
+	 *
+	 * Note: tests for newsize > 0 are to catch integer overflow.
+	 */
+	do
+	{
+		newsize *= 2;
+	} while (newsize > 0 && bytes_needed > (size_t) newsize);
+
+	if (newsize > 0 && bytes_needed <= (size_t) newsize)
+	{
+		newbuf = realloc(conn->inBuffer, newsize);
+		if (newbuf)
+		{
+			/* realloc succeeded; the buffer may have moved */
+			conn->inBuffer = newbuf;
+			conn->inBufSize = newsize;
+			return 0;
+		}
+	}
+
+	newsize = conn->inBufSize;	/* doubling failed: retry in 8K steps */
+	do
+	{
+		newsize += 8192;
+	} while (newsize > 0 && bytes_needed > (size_t) newsize);
+
+	if (newsize > 0 && bytes_needed <= (size_t) newsize)
+	{
+		newbuf = realloc(conn->inBuffer, newsize);
+		if (newbuf)
+		{
+			/* realloc succeeded; the buffer may have moved */
+			conn->inBuffer = newbuf;
+			conn->inBufSize = newsize;
+			return 0;
+		}
+	}
+
+	/* no usable size found or realloc failed. Probably out of memory */
+	printfGTMPQExpBuffer(&conn->errorMessage,
+					  "cannot allocate memory for input buffer\n");
+	return EOF;
+}
+
+/*
+ * gtmpqPutMsgStart: begin construction of a message to the server
+ *
+ * msg_type is the message type byte, or 0 for a message without type byte.
+ *
+ * force_len controls the length word: when true, four bytes are reserved
+ * for it right after the type byte (if any); when false, the message is
+ * built without a length word.
+ *
+ * Returns 0 on success, EOF on error
+ *
+ * The idea here is that we construct the message in conn->outBuffer,
+ * beginning just past any data already in outBuffer (ie, at
+ * outBuffer+outCount).  We enlarge the buffer as needed to hold the message.
+ * When the message is complete, we fill in the length word (if needed) and
+ * then advance outCount past the message, making it eligible to send.
+ *
+ * The state variable conn->outMsgStart points to the incomplete message's
+ * length word: it is either outCount or outCount+1 depending on whether
+ * there is a type byte.  If we are sending a message without length word,
+ * then outMsgStart is -1.  The state variable conn->outMsgEnd is the end of
+ * the data collected so far.
+ */
+int
+gtmpqPutMsgStart(char msg_type, bool force_len, GTM_Conn *conn)
+{
+	int			lenPos;
+	int			endPos;
+
+	/* allow room for message type byte */
+	if (msg_type)
+		endPos = conn->outCount + 1;
+	else
+		endPos = conn->outCount;
+
+	/* do we want a length word? */
+	if (force_len)
+	{
+		lenPos = endPos;
+		/* allow room for message length */
+		endPos += 4;
+	}
+	else
+		lenPos = -1;
+
+	/* make sure there is room for message header */
+	if (gtmpqCheckOutBufferSpace(endPos, conn))
+		return EOF;
+	/* okay, save the message type byte if any */
+	if (msg_type)
+		conn->outBuffer[conn->outCount] = msg_type;
+	/* set up the message pointers */
+	conn->outMsgStart = lenPos;
+	conn->outMsgEnd = endPos;
+	/* length word, if needed, will be filled in by gtmpqPutMsgEnd */
+
+	if (conn->Pfdebug)
+		fprintf(conn->Pfdebug, "To backend> Msg %c\n",
+				msg_type ? msg_type : ' ');
+
+	return 0;
+}
+
+/*
+ * gtmpqPutMsgBytes: append len bytes at buf to the message being built
+ *
+ * Returns 0 on success, EOF if the output buffer could not be enlarged
+ */
+static int
+gtmpqPutMsgBytes(const void *buf, size_t len, GTM_Conn *conn)
+{
+	size_t		newEnd = conn->outMsgEnd + len;
+
+	/* grow the buffer first; realloc may move conn->outBuffer */
+	if (gtmpqCheckOutBufferSpace(newEnd, conn) != 0)
+		return EOF;
+	memcpy(&conn->outBuffer[conn->outMsgEnd], buf, len);
+	conn->outMsgEnd = newEnd;
+	return 0;					/* no Pfdebug output here; callers trace */
+}
+
+/*
+ * gtmpqPutMsgEnd: finish constructing a message and possibly send it
+ *
+ * Returns 0 on success, EOF on error
+ *
+ * The length word, if one was reserved, is filled in and the message joins
+ * the pending output.  Nothing is sent until at least 8K has accumulated
+ * (the typical pipe buffer size on Unix), which avoids small partial
+ * packets; callers must use gtmpqFlush when the data has to go out now.
+ */
+int
+gtmpqPutMsgEnd(GTM_Conn *conn)
+{
+	int			lenPos = conn->outMsgStart;
+	int			endPos = conn->outMsgEnd;
+
+	if (conn->Pfdebug)
+		fprintf(conn->Pfdebug, "To backend> Msg complete, length %u\n",
+				endPos - conn->outCount);
+
+	/* Fill in length word if one was reserved (lenPos is -1 otherwise) */
+	if (lenPos >= 0)
+	{
+		uint32		netLen = htonl((uint32) (endPos - lenPos));
+
+		memcpy(conn->outBuffer + lenPos, &netLen, 4);
+	}
+
+	/* Make message eligible to send */
+	conn->outCount = endPos;
+
+	/*
+	 * Push out whole 8K chunks only; a short tail waits for more data.
+	 */
+	if (endPos >= 8192 &&
+		gtmpqSendSome(conn, endPos - (endPos % 8192)) < 0)
+		return EOF;
+	/* in nonblock mode, don't complain if unable to send it all */
+	return 0;
+}
+
+/* ----------
+ * gtmpqReadData: read more data, if any is available
+ * Possible return values:
+ *	 1: successfully loaded at least one more byte
+ *	 0: no data is presently available, but no error detected
+ *	-1: error detected (including EOF = connection closure);
+ *		conn->errorMessage set
+ * NOTE: callers must not assume that pointers or indexes into conn->inBuffer
+ * remain valid across this call!
+ * ----------
+ */
+int
+gtmpqReadData(GTM_Conn *conn)
+{
+	int			someread = 0;
+	int			nread;
+
+	if (conn->sock < 0)
+	{
+		printfGTMPQExpBuffer(&conn->errorMessage,
+						  "connection not open\n");
+		return -1;
+	}
+
+	/* Left-justify any unconsumed data (inStart..inEnd) to make room */
+	if (conn->inStart < conn->inEnd)
+	{
+		if (conn->inStart > 0)
+		{
+			memmove(conn->inBuffer, conn->inBuffer + conn->inStart,
+					conn->inEnd - conn->inStart);
+			conn->inEnd -= conn->inStart;
+			conn->inCursor -= conn->inStart;
+			conn->inStart = 0;
+		}
+	}
+	else
+	{
+		/* buffer is logically empty, reset it */
+		conn->inStart = conn->inCursor = conn->inEnd = 0;
+	}
+
+	/*
+	 * If the buffer is fairly full, enlarge it. We need to be able to enlarge
+	 * the buffer in case a single message exceeds the initial buffer size. We
+	 * enlarge before filling the buffer entirely so as to avoid asking the
+	 * kernel for a partial packet. The magic constant here should be large
+	 * enough for a TCP packet or Unix pipe bufferload.  8K is the usual pipe
+	 * buffer size, so...
+	 */
+	if (conn->inBufSize - conn->inEnd < 8192)
+	{
+		if (gtmpqCheckInBufferSpace(conn->inEnd + (size_t) 8192, conn))
+		{
+			/*
+			 * We don't insist that the enlarge worked, but we need some room
+			 */
+			if (conn->inBufSize - conn->inEnd < 100)
+				return -1;		/* errorMessage already set */
+		}
+	}
+
+	/* OK, try to read some data */
+retry3:
+	nread = recv(conn->sock, conn->inBuffer + conn->inEnd,
+						  conn->inBufSize - conn->inEnd, 0);
+	if (nread < 0)
+	{
+		if (SOCK_ERRNO == EINTR)
+			goto retry3;
+		/* Some systems return EAGAIN/EWOULDBLOCK for no data */
+#ifdef EAGAIN
+		if (SOCK_ERRNO == EAGAIN)
+			return someread;
+#endif
+#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN))
+		if (SOCK_ERRNO == EWOULDBLOCK)
+			return someread;
+#endif
+		/* We might get ECONNRESET here if using TCP and backend died */
+#ifdef ECONNRESET
+		if (SOCK_ERRNO == ECONNRESET)
+			goto definitelyFailed;
+#endif
+		printfGTMPQExpBuffer(&conn->errorMessage,
+				   "could not receive data from server:\n");
+		return -1;
+	}
+	if (nread > 0)
+	{
+		conn->inEnd += nread;
+
+		/*
+		 * Hack to deal with the fact that some kernels will only give us back
+		 * 1 packet per recv() call, even if we asked for more and there is
+		 * more available.	If it looks like we are reading a long message,
+		 * loop back to recv() again immediately, until we run out of data or
+		 * buffer space.  Without this, the block-and-restart behavior of
+		 * libpq's higher levels leads to O(N^2) performance on long messages.
+		 *
+		 * Since we left-justified the data above, conn->inEnd gives the
+		 * amount of data already read in the current message.	We consider
+		 * the message "long" once we have acquired 32k ...
+		 */
+		if (conn->inEnd > 32768 &&
+			(conn->inBufSize - conn->inEnd) >= 8192)
+		{
+			someread = 1;
+			goto retry3;
+		}
+		return 1;
+	}
+
+	if (someread)
+		return 1;				/* got a zero read after successful tries */
+
+	/*
+	 * A return value of 0 could mean just that no data is now available, or
+	 * it could mean EOF --- that is, the server has closed the connection.
+	 * Since we have the socket in nonblock mode, the only way to tell the
+	 * difference is to see if select() is saying that the file is ready.
+	 * Grumble.  Fortunately, we don't expect this path to be taken much,
+	 * since in normal practice we should not be trying to read data unless
+	 * the file selected for reading already.
+	 *
+	 * NOTE(review): the USE_SSL short-circuit below talks about a case where
+	 * SSL_read() says WANT_READ and data arrives before the gtmpqReadReady()
+	 * test.  This function only ever calls plain recv(), so confirm whether
+	 * the conn->ssl branch is still meaningful here.
+	 */
+
+#ifdef USE_SSL
+	if (conn->ssl)
+		return 0;
+#endif
+
+	switch (gtmpqReadReady(conn))
+	{
+		case 0:
+			/* definitely no data available */
+			return 0;
+		case 1:
+			/* ready for read */
+			break;
+		default:
+			goto definitelyFailed;
+	}
+
+	/*
+	 * Still not sure that it's EOF, because some data could have just
+	 * arrived.
+	 */
+retry4:
+	nread = recv(conn->sock, conn->inBuffer + conn->inEnd,
+						  conn->inBufSize - conn->inEnd, 0);
+	if (nread < 0)
+	{
+		if (SOCK_ERRNO == EINTR)
+			goto retry4;
+		/* Some systems return EAGAIN/EWOULDBLOCK for no data */
+#ifdef EAGAIN
+		if (SOCK_ERRNO == EAGAIN)
+			return 0;
+#endif
+#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN))
+		if (SOCK_ERRNO == EWOULDBLOCK)
+			return 0;
+#endif
+		/* We might get ECONNRESET here if using TCP and backend died */
+#ifdef ECONNRESET
+		if (SOCK_ERRNO == ECONNRESET)
+			goto definitelyFailed;
+#endif
+		printfGTMPQExpBuffer(&conn->errorMessage,
+				   "could not receive data from server: \n");
+		return -1;
+	}
+	if (nread > 0)
+	{
+		conn->inEnd += nread;
+		return 1;
+	}
+
+	/*
+	 * A zero read although select() says ready means the peer closed the
+	 * connection: report it, mark the connection bad, close the socket.
+	 */
+definitelyFailed:
+	printfGTMPQExpBuffer(&conn->errorMessage,
+								"server closed the connection unexpectedly\n"
+				   "\tThis probably means the server terminated abnormally\n"
+							 "\tbefore or while processing the request.\n");
+	conn->status = CONNECTION_BAD;		/* No more connection to backend */
+	close(conn->sock);
+	conn->sock = -1;
+
+	return -1;
+}
+
+/*
+ * gtmpqSendSome: send data waiting in the output buffer.
+ *
+ * len is how much to try to send (typically equal to outCount, but may
+ * be less).
+ *
+ * Return 0 on success, -1 on failure and 1 when only part of the requested
+ * data went out (send() would block or accepted fewer bytes than asked).
+ */
+static int
+gtmpqSendSome(GTM_Conn *conn, int len)
+{
+	char	   *ptr = conn->outBuffer;
+	int			remaining = conn->outCount;
+	int			result = 0;
+
+	if (conn->sock < 0)
+	{
+		printfGTMPQExpBuffer(&conn->errorMessage,
+						  "connection not open\n");
+		return -1;
+	}
+
+	/* while there's still data to send */
+	while (len > 0)
+	{
+		int			sent;
+
+		sent = send(conn->sock, ptr, len, 0);
+
+		if (sent < 0)
+		{
+			/*
+			 * Anything except EAGAIN/EWOULDBLOCK/EINTR is trouble. If it's
+			 * EPIPE or ECONNRESET, assume we've lost the backend connection
+			 * permanently.
+			 */
+			switch (SOCK_ERRNO)
+			{
+#ifdef EAGAIN
+				case EAGAIN:
+					break;
+#endif
+#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN))
+				case EWOULDBLOCK:
+					break;
+#endif
+				case EINTR:
+					continue;
+
+				case EPIPE:
+#ifdef ECONNRESET
+				case ECONNRESET:
+#endif
+					printfGTMPQExpBuffer(&conn->errorMessage,
+								"server closed the connection unexpectedly\n"
+					"\tThis probably means the server terminated abnormally\n"
+							 "\tbefore or while processing the request.\n");
+
+					/*
+					 * We used to close the socket here, but that's a bad idea
+					 * since there might be unread data waiting (typically, a
+					 * NOTICE message from the backend telling us it's
+					 * committing hara-kiri...).  Leave the socket open until
+					 * gtmpqReadData finds no more data can be read.  But abandon
+					 * attempt to send data.
+					 */
+					conn->outCount = 0;
+					return -1;
+
+				default:
+					printfGTMPQExpBuffer(&conn->errorMessage,
+						"could not send data to server: \n");
+					/* We don't assume it's a fatal error... */
+					conn->outCount = 0;
+					return -1;
+			}
+		}
+		else
+		{
+			ptr += sent;
+			len -= sent;
+			remaining -= sent;
+		}
+
+		if (len > 0)
+		{
+			/*
+			 * Either send() would block or it accepted only part of the
+			 * data.  This code never waits here: it stops and returns 1 so
+			 * the caller knows data is still pending, whether or not the
+			 * socket is in non-blocking mode.
+			 */
+			result = 1;
+			break;
+		}
+	}
+
+	/* shift the remaining contents of the buffer */
+	if (remaining > 0)
+		memmove(conn->outBuffer, ptr, remaining);
+	conn->outCount = remaining;
+
+	return result;
+}
+
+
+/*
+ * gtmpqFlush: push everything queued in the output buffer to the socket
+ *
+ * Returns 0 when nothing is left to send, otherwise whatever gtmpqSendSome
+ * returns for the whole buffer (0 = sent, 1 = data still pending, -1 = error).
+ */
+int
+gtmpqFlush(GTM_Conn *conn)
+{
+	if (conn->Pfdebug != NULL)
+		fflush(conn->Pfdebug);
+
+	if (conn->outCount <= 0)
+		return 0;
+
+	return gtmpqSendSome(conn, conn->outCount);
+}
+
+
+/*
+ * gtmpqWait: block until the connection socket is readable and/or writable
+ *
+ * Equivalent to gtmpqWaitTimed with the wait limit disabled ((time_t) -1):
+ * returns 0 once a requested condition is reported and EOF on failure.  The
+ * socket check also watches for error/exception conditions; the actual error
+ * is then detected when the caller tries to read or write the socket.
+ */
+int
+gtmpqWait(int forRead, int forWrite, GTM_Conn *conn)
+{
+	const time_t no_deadline = (time_t) -1;
+
+	return gtmpqWaitTimed(forRead, forWrite, conn, no_deadline);
+}
+
+/*
+ * gtmpqWaitTimed: wait, but not past finish_time.
+ *
+ * Returns 0 when the socket check reports a ready condition.  Returns EOF
+ * when the check fails or when finish_time passes first; a timeout is
+ * reported like a failure so the caller does not go on to read/write.
+ *
+ * finish_time = ((time_t) -1) disables the wait limit.
+ */
+int
+gtmpqWaitTimed(int forRead, int forWrite, GTM_Conn *conn, time_t finish_time)
+{
+	int			rc;
+
+	rc = gtmpqSocketCheck(conn, forRead, forWrite, finish_time);
+
+	if (rc > 0)
+		return 0;				/* at least one condition is satisfied */
+
+	/* rc < 0: errorMessage is already set; rc == 0: the wait timed out */
+	if (rc == 0)
+	{
+		printfGTMPQExpBuffer(&conn->errorMessage,
+						  "timeout expired\n");
+	}
+
+	return EOF;
+}
+
+/*
+ * gtmpqReadReady: non-blocking check (end_time 0) for readability.
+ * Returns gtmpqSocketCheck's result: -1 on failure, 0 if not ready, >0 if ready.
+ */
+int
+gtmpqReadReady(GTM_Conn *conn)
+{
+	return gtmpqSocketCheck(conn, 1, 0, (time_t) 0);
+}
+
+/*
+ * gtmpqWriteReady: non-blocking check (end_time 0) for writability.
+ * Returns gtmpqSocketCheck's result: -1 on failure, 0 if not ready, >0 if ready.
+ */
+int
+gtmpqWriteReady(GTM_Conn *conn)
+{
+	return gtmpqSocketCheck(conn, 0, 1, (time_t) 0);
+}
+
+/*
+ * Checks the connection's socket, via gtmpqSocketPoll, for data to be read,
+ * written, or both.  Returns >0 if one or more conditions are met, 0 if it
+ * timed out, -1 on error (including a NULL conn or a closed socket).
+ *
+ * With USE_SSL, bytes already buffered by the SSL library satisfy a read
+ * check without consulting the socket.
+ */
+static int
+gtmpqSocketCheck(GTM_Conn *conn, int forRead, int forWrite, time_t end_time)
+{
+	int			rc;
+
+	if (conn == NULL)
+		return -1;
+	if (conn->sock < 0)
+	{
+		printfGTMPQExpBuffer(&conn->errorMessage,
+						  "socket not open\n");
+		return -1;
+	}
+
+#ifdef USE_SSL
+	/* Bytes already buffered inside the SSL library count as readable */
+	if (forRead && conn->ssl && SSL_pending(conn->ssl) > 0)
+		return 1;				/* short-circuit the select */
+#endif
+
+	/* Poll the socket, restarting whenever the call is interrupted (EINTR) */
+	for (;;)
+	{
+		rc = gtmpqSocketPoll(conn->sock, forRead, forWrite, end_time);
+		if (rc >= 0)
+			return rc;
+		if (SOCK_ERRNO != EINTR)
+			break;
+	}
+
+	printfGTMPQExpBuffer(&conn->errorMessage,
+					  "select() failed: \n");
+	return rc;
+}
+
+
+/*
+ * Check a file descriptor for read and/or write data, possibly waiting.
+ * If neither forRead nor forWrite are set, immediately return a timeout
+ * condition (without waiting).  Return >0 if condition is met, 0
+ * if a timeout occurred, -1 if an error or interrupt occurred.
+ *
+ * Timeout is infinite if end_time is -1.  Timeout is immediate (no blocking)
+ * if end_time is 0 (or indeed, any time before now).
+ */
+static int
+gtmpqSocketPoll(int sock, int forRead, int forWrite, time_t end_time)
+{
+	/* We use poll(2) if available, otherwise select(2) */
+#ifdef HAVE_POLL
+	struct pollfd input_fd;
+	int			timeout_ms;
+
+	if (!forRead && !forWrite)
+		return 0;
+
+	input_fd.fd = sock;
+	input_fd.events = POLLERR;
+	input_fd.revents = 0;
+
+	if (forRead)
+		input_fd.events |= POLLIN;
+	if (forWrite)
+		input_fd.events |= POLLOUT;
+
+	/* Turn the absolute end_time into a relative timeout (whole seconds, in ms) */
+	if (end_time == ((time_t) -1))
+		timeout_ms = -1;
+	else
+	{
+		time_t		now = time(NULL);
+
+		if (end_time > now)
+			timeout_ms = (end_time - now) * 1000;
+		else
+			timeout_ms = 0;
+	}
+
+	return poll(&input_fd, 1, timeout_ms);
+#else							/* !HAVE_POLL */
+
+	fd_set		input_mask;
+	fd_set		output_mask;
+	fd_set		except_mask;
+	struct timeval timeout;
+	struct timeval *ptr_timeout;
+
+	if (!forRead && !forWrite)
+		return 0;
+
+	FD_ZERO(&input_mask);
+	FD_ZERO(&output_mask);
+	FD_ZERO(&except_mask);
+	if (forRead)
+		FD_SET(sock, &input_mask);
+	if (forWrite)
+		FD_SET(sock, &output_mask);
+	FD_SET(sock, &except_mask);
+
+	/* Turn the absolute end_time into a relative timeval (whole seconds only) */
+	if (end_time == ((time_t) -1))
+		ptr_timeout = NULL;
+	else
+	{
+		time_t		now = time(NULL);
+
+		if (end_time > now)
+			timeout.tv_sec = end_time - now;
+		else
+			timeout.tv_sec = 0;
+		timeout.tv_usec = 0;
+		ptr_timeout = &timeout;
+	}
+
+	return select(sock + 1, &input_mask, &output_mask,
+				  &except_mask, ptr_timeout);
+#endif   /* HAVE_POLL */
+}
diff --git a/src/gtm/client/fe-protocol.c b/src/gtm/client/fe-protocol.c
new file mode 100644
index 0000000000..f3960daeaa
--- /dev/null
+++ b/src/gtm/client/fe-protocol.c
@@ -0,0 +1,598 @@
+/*-------------------------------------------------------------------------
+ *
+ * fe-protocol.c
+ *	  functions that parse GTM server responses on the client side
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "gtm/gtm_c.h"
+
+#include <ctype.h>
+#include <fcntl.h>
+
+#include "gtm/libpq-fe.h"
+#include "gtm/libpq-int.h"
+#include "gtm/gtm_client.h"
+
+#include <unistd.h>
+#include <netinet/in.h>
+
+
+/*
+ * This macro lists the backend message types that could be "long" (more
+ * than a couple of kilobytes).
+ */
+#define VALID_LONG_MESSAGE_TYPE(id) \
+	((id) == 'S' || (id) == 'E')
+
+static void handleSyncLoss(GTM_Conn *conn, char id, int msgLength);
+static GTM_Result *pqParseInput(GTM_Conn *conn);
+static int gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result);
+static int gtmpqReadSeqKey(GTM_SequenceKey seqkey, GTM_Conn *conn);
+
+/*
+ * pqParseInput: try to parse one complete message from the input buffer.
+ *
+ * Returns conn->result (allocated on first use, reused afterwards) once a
+ * whole message has been consumed, or NULL if the message is incomplete or
+ * an error occurred.  This function never reads more data from the socket.
+ */
+static GTM_Result *
+pqParseInput(GTM_Conn *conn)
+{
+	char		id;
+	int			msgLength;
+	int			avail;
+	GTM_Result	*result = NULL;
+
+	if (conn->result == NULL)
+	{
+		conn->result = (GTM_Result *) calloc(1, sizeof (GTM_Result));
+		if (conn->result == NULL)
+		{
+			/* out of memory: abandon the connection, as for loss of sync */
+			printfGTMPQExpBuffer(&conn->errorMessage, "out of memory\n");
+			close(conn->sock);
+			conn->sock = -1;
+			conn->status = CONNECTION_BAD;
+			return NULL;
+		}
+	}
+	else
+		gtmpqFreeResultData(conn->result, conn->is_proxy);
+
+	result = conn->result;
+
+	/* Get the type code and length first; return if not enough data yet. */
+	conn->inCursor = conn->inStart;
+	if (gtmpqGetc(&id, conn))
+		return NULL;
+	if (gtmpqGetInt(&msgLength, 4, conn))
+		return NULL;
+
+	/*
+	 * Try to validate message type/length here.  A length less than 4 is
+	 * definitely broken.  Large lengths should only be believed for a few
+	 * message types.
+	 */
+	if (msgLength < 4 ||
+		(msgLength > 30000 && !VALID_LONG_MESSAGE_TYPE(id)))
+	{
+		handleSyncLoss(conn, id, msgLength);
+		return NULL;
+	}
+
+	/* Can't process if message body isn't all here yet. */
+	conn->result->gr_msglen = msgLength -= 4;
+	avail = conn->inEnd - conn->inCursor;
+	if (avail < msgLength)
+	{
+		/*
+		 * Before returning, enlarge the input buffer if needed to hold
+		 * the whole message.  This is better than leaving it to
+		 * gtmpqReadData because we can avoid multiple cycles of realloc()
+		 * when the message is large, and we can react right here if the
+		 * buffer cannot be made big enough.
+		 */
+		if (gtmpqCheckInBufferSpace(conn->inCursor + (size_t) msgLength,
+								 conn))
+		{
+			/*
+			 * XXX add some better recovery code... plan is to skip over
+			 * the message using its length, then report an error. For the
+			 * moment, just treat this like loss of sync (which indeed it
+			 * might be!)
+			 */
+			handleSyncLoss(conn, id, msgLength);
+		}
+		return NULL;
+	}
+
+	switch (id)
+	{
+		case 'S':		/* command complete */
+			if (gtmpqParseSuccess(conn, result))
+				return NULL;
+			break;
+
+		case 'E':		/* error return */
+			if (gtmpqGetError(conn, result))
+				return NULL;
+			result->gr_status = -1;
+			break;
+		default:
+			printfGTMPQExpBuffer(&conn->errorMessage,
+							  "unexpected response from server; first received character was \"%c\"\n",
+							  id);
+			conn->inCursor += msgLength;
+			break;
+	}					/* switch on protocol character */
+	/* Successfully consumed this message */
+	if (conn->inCursor == conn->inStart + 5 + msgLength)
+	{
+		/* Normal case: parsing agrees with specified length */
+		conn->inStart = conn->inCursor;
+	}
+	else
+	{
+		/* Trouble --- report it */
+		printfGTMPQExpBuffer(&conn->errorMessage,
+						  "message contents do not agree with length in message type \"%c\"\n",
+						  id);
+		/* trust the specified message length as what to skip */
+		conn->inStart += 5 + msgLength;
+	}
+
+	return result;
+}
+
+/*
+ * handleSyncLoss: give up on a connection whose message framing is lost
+ *
+ * Records the offending message type and length, then closes the socket.
+ */
+static void
+handleSyncLoss(GTM_Conn *conn, char id, int msgLength)
+{
+	printfGTMPQExpBuffer(&conn->errorMessage,
+	"lost synchronization with server: got message type \"%c\", length %d\n",
+					  id, msgLength);
+	conn->status = CONNECTION_BAD;		/* No more connection to backend */
+	close(conn->sock);
+	conn->sock = -1;
+}
+
+/*
+ * Attempt to read an Error or Notice response message.
+ * This is possible in several places, so we break it out as a subroutine.
+ * Entry: 'E' message type and length have already been consumed.
+ * Exit: returns 0 if successfully consumed message.
+ *		 returns nonzero (EOF or 1) if not enough data or out of memory.
+ */
+int
+gtmpqGetError(GTM_Conn *conn, GTM_Result *result)
+{
+	char		id;
+
+	/*
+	 * If we are a GTM proxy, expect an additional proxy header in the incoming
+	 * message.
+	 */
+	if (conn->is_proxy)
+	{
+		if (gtmpqGetnchar((char *)&result->gr_proxyhdr,
+					sizeof (GTM_ProxyMsgHeader), conn))
+			return 1;
+		result->gr_msglen -= sizeof (GTM_ProxyMsgHeader);
+
+		/*
+		 * Grow the proxy data buffer if it cannot hold the payload.  Plain
+		 * realloc is used; the buffer is kept for reuse, never freed here.
+		 */
+		if (result->gr_proxy_datalen < result->gr_msglen)
+		{
+			char	   *newbuf = (char *) realloc(result->gr_proxy_data,
+												   result->gr_msglen);
+
+			if (newbuf == NULL)
+			{
+				/* the old buffer is still valid and stays in place */
+				printfGTMPQExpBuffer(&conn->errorMessage, "out of memory\n");
+				result->gr_status = 1;
+				return 1;
+			}
+			result->gr_proxy_data = newbuf;
+			result->gr_proxy_datalen = result->gr_msglen;
+		}
+
+		if (gtmpqGetnchar((char *)result->gr_proxy_data,
+					result->gr_msglen, conn))
+		{
+			result->gr_status = 1;
+			return 1;
+		}
+
+		return 0;
+	}
+	else
+		result->gr_proxyhdr.ph_conid = InvalidGTMProxyConnID;
+
+	/* Read the fields; each field string is read into conn->errorMessage. */
+	for (;;)
+	{
+		if (gtmpqGetc(&id, conn))
+			return EOF;
+		if (id == '\0')
+			break;
+		if (gtmpqGets(&conn->errorMessage, conn))
+			return EOF;
+	}
+	return 0;
+}
+
+/*
+ * GTMPQgetResult
+ *	  Return the next parsed GTM_Result, blocking until a complete message
+ *	  has arrived.  Returns NULL if conn is NULL, or if sending, waiting
+ *	  or reading fails (the failing callee sets conn->errorMessage).
+ */
+
+GTM_Result *
+GTMPQgetResult(GTM_Conn *conn)
+{
+	GTM_Result *parsed;
+
+	if (conn == NULL)
+		return NULL;
+
+	for (;;)
+	{
+		int			pending;
+
+		/* Done as soon as the buffered input yields a whole message. */
+		parsed = pqParseInput(conn);
+		if (parsed != NULL)
+			return parsed;
+
+		/*
+		 * Push out any unsent data first; otherwise we might wait for the
+		 * answer to a request that was never transmitted.
+		 */
+		do
+		{
+			pending = gtmpqFlush(conn);
+		} while (pending > 0 && gtmpqWait(false, true, conn) == 0);
+
+		/* pending > 0 here means the wait failed; < 0 means the send did */
+		if (pending != 0)
+			return NULL;
+
+		/*
+		 * Wait for some more data, and load it.  On failure
+		 * conn->errorMessage has been set by gtmpqWait or gtmpqReadData.
+		 */
+		if (gtmpqWait(true, false, conn))
+			return NULL;
+		if (gtmpqReadData(conn) < 0)
+			return NULL;
+	}
+}
+
+/* Parse the body of an 'S' (success) message into *result; 0 means OK. */
+static int
+gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result)
+{
+	int xcnt, xsize;
+	GlobalTransactionId *xip = NULL;
+
+	result->gr_status = 0;
+
+	if (gtmpqGetInt((int *)&result->gr_type, 4, conn))
+		return 1;
+	result->gr_msglen -= 4;
+
+	if (conn->is_proxy)
+	{
+		if (gtmpqGetnchar((char *)&result->gr_proxyhdr,
+					sizeof (GTM_ProxyMsgHeader), conn))
+			return 1;
+		result->gr_msglen -= sizeof (GTM_ProxyMsgHeader);
+	}
+	else
+		result->gr_proxyhdr.ph_conid = InvalidGTMProxyConnID;
+
+	/*
+	 * If we are dealing with a proxied message, just read the remaining binary
+	 * data which can then be forwarded to the right backend.
+	 */
+	if (result->gr_proxyhdr.ph_conid != InvalidGTMProxyConnID)
+	{
+		/*
+		 * If the allocated buffer is not large enough to hold the proxied
+		 * data, realloc the buffer.
+		 *
+		 * Since the client side code is shared between the proxy and the
+		 * backend, we don't want any memory context management etc here. So
+		 * just use plain realloc. Anyways, we don't intend to free the memory.
+		 */
+		if (result->gr_proxy_datalen < result->gr_msglen)
+		{
+			char	   *newbuf = (char *) realloc(result->gr_proxy_data,
+												   result->gr_msglen);
+
+			if (newbuf == NULL)
+			{
+				/* the old buffer is still valid and stays in place */
+				printfGTMPQExpBuffer(&conn->errorMessage, "out of memory\n");
+				result->gr_status = 1;
+				return 1;
+			}
+			result->gr_proxy_data = newbuf;
+			result->gr_proxy_datalen = result->gr_msglen;
+		}
+
+		if (gtmpqGetnchar((char *)result->gr_proxy_data,
+					result->gr_msglen, conn))
+		{
+			result->gr_status = 1;
+			return 1;
+		}
+
+		return result->gr_status;
+	}
+
+	switch (result->gr_type)
+	{
+		case TXN_BEGIN_RESULT:
+			if (gtmpqGetnchar((char *)&result->gr_resdata.grd_txnhandle,
+						   sizeof (GTM_TransactionHandle), conn))
+				result->gr_status = -1;
+			break;
+
+		case TXN_BEGIN_GETGXID_RESULT:
+		case TXN_BEGIN_GETGXID_AUTOVACUUM_RESULT:
+		case TXN_PREPARE_RESULT:
+		case TXN_COMMIT_RESULT:
+		case TXN_ROLLBACK_RESULT:
+			if (gtmpqGetnchar((char *)&result->gr_resdata.grd_gxid,
+						   sizeof (GlobalTransactionId), conn))
+				result->gr_status = -1;
+			break;
+
+		case TXN_GET_GXID_RESULT:
+			if (gtmpqGetnchar((char *)&result->gr_resdata.grd_txn.txnhandle,
+						   sizeof (GTM_TransactionHandle), conn))
+			{
+				result->gr_status = -1;
+				break;
+			}
+
+			if (gtmpqGetnchar((char *)&result->gr_resdata.grd_txn.gxid,
+						   sizeof (GlobalTransactionId), conn))
+				result->gr_status = -1;
+			break;
+
+		case TXN_BEGIN_GETGXID_MULTI_RESULT:
+			if (gtmpqGetnchar((char *)&result->gr_resdata.grd_txn_get_multi.txn_count,
+						   sizeof (int), conn))
+			{
+				result->gr_status = -1;
+				break;
+			}
+			if (gtmpqGetnchar((char *)&result->gr_resdata.grd_txn_get_multi.start_gxid,
+						   sizeof (GlobalTransactionId), conn))
+				result->gr_status = -1;
+			break;
+
+		case TXN_COMMIT_MULTI_RESULT:
+		case TXN_ROLLBACK_MULTI_RESULT:
+			if (gtmpqGetnchar((char *)&result->gr_resdata.grd_txn_rc_multi.txn_count,
+						   sizeof (int), conn))
+			{
+				result->gr_status = -1;
+				break;
+			}
+
+			/*
+			 * NOTE(review): txn_count comes straight off the wire and is not
+			 * checked against the capacity of the status buffer -- confirm
+			 * the bound and add a range check.
+			 */
+			if (gtmpqGetnchar((char *)result->gr_resdata.grd_txn_rc_multi.status,
+						   sizeof (int) * result->gr_resdata.grd_txn_rc_multi.txn_count, conn))
+				result->gr_status = -1;
+			break;
+
+		case SNAPSHOT_GXID_GET_RESULT:
+			if (gtmpqGetnchar((char *)&result->gr_resdata.grd_txn.txnhandle,
+						   sizeof (GTM_TransactionHandle), conn))
+			{
+				result->gr_status = -1;
+				break;
+			}
+			/* Fall through */
+		case SNAPSHOT_GET_RESULT:
+			if (gtmpqGetnchar((char *)&result->gr_resdata.grd_txn.gxid,
+						   sizeof (GlobalTransactionId), conn))
+			{
+				result->gr_status = -1;
+				break;
+			}
+			/* Fall through */
+		case SNAPSHOT_GET_MULTI_RESULT:
+			if (gtmpqGetnchar((char *)&result->gr_resdata.grd_txn_snap_multi.txn_count,
+						   sizeof (int), conn))
+			{
+				result->gr_status = -1;
+				break;
+			}
+
+			/* NOTE(review): txn_count is not range-checked here either. */
+			if (gtmpqGetnchar((char *)result->gr_resdata.grd_txn_snap_multi.status,
+						   sizeof (int) * result->gr_resdata.grd_txn_snap_multi.txn_count, conn))
+			{
+				result->gr_status = -1;
+				break;
+			}
+
+			/* xmin, xmax, recent global xmin, then the 4-byte xip count */
+			if (gtmpqGetnchar((char *)&result->gr_snapshot.sn_xmin,
+						   sizeof (GlobalTransactionId), conn) ||
+				gtmpqGetnchar((char *)&result->gr_snapshot.sn_xmax,
+						   sizeof (GlobalTransactionId), conn) ||
+				gtmpqGetnchar((char *)&result->gr_snapshot.sn_recent_global_xmin,
+						   sizeof (GlobalTransactionId), conn) ||
+				gtmpqGetInt(&result->gr_snapshot.sn_xcnt, 4, conn))
+			{
+				result->gr_status = -1;
+				break;
+			}
+
+			xsize = result->gr_xip_size;
+			xcnt = result->gr_snapshot.sn_xcnt;
+			xip = result->gr_snapshot.sn_xip;
+
+			/*
+			 * The whole message is already in the input buffer, so a count
+			 * that is negative or needs more bytes than remain is bogus;
+			 * reject it before sizing an allocation from it.
+			 */
+			if (xcnt < 0 || (size_t) xcnt >
+				(size_t) (conn->inEnd - conn->inCursor) / sizeof (GlobalTransactionId))
+			{
+				result->gr_status = -1;
+				break;
+			}
+			if (xcnt == 0)
+				break;			/* nothing to read; keep any existing array */
+
+			if ((xip == NULL) || (xcnt > xsize))
+			{
+				xip = (GlobalTransactionId *) realloc(xip, sizeof (GlobalTransactionId) * xcnt);
+				if (xip == NULL)
+				{
+					/* sn_xip still points at the old, valid array */
+					result->gr_status = -1;
+					break;
+				}
+				result->gr_snapshot.sn_xip = xip;
+				result->gr_xip_size = xcnt;
+			}
+
+			if (gtmpqGetnchar((char *)xip, sizeof (GlobalTransactionId) * xcnt, conn))
+				result->gr_status = -1;
+			break;
+
+		case SEQUENCE_INIT_RESULT:
+		case SEQUENCE_RESET_RESULT:
+		case SEQUENCE_CLOSE_RESULT:
+			if (gtmpqReadSeqKey(&result->gr_resdata.grd_seqkey, conn))
+				result->gr_status = -1;
+			break;
+
+		case SEQUENCE_GET_CURRENT_RESULT:
+		case SEQUENCE_GET_NEXT_RESULT:
+			if (gtmpqReadSeqKey(&result->gr_resdata.grd_seq.seqkey, conn))
+			{
+				result->gr_status = -1;
+				break;
+			}
+			if (gtmpqGetnchar((char *)&result->gr_resdata.grd_seq.seqval,
+						   sizeof (GTM_Sequence), conn))
+				result->gr_status = -1;
+			break;
+
+		case TXN_GET_STATUS_RESULT:
+		case TXN_GET_ALL_PREPARED_RESULT:
+			/* no payload is parsed for these result types */
+			break;
+
+		default:
+			printfGTMPQExpBuffer(&conn->errorMessage,
+							  "unexpected result type from server; result typr was \"%d\"\n",
+							  result->gr_type);
+			result->gr_status = -1;
+			break;
+	}
+
+	return (result->gr_status);
+}
+
+static int
+gtmpqReadSeqKey(GTM_SequenceKey seqkey, GTM_Conn *conn)
+{
+	/* Never leave a stale pointer behind for gtmpqFreeResultData to free */
+	seqkey->gsk_key = NULL;
+
+	/* Read the key length and sanity-check it */
+	if (gtmpqGetInt(&seqkey->gsk_keylen, 4, conn))
+		return EINVAL;
+	if (seqkey->gsk_keylen <= 0 || seqkey->gsk_keylen > GTM_MAX_SEQKEY_LENGTH)
+		return EINVAL;
+
+	if ((seqkey->gsk_key = (char *) malloc(seqkey->gsk_keylen)) == NULL)
+		return ENOMEM;
+	if (gtmpqGetnchar(seqkey->gsk_key, seqkey->gsk_keylen, conn))
+	{
+		free(seqkey->gsk_key);
+		seqkey->gsk_key = NULL;
+		return EINVAL;
+	}
+
+	return 0;
+}
+
+void
+gtmpqFreeResultData(GTM_Result *result, bool is_proxy)
+{
+	/*
+	 * If we are running as a GTM proxy, we don't have anything to do. This may
+	 * change though as we add more message types below and some of them may
+	 * need cleanup even at the proxy level
+	 */
+	if (is_proxy)
+		return;
+
+	switch (result->gr_type)
+	{
+		case SEQUENCE_INIT_RESULT:
+		case SEQUENCE_RESET_RESULT:
+		case SEQUENCE_CLOSE_RESULT:
+			/* free(NULL) is a no-op, so no NULL test is needed */
+			free(result->gr_resdata.grd_seqkey.gsk_key);
+			result->gr_resdata.grd_seqkey.gsk_key = NULL;
+			break;
+
+		case SEQUENCE_GET_CURRENT_RESULT:
+		case SEQUENCE_GET_NEXT_RESULT:
+			/* reset the same member that was freed, not grd_seqkey */
+			free(result->gr_resdata.grd_seq.seqkey.gsk_key);
+			result->gr_resdata.grd_seq.seqkey.gsk_key = NULL;
+			break;
+
+		case TXN_GET_STATUS_RESULT:
+			break;
+
+		case TXN_GET_ALL_PREPARED_RESULT:
+			break;
+
+		case SNAPSHOT_GET_RESULT:
+		case SNAPSHOT_GXID_GET_RESULT:
+			/*
+			 * Lets not free the xip array in the snapshot since we may need it
+			 * again shortly
+			 */
+			break;
+
+		default:
+			break;
+	}
+}
diff --git a/src/gtm/client/gtm_client.c b/src/gtm/client/gtm_client.c
new file mode 100644
index 0000000000..6b22a81c53
--- /dev/null
+++ b/src/gtm/client/gtm_client.c
@@ -0,0 +1,515 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_client.c
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+/* Time in seconds to wait for a response from GTM */
+/* We should consider making this a GUC */
+#define CLIENT_GTM_TIMEOUT 20
+
+#include <time.h>
+
+#include "gtm/gtm_c.h"
+
+#include "gtm/libpq-fe.h"
+#include "gtm/libpq-int.h"
+
+#include "gtm/gtm_client.h"
+#include "gtm/gtm_msg.h"
+#include "gtm/assert.h"
+
+void GTM_FreeResult(GTM_Result *result, bool is_proxy);
+
+/*
+ * Connection Management API
+ */
+/*
+ * connect_gtm
+ *
+ * Thin wrapper: passes connect_string to PQconnectGTM() and returns its
+ * result unchanged.
+ */
+GTM_Conn *
+connect_gtm(const char *connect_string)
+{
+	return PQconnectGTM(connect_string);
+}
+
+/*
+ * disconnect_gtm
+ *
+ * Thin wrapper: hands the connection to GTMPQfinish().
+ * NOTE(review): presumably conn must not be used afterwards -- confirm
+ * against GTMPQfinish().
+ */
+void
+disconnect_gtm(GTM_Conn *conn)
+{
+	GTMPQfinish(conn);
+}
+
+/*
+ * Transaction Management API
+ */
+/*
+ * begin_transaction
+ *
+ * Send a MSG_TXN_BEGIN_GETGXID request carrying the isolation level and a
+ * read-only flag (always false here), then wait for the reply with a
+ * deadline of now + CLIENT_GTM_TIMEOUT seconds.
+ *
+ * Returns the GXID found in the result when its status is 0, and
+ * InvalidGlobalTransactionId if the request cannot be sent, no result is
+ * obtained, or the result carries a non-zero status.
+ */
+GlobalTransactionId
+begin_transaction(GTM_Conn *conn, GTM_IsolationLevel isolevel)
+{
+	bool		read_only = false;
+	GTM_Result *result;
+	time_t		deadline;
+
+	/* Build the message, close it, and flush it out to the server. */
+	if (gtmpqPutMsgStart('C', true, conn) ||
+		gtmpqPutInt(MSG_TXN_BEGIN_GETGXID, sizeof (GTM_MessageType), conn) ||
+		gtmpqPutInt(isolevel, sizeof (GTM_IsolationLevel), conn) ||
+		gtmpqPutc(read_only, conn) ||
+		gtmpqPutMsgEnd(conn) ||
+		gtmpqFlush(conn))
+		return InvalidGlobalTransactionId;
+
+	/* Wait for the reply and pull it in. */
+	deadline = time(NULL) + CLIENT_GTM_TIMEOUT;
+	if (gtmpqWaitTimed(true, false, conn, deadline) ||
+		gtmpqReadData(conn) < 0)
+		return InvalidGlobalTransactionId;
+
+	result = GTMPQgetResult(conn);
+	if (result == NULL || result->gr_status != 0)
+		return InvalidGlobalTransactionId;
+
+	return result->gr_resdata.grd_gxid;
+}
+
+/*
+ * begin_transaction_autovacuum
+ *
+ * Same exchange as begin_transaction(), but the request is sent with the
+ * MSG_TXN_BEGIN_GETGXID_AUTOVACUUM message type (used for an autovacuum
+ * worker process).
+ *
+ * Returns the GXID found in the result when its status is 0, and
+ * InvalidGlobalTransactionId if the request cannot be sent, no result is
+ * obtained, or the result carries a non-zero status.
+ */
+GlobalTransactionId
+begin_transaction_autovacuum(GTM_Conn *conn, GTM_IsolationLevel isolevel)
+{
+	bool		read_only = false;
+	GTM_Result *result;
+	time_t		deadline;
+
+	/* Build the message, close it, and flush it out to the server. */
+	if (gtmpqPutMsgStart('C', true, conn) ||
+		gtmpqPutInt(MSG_TXN_BEGIN_GETGXID_AUTOVACUUM, sizeof (GTM_MessageType), conn) ||
+		gtmpqPutInt(isolevel, sizeof (GTM_IsolationLevel), conn) ||
+		gtmpqPutc(read_only, conn) ||
+		gtmpqPutMsgEnd(conn) ||
+		gtmpqFlush(conn))
+		return InvalidGlobalTransactionId;
+
+	/* Wait for the reply and pull it in. */
+	deadline = time(NULL) + CLIENT_GTM_TIMEOUT;
+	if (gtmpqWaitTimed(true, false, conn, deadline) ||
+		gtmpqReadData(conn) < 0)
+		return InvalidGlobalTransactionId;
+
+	result = GTMPQgetResult(conn);
+	if (result == NULL || result->gr_status != 0)
+		return InvalidGlobalTransactionId;
+
+	return result->gr_resdata.grd_gxid;
+}
+
+/*
+ * commit_transaction
+ *
+ * Send a MSG_TXN_COMMIT request for gxid and wait for the reply with a
+ * deadline of now + CLIENT_GTM_TIMEOUT seconds.
+ *
+ * Returns the status carried by the result, or -1 if the request cannot be
+ * sent or no result is obtained.
+ */
+int
+commit_transaction(GTM_Conn *conn, GlobalTransactionId gxid)
+{
+	GTM_Result *result;
+	time_t		deadline;
+
+	/*
+	 * Build the message, close it, and flush it out to the server.  The
+	 * gxid is sent as its raw in-memory bytes, preceded by a flag byte that
+	 * is always true here.
+	 * NOTE(review): presumably the flag tells the server that a GXID
+	 * follows -- confirm against the server-side parser.
+	 */
+	if (gtmpqPutMsgStart('C', true, conn) ||
+		gtmpqPutInt(MSG_TXN_COMMIT, sizeof (GTM_MessageType), conn) ||
+		gtmpqPutc(true, conn) ||
+		gtmpqPutnchar((char *)&gxid, sizeof (GlobalTransactionId), conn) ||
+		gtmpqPutMsgEnd(conn) ||
+		gtmpqFlush(conn))
+		return -1;
+
+	/* Wait for the reply and pull it in. */
+	deadline = time(NULL) + CLIENT_GTM_TIMEOUT;
+	if (gtmpqWaitTimed(true, false, conn, deadline) ||
+		gtmpqReadData(conn) < 0)
+		return -1;
+
+	result = GTMPQgetResult(conn);
+	if (result == NULL)
+		return -1;
+
+	/* A successful reply must be a commit result for our own gxid. */
+	if (result->gr_status == 0)
+	{
+		Assert(result->gr_type == TXN_COMMIT_RESULT);
+		Assert(result->gr_resdata.grd_gxid == gxid);
+	}
+
+	return result->gr_status;
+}
+
+/*
+ * abort_transaction
+ *
+ * Send a MSG_TXN_ROLLBACK request for gxid and wait for the reply with a
+ * deadline of now + CLIENT_GTM_TIMEOUT seconds.
+ *
+ * Returns the status carried by the result, or -1 if the request cannot be
+ * sent or no result is obtained.
+ */
+int
+abort_transaction(GTM_Conn *conn, GlobalTransactionId gxid)
+{
+	GTM_Result *result;
+	time_t		deadline;
+
+	/*
+	 * Build the message, close it, and flush it out to the server.  The
+	 * gxid is sent as its raw in-memory bytes, preceded by a flag byte that
+	 * is always true here.
+	 */
+	if (gtmpqPutMsgStart('C', true, conn) ||
+		gtmpqPutInt(MSG_TXN_ROLLBACK, sizeof (GTM_MessageType), conn) ||
+		gtmpqPutc(true, conn) ||
+		gtmpqPutnchar((char *)&gxid, sizeof (GlobalTransactionId), conn) ||
+		gtmpqPutMsgEnd(conn) ||
+		gtmpqFlush(conn))
+		return -1;
+
+	/* Wait for the reply and pull it in. */
+	deadline = time(NULL) + CLIENT_GTM_TIMEOUT;
+	if (gtmpqWaitTimed(true, false, conn, deadline) ||
+		gtmpqReadData(conn) < 0)
+		return -1;
+
+	result = GTMPQgetResult(conn);
+	if (result == NULL)
+		return -1;
+
+	/* A successful reply must be a rollback result for our own gxid. */
+	if (result->gr_status == 0)
+	{
+		Assert(result->gr_type == TXN_ROLLBACK_RESULT);
+		Assert(result->gr_resdata.grd_gxid == gxid);
+	}
+
+	return result->gr_status;
+}
+
+/*
+ * prepare_transaction
+ *
+ * Send a MSG_TXN_PREPARE request for gxid together with the list of
+ * participating nodes (nodecnt entries of nodes[]), and wait for the reply
+ * with a deadline of now + CLIENT_GTM_TIMEOUT seconds.
+ *
+ * Returns the status carried by the result, or -1 if the request cannot be
+ * sent or no result is obtained.
+ */
+int
+prepare_transaction(GTM_Conn *conn, GlobalTransactionId gxid,
+					int nodecnt, PGXC_NodeId nodes[])
+{
+	GTM_Result *result;
+	time_t		deadline;
+
+	/*
+	 * Build the message, close it, and flush it out to the server.  Both
+	 * the gxid and the node array are sent as their raw in-memory bytes.
+	 */
+	if (gtmpqPutMsgStart('C', true, conn) ||
+		gtmpqPutInt(MSG_TXN_PREPARE, sizeof (GTM_MessageType), conn) ||
+		gtmpqPutc(true, conn) ||
+		gtmpqPutnchar((char *)&gxid, sizeof (GlobalTransactionId), conn) ||
+		gtmpqPutInt(nodecnt, sizeof (int), conn) ||
+		gtmpqPutnchar((char *)nodes, sizeof (PGXC_NodeId) * nodecnt, conn) ||
+		gtmpqPutMsgEnd(conn) ||
+		gtmpqFlush(conn))
+		return -1;
+
+	/* Wait for the reply and pull it in. */
+	deadline = time(NULL) + CLIENT_GTM_TIMEOUT;
+	if (gtmpqWaitTimed(true, false, conn, deadline) ||
+		gtmpqReadData(conn) < 0)
+		return -1;
+
+	result = GTMPQgetResult(conn);
+	if (result == NULL)
+		return -1;
+
+	/* A successful reply must be a prepare result for our own gxid. */
+	if (result->gr_status == 0)
+	{
+		Assert(result->gr_type == TXN_PREPARE_RESULT);
+		Assert(result->gr_resdata.grd_gxid == gxid);
+	}
+
+	return result->gr_status;
+}
+
+/*
+ * Snapshot Management API
+ */
+/*
+ * get_snapshot
+ *
+ * Send a MSG_SNAPSHOT_GET request for gxid, passing canbe_grouped along as
+ * a flag byte, and wait for the reply with a deadline of now +
+ * CLIENT_GTM_TIMEOUT seconds.
+ *
+ * Returns a pointer to the snapshot embedded in the result (not a copy) when
+ * the result status is 0; returns NULL if the request cannot be sent, no
+ * result is obtained, or the result carries a non-zero status.
+ */
+GTM_SnapshotData *
+get_snapshot(GTM_Conn *conn, GlobalTransactionId gxid, bool canbe_grouped)
+{
+	GTM_Result *result;
+	time_t		deadline;
+
+	/* Build the message, close it, and flush it out to the server. */
+	if (gtmpqPutMsgStart('C', true, conn) ||
+		gtmpqPutInt(MSG_SNAPSHOT_GET, sizeof (GTM_MessageType), conn) ||
+		gtmpqPutc(canbe_grouped, conn) ||
+		gtmpqPutc(true, conn) ||
+		gtmpqPutnchar((char *)&gxid, sizeof (GlobalTransactionId), conn) ||
+		gtmpqPutMsgEnd(conn) ||
+		gtmpqFlush(conn))
+		return NULL;
+
+	/* Wait for the reply and pull it in. */
+	deadline = time(NULL) + CLIENT_GTM_TIMEOUT;
+	if (gtmpqWaitTimed(true, false, conn, deadline) ||
+		gtmpqReadData(conn) < 0)
+		return NULL;
+
+	result = GTMPQgetResult(conn);
+	if (result == NULL || result->gr_status != 0)
+		return NULL;
+
+	/* A successful reply must be a snapshot result for our own gxid. */
+	Assert(result->gr_type == SNAPSHOT_GET_RESULT);
+	Assert(result->gr_resdata.grd_txn.gxid == gxid);
+
+	return &(result->gr_snapshot);
+}
+
+/*
+ * Sequence Management API
+ */
+/*
+ * open_sequence
+ *
+ * Send a MSG_SEQUENCE_INIT request: the key (4-byte length, then the key
+ * bytes), followed by increment, minval, maxval and startval as raw
+ * GTM_Sequence bytes and the cycle flag as one byte.  Then wait for the
+ * reply with a deadline of now + CLIENT_GTM_TIMEOUT seconds.
+ *
+ * Returns the status carried by the result, or -1 if the request cannot be
+ * sent or no result is obtained.
+ */
+int
+open_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence increment,
+			  GTM_Sequence minval, GTM_Sequence maxval,
+			  GTM_Sequence startval, bool cycle)
+{
+	GTM_Result *result;
+	time_t		deadline;
+
+	/* Build the message, close it, and flush it out to the server. */
+	if (gtmpqPutMsgStart('C', true, conn) ||
+		gtmpqPutInt(MSG_SEQUENCE_INIT, sizeof (GTM_MessageType), conn) ||
+		gtmpqPutInt(key->gsk_keylen, 4, conn) ||
+		gtmpqPutnchar(key->gsk_key, key->gsk_keylen, conn) ||
+		gtmpqPutnchar((char *)&increment, sizeof (GTM_Sequence), conn) ||
+		gtmpqPutnchar((char *)&minval, sizeof (GTM_Sequence), conn) ||
+		gtmpqPutnchar((char *)&maxval, sizeof (GTM_Sequence), conn) ||
+		gtmpqPutnchar((char *)&startval, sizeof (GTM_Sequence), conn) ||
+		gtmpqPutc(cycle, conn) ||
+		gtmpqPutMsgEnd(conn) ||
+		gtmpqFlush(conn))
+		return -1;
+
+	/* Wait for the reply and pull it in. */
+	deadline = time(NULL) + CLIENT_GTM_TIMEOUT;
+	if (gtmpqWaitTimed(true, false, conn, deadline) ||
+		gtmpqReadData(conn) < 0)
+		return -1;
+
+	result = GTMPQgetResult(conn);
+
+	return (result != NULL) ? result->gr_status : -1;
+}
+
+/*
+ * close_sequence
+ *
+ * Send a MSG_SEQUENCE_CLOSE request for the given key (4-byte length, then
+ * the key bytes) and wait for the reply with a deadline of now +
+ * CLIENT_GTM_TIMEOUT seconds.
+ *
+ * Returns the status carried by the result, or -1 if the request cannot be
+ * sent or no result is obtained.
+ */
+int
+close_sequence(GTM_Conn *conn, GTM_SequenceKey key)
+{
+	GTM_Result *result;
+	time_t		deadline;
+
+	/* Build the message, close it, and flush it out to the server. */
+	if (gtmpqPutMsgStart('C', true, conn) ||
+		gtmpqPutInt(MSG_SEQUENCE_CLOSE, sizeof (GTM_MessageType), conn) ||
+		gtmpqPutInt(key->gsk_keylen, 4, conn) ||
+		gtmpqPutnchar(key->gsk_key, key->gsk_keylen, conn) ||
+		gtmpqPutMsgEnd(conn) ||
+		gtmpqFlush(conn))
+		return -1;
+
+	/* Wait for the reply and pull it in. */
+	deadline = time(NULL) + CLIENT_GTM_TIMEOUT;
+	if (gtmpqWaitTimed(true, false, conn, deadline) ||
+		gtmpqReadData(conn) < 0)
+		return -1;
+
+	result = GTMPQgetResult(conn);
+
+	return (result != NULL) ? result->gr_status : -1;
+}
+
+/*
+ * get_current
+ *
+ * Send a MSG_SEQUENCE_GET_CURRENT request for the given key (4-byte length,
+ * then the key bytes) and wait for the reply with a deadline of now +
+ * CLIENT_GTM_TIMEOUT seconds.
+ *
+ * Returns the sequence value carried by the result when its status is 0 and
+ * InvalidSequenceValue when the status is non-zero.  If the request cannot
+ * be sent or no result is obtained, -1 is returned instead.
+ *
+ * NOTE(review): the two failure paths return different values (-1 versus
+ * InvalidSequenceValue).  Callers may depend on this, so it is kept as is --
+ * confirm before unifying.
+ */
+GTM_Sequence
+get_current(GTM_Conn *conn, GTM_SequenceKey key)
+{
+	GTM_Result *result;
+	time_t		deadline;
+
+	/* Build the message, close it, and flush it out to the server. */
+	if (gtmpqPutMsgStart('C', true, conn) ||
+		gtmpqPutInt(MSG_SEQUENCE_GET_CURRENT, sizeof (GTM_MessageType), conn) ||
+		gtmpqPutInt(key->gsk_keylen, 4, conn) ||
+		gtmpqPutnchar(key->gsk_key, key->gsk_keylen, conn) ||
+		gtmpqPutMsgEnd(conn) ||
+		gtmpqFlush(conn))
+		return -1;
+
+	/* Wait for the reply and pull it in. */
+	deadline = time(NULL) + CLIENT_GTM_TIMEOUT;
+	if (gtmpqWaitTimed(true, false, conn, deadline) ||
+		gtmpqReadData(conn) < 0)
+		return -1;
+
+	result = GTMPQgetResult(conn);
+	if (result == NULL)
+		return -1;
+
+	if (result->gr_status != 0)
+		return InvalidSequenceValue;
+
+	return result->gr_resdata.grd_seq.seqval;
+}
+
+/*
+ * get_next
+ *
+ * Send a MSG_SEQUENCE_GET_NEXT request for the given key (4-byte length,
+ * then the key bytes) and wait for the reply with a deadline of now +
+ * CLIENT_GTM_TIMEOUT seconds.
+ *
+ * Returns the sequence value carried by the result when its status is 0 and
+ * InvalidSequenceValue when the status is non-zero.  If the request cannot
+ * be sent or no result is obtained, -1 is returned instead.
+ *
+ * NOTE(review): the two failure paths return different values (-1 versus
+ * InvalidSequenceValue).  Callers may depend on this, so it is kept as is --
+ * confirm before unifying.
+ */
+GTM_Sequence
+get_next(GTM_Conn *conn, GTM_SequenceKey key)
+{
+	GTM_Result *result;
+	time_t		deadline;
+
+	/* Build the message, close it, and flush it out to the server. */
+	if (gtmpqPutMsgStart('C', true, conn) ||
+		gtmpqPutInt(MSG_SEQUENCE_GET_NEXT, sizeof (GTM_MessageType), conn) ||
+		gtmpqPutInt(key->gsk_keylen, 4, conn) ||
+		gtmpqPutnchar(key->gsk_key, key->gsk_keylen, conn) ||
+		gtmpqPutMsgEnd(conn) ||
+		gtmpqFlush(conn))
+		return -1;
+
+	/* Wait for the reply and pull it in. */
+	deadline = time(NULL) + CLIENT_GTM_TIMEOUT;
+	if (gtmpqWaitTimed(true, false, conn, deadline) ||
+		gtmpqReadData(conn) < 0)
+		return -1;
+
+	result = GTMPQgetResult(conn);
+	if (result == NULL)
+		return -1;
+
+	if (result->gr_status != 0)
+		return InvalidSequenceValue;
+
+	return result->gr_resdata.grd_seq.seqval;
+}
+
+/*
+ * reset_sequence
+ *
+ * Send a MSG_SEQUENCE_RESET request for the given key (4-byte length, then
+ * the key bytes) and wait for the reply with a deadline of now +
+ * CLIENT_GTM_TIMEOUT seconds.
+ *
+ * Returns the status carried by the result, or -1 if the request cannot be
+ * sent or no result is obtained.
+ */
+int
+reset_sequence(GTM_Conn *conn, GTM_SequenceKey key)
+{
+	GTM_Result *result;
+	time_t		deadline;
+
+	/* Build the message, close it, and flush it out to the server. */
+	if (gtmpqPutMsgStart('C', true, conn) ||
+		gtmpqPutInt(MSG_SEQUENCE_RESET, sizeof (GTM_MessageType), conn) ||
+		gtmpqPutInt(key->gsk_keylen, 4, conn) ||
+		gtmpqPutnchar(key->gsk_key, key->gsk_keylen, conn) ||
+		gtmpqPutMsgEnd(conn) ||
+		gtmpqFlush(conn))
+		return -1;
+
+	/* Wait for the reply and pull it in. */
+	deadline = time(NULL) + CLIENT_GTM_TIMEOUT;
+	if (gtmpqWaitTimed(true, false, conn, deadline) ||
+		gtmpqReadData(conn) < 0)
+		return -1;
+
+	result = GTMPQgetResult(conn);
+
+	return (result != NULL) ? result->gr_status : -1;
+}
+
+/*
+ * GTM_FreeResult
+ *
+ * Free a GTM_Result: first whatever gtmpqFreeResultData() releases for it,
+ * then the structure itself.  A NULL result is accepted and ignored.
+ */
+void
+GTM_FreeResult(GTM_Result *result, bool is_proxy)
+{
+	if (result != NULL)
+	{
+		gtmpqFreeResultData(result, is_proxy);
+		free(result);
+	}
+}
diff --git a/src/gtm/client/ip.c b/src/gtm/client/ip.c
new file mode 100644
index 0000000000..b210e201c5
--- /dev/null
+++ b/src/gtm/client/ip.c
@@ -0,0 +1,324 @@
+/*-------------------------------------------------------------------------
+ *
+ * ip.c
+ *	  IPv6-aware network access.
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/src/backend/libpq/ip.c,v 1.43 2009/01/01 17:23:42 momjian Exp $
+ *
+ * This file and the IPV6 implementation were initially provided by
+ * Nigel Kukard <nkukard@lbsd.net>, Linux Based Systems Design
+ * http://www.lbsd.net.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/* This is intended to be used in both frontend and backend, so use gtm_c.h */
+#include "gtm/gtm_c.h"
+
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#ifdef HAVE_NETINET_TCP_H
+#include <netinet/tcp.h>
+#endif
+#include <arpa/inet.h>
+#include <sys/file.h>
+
+#include "gtm/gtm_ip.h"
+
+
+static int range_sockaddr_AF_INET(const struct sockaddr_in * addr,
+					   const struct sockaddr_in * netaddr,
+					   const struct sockaddr_in * netmask);
+
+#ifdef HAVE_IPV6
+static int range_sockaddr_AF_INET6(const struct sockaddr_in6 * addr,
+						const struct sockaddr_in6 * netaddr,
+						const struct sockaddr_in6 * netmask);
+#endif
+
+
+/*
+ *	gtm_getaddrinfo_all - look up address info via getaddrinfo()
+ *
+ * A NULL or empty hostname is passed to getaddrinfo() as NULL.  *result is
+ * cleared before the call, so it is NULL on return unless getaddrinfo()
+ * stored something there.  The return value is getaddrinfo()'s.
+ */
+int
+gtm_getaddrinfo_all(const char *hostname, const char *servname,
+				   const struct addrinfo * hintp, struct addrinfo ** result)
+{
+	const char *node = hostname;
+
+	/* not all versions of getaddrinfo() zero *result on failure */
+	*result = NULL;
+
+	/* NULL has special meaning to getaddrinfo(); map "" onto it. */
+	if (node != NULL && node[0] == '\0')
+		node = NULL;
+
+	return getaddrinfo(node, servname, hintp, result);
+}
+
+
+/*
+ *	gtm_freeaddrinfo_all - free an addrinfo list built by getaddrinfo()
+ *
+ * The list is always released with freeaddrinfo(); a NULL list is accepted
+ * and ignored.  hint_ai_family is part of the interface but is not looked
+ * at: this file has no separate Unix-socket lookup routine whose results
+ * would need to be freed differently.
+ */
+void
+gtm_freeaddrinfo_all(int hint_ai_family, struct addrinfo * ai)
+{
+	if (ai == NULL)
+		return;
+
+	/* struct was built by getaddrinfo() */
+	freeaddrinfo(ai);
+}
+
+
+/*
+ *	gtm_getnameinfo_all - get name info via getnameinfo()
+ *
+ * Differs from plain getnameinfo() in two respects: addr is declared as a
+ * sockaddr_storage rather than a struct sockaddr, and when the lookup fails
+ * every non-NULL output buffer (node, service) is filled with "???" so that
+ * the caller always has something to print.
+ *
+ * Returns getnameinfo()'s result code.
+ */
+int
+gtm_getnameinfo_all(const struct sockaddr_storage * addr, int salen,
+				   char *node, int nodelen,
+				   char *service, int servicelen,
+				   int flags)
+{
+	int			status;
+
+	status = getnameinfo((const struct sockaddr *) addr, salen,
+						 node, nodelen,
+						 service, servicelen,
+						 flags);
+	if (status == 0)
+		return status;
+
+	/* Lookup failed: hand back placeholders instead of undefined text. */
+	if (node)
+		strlcpy(node, "???", nodelen);
+	if (service)
+		strlcpy(service, "???", servicelen);
+
+	return status;
+}
+
+/*
+ * gtm_range_sockaddr - is addr within the subnet specified by netaddr/netmask ?
+ *
+ * Dispatches on addr's address family to the matching per-family helper and
+ * returns its answer (1 = inside, 0 = outside).  Any family without a
+ * helper yields 0.
+ *
+ * Note: caller must already have verified that all three addresses are
+ * in the same address family; and AF_UNIX addresses are not supported.
+ */
+int
+gtm_range_sockaddr(const struct sockaddr_storage * addr,
+				  const struct sockaddr_storage * netaddr,
+				  const struct sockaddr_storage * netmask)
+{
+	switch (addr->ss_family)
+	{
+		case AF_INET:
+			return range_sockaddr_AF_INET((struct sockaddr_in *) addr,
+										  (struct sockaddr_in *) netaddr,
+										  (struct sockaddr_in *) netmask);
+#ifdef HAVE_IPV6
+		case AF_INET6:
+			return range_sockaddr_AF_INET6((struct sockaddr_in6 *) addr,
+										   (struct sockaddr_in6 *) netaddr,
+										   (struct sockaddr_in6 *) netmask);
+#endif
+		default:
+			return 0;
+	}
+}
+
+/*
+ * range_sockaddr_AF_INET - IPv4 subnet membership test
+ *
+ * Returns 1 if addr and netaddr agree on every bit selected by netmask,
+ * 0 otherwise.
+ */
+static int
+range_sockaddr_AF_INET(const struct sockaddr_in * addr,
+					   const struct sockaddr_in * netaddr,
+					   const struct sockaddr_in * netmask)
+{
+	/* compare the two addresses after masking off the host part */
+	if ((addr->sin_addr.s_addr & netmask->sin_addr.s_addr) !=
+		(netaddr->sin_addr.s_addr & netmask->sin_addr.s_addr))
+		return 0;
+
+	return 1;
+}
+
+
+#ifdef HAVE_IPV6
+
+/*
+ * range_sockaddr_AF_INET6 - IPv6 subnet membership test
+ *
+ * Returns 1 if addr and netaddr agree on every bit selected by netmask
+ * across all 16 address bytes, 0 otherwise.
+ */
+static int
+range_sockaddr_AF_INET6(const struct sockaddr_in6 * addr,
+						const struct sockaddr_in6 * netaddr,
+						const struct sockaddr_in6 * netmask)
+{
+	const unsigned char *a = addr->sin6_addr.s6_addr;
+	const unsigned char *n = netaddr->sin6_addr.s6_addr;
+	const unsigned char *m = netmask->sin6_addr.s6_addr;
+	int			byte;
+
+	/* one differing masked byte is enough to be outside the subnet */
+	for (byte = 0; byte < 16; byte++)
+	{
+		if ((a[byte] & m[byte]) != (n[byte] & m[byte]))
+			return 0;
+	}
+
+	return 1;
+}
+#endif   /* HAVE_IPV6 */
+
+/*
+ *	gtm_sockaddr_cidr_mask - make a network mask of the appropriate family
+ *	  and required number of significant bits
+ *
+ * numbits is the prefix length as a decimal string; it must be non-empty,
+ * consist of a number only, and lie in 0..32 for AF_INET (0..128 for
+ * AF_INET6 when built with HAVE_IPV6).
+ *
+ * The resulting mask is placed in *mask, which had better be big enough.
+ * Apart from the family and the address bits, the bytes copied into *mask
+ * are zero.
+ *
+ * Return value is 0 if okay, -1 if not (bad number, out-of-range prefix
+ * length, or unsupported family).
+ */
+int
+gtm_sockaddr_cidr_mask(struct sockaddr_storage * mask, char *numbits, int family)
+{
+	long		bits;
+	char	   *endptr;
+
+	bits = strtol(numbits, &endptr, 10);
+
+	/* reject an empty string and anything trailing the number */
+	if (*numbits == '\0' || *endptr != '\0')
+		return -1;
+
+	switch (family)
+	{
+		case AF_INET:
+			{
+				struct sockaddr_in mask4;
+				long		maskl;
+
+				if (bits < 0 || bits > 32)
+					return -1;
+				/*
+				 * Start from all zeroes: the whole struct is copied out
+				 * below, so any field left unset would leak stack garbage
+				 * into *mask.
+				 */
+				memset(&mask4, 0, sizeof(mask4));
+				/* avoid "x << 32", which is not portable */
+				if (bits > 0)
+					maskl = (0xffffffffUL << (32 - (int) bits))
+						& 0xffffffffUL;
+				else
+					maskl = 0;
+				mask4.sin_addr.s_addr = htonl(maskl);
+				memcpy(mask, &mask4, sizeof(mask4));
+				break;
+			}
+
+#ifdef HAVE_IPV6
+		case AF_INET6:
+			{
+				struct sockaddr_in6 mask6;
+				int			i;
+
+				if (bits < 0 || bits > 128)
+					return -1;
+				/* same reasoning as for mask4 above */
+				memset(&mask6, 0, sizeof(mask6));
+				/* fill the mask one byte at a time, most significant first */
+				for (i = 0; i < 16; i++)
+				{
+					if (bits <= 0)
+						mask6.sin6_addr.s6_addr[i] = 0;
+					else if (bits >= 8)
+						mask6.sin6_addr.s6_addr[i] = 0xff;
+					else
+					{
+						mask6.sin6_addr.s6_addr[i] =
+							(0xff << (8 - (int) bits)) & 0xff;
+					}
+					bits -= 8;
+				}
+				memcpy(mask, &mask6, sizeof(mask6));
+				break;
+			}
+#endif
+		default:
+			return -1;
+	}
+
+	mask->ss_family = family;
+	return 0;
+}
+
+
+#ifdef HAVE_IPV6
+
+/*
+ * gtm_promote_v4_to_v6_addr --- convert an AF_INET addr to AF_INET6, using
+ *		the standard convention for IPv4 addresses mapped into IPv6 world
+ *
+ * The result is ::ffff:a.b.c.d -- ten zero bytes, two 0xff bytes, then the
+ * four IPv4 address bytes.
+ *
+ * The passed addr is modified in place; be sure it is large enough to
+ * hold the result!  Only the family and the address bytes are set; every
+ * other field of the result is zero.
+ */
+void
+gtm_promote_v4_to_v6_addr(struct sockaddr_storage * addr)
+{
+	struct sockaddr_in v4;
+	struct sockaddr_in6 v6;
+
+	memcpy(&v4, addr, sizeof(v4));
+
+	memset(&v6, 0, sizeof(v6));
+	v6.sin6_family = AF_INET6;
+
+	memset(&v6.sin6_addr.s6_addr[10], 0xff, 2);
+
+	/*
+	 * s_addr is kept in network byte order, i.e. its four bytes are already
+	 * in the order they must appear in the IPv6 address.
+	 */
+	memcpy(&v6.sin6_addr.s6_addr[12], &v4.sin_addr.s_addr, 4);
+
+	memcpy(addr, &v6, sizeof(v6));
+}
+
+/*
+ * gtm_promote_v4_to_v6_mask --- convert an AF_INET netmask to AF_INET6, using
+ *		the standard convention for IPv4 addresses mapped into IPv6 world
+ *
+ * Unlike gtm_promote_v4_to_v6_addr, the twelve leading bytes are all set to
+ * 0xff, so that the whole mapped prefix counts as significant; the four
+ * IPv4 mask bytes follow.
+ *
+ * The passed addr is modified in place; be sure it is large enough to
+ * hold the result!  Only the family and the address bytes are set; every
+ * other field of the result is zero.
+ */
+void
+gtm_promote_v4_to_v6_mask(struct sockaddr_storage * addr)
+{
+	struct sockaddr_in v4;
+	struct sockaddr_in6 v6;
+
+	memcpy(&v4, addr, sizeof(v4));
+
+	memset(&v6, 0, sizeof(v6));
+	v6.sin6_family = AF_INET6;
+
+	memset(&v6.sin6_addr.s6_addr[0], 0xff, 12);
+
+	/*
+	 * s_addr is kept in network byte order, i.e. its four bytes are already
+	 * in the order they must appear in the IPv6 mask.
+	 */
+	memcpy(&v6.sin6_addr.s6_addr[12], &v4.sin_addr.s_addr, 4);
+
+	memcpy(addr, &v6, sizeof(v6));
+}
+
+#endif   /* HAVE_IPV6 */
diff --git a/src/gtm/client/pqexpbuffer.c b/src/gtm/client/pqexpbuffer.c
new file mode 100644
index 0000000000..95c6ee09ee
--- /dev/null
+++ b/src/gtm/client/pqexpbuffer.c
@@ -0,0 +1,373 @@
+/*-------------------------------------------------------------------------
+ *
+ * pqexpbuffer.c
+ *
+ * PQExpBuffer provides an indefinitely-extensible string data type.
+ * It can be used to buffer either ordinary C strings (null-terminated text)
+ * or arbitrary binary data.  All storage is allocated with malloc().
+ *
+ * This module is essentially the same as the backend's StringInfo data type,
+ * but it is intended for use in frontend libpq and client applications.
+ * Thus, it does not rely on palloc() nor elog().
+ *
+ * It does rely on vsnprintf(); if configure finds that libc doesn't provide
+ * a usable vsnprintf(), then a copy of our own implementation of it will
+ * be linked into libpq.
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/interfaces/libpq/pqexpbuffer.c,v 1.25 2008/11/26 00:26:23 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "gtm/gtm_c.h"
+
+#include <limits.h>
+
+#include "gtm/pqexpbuffer.h"
+
+
+/* All "broken" PQExpBuffers point to this string. */
+static const char oom_buffer[1] = "";
+
+
+/*
+ * markPQExpBufferBroken
+ *
+ * Put a PQExpBuffer in "broken" state if it isn't already: its data buffer
+ * is freed (unless it already is the shared oom_buffer), data is pointed at
+ * oom_buffer, and len and maxlen are set to zero.
+ */
+static void
+markPQExpBufferBroken(PQExpBuffer str)
+{
+	char	   *old = str->data;
+
+	/*
+	 * Casting away const here is a bit ugly, but it seems preferable to
+	 * not marking oom_buffer const.  We want to do that to encourage the
+	 * compiler to put oom_buffer in read-only storage, so that anyone who
+	 * tries to scribble on a broken PQExpBuffer will get a failure.
+	 */
+	str->data = (char *) oom_buffer;
+	str->len = 0;
+	str->maxlen = 0;
+
+	if (old != oom_buffer)
+		free(old);
+}
+
+/*
+ * createGTMPQExpBuffer
+ *
+ * Allocate a PQExpBufferData with malloc(), initialize it with
+ * initGTMPQExpBuffer(), and return a pointer to it.  Returns NULL if the
+ * struct itself cannot be allocated.
+ */
+PQExpBuffer
+createGTMPQExpBuffer(void)
+{
+	PQExpBuffer buf = (PQExpBuffer) malloc(sizeof(PQExpBufferData));
+
+	if (buf == NULL)
+		return NULL;
+
+	initGTMPQExpBuffer(buf);
+	return buf;
+}
+
+/*
+ * initGTMPQExpBuffer
+ *
+ * Initialize a PQExpBufferData struct (with previously undefined contents)
+ * to describe an empty string backed by INITIAL_EXPBUFFER_SIZE bytes.  If
+ * that allocation fails the buffer starts out in "broken" state instead:
+ * data points at oom_buffer and maxlen is zero.
+ */
+void
+initGTMPQExpBuffer(PQExpBuffer str)
+{
+	char	   *space = (char *) malloc(INITIAL_EXPBUFFER_SIZE);
+
+	str->len = 0;
+
+	if (space != NULL)
+	{
+		space[0] = '\0';
+		str->data = space;
+		str->maxlen = INITIAL_EXPBUFFER_SIZE;
+	}
+	else
+	{
+		/* const is cast away on purpose, see markPQExpBufferBroken */
+		str->data = (char *) oom_buffer;
+		str->maxlen = 0;
+	}
+}
+
+/*
+ * destroyGTMPQExpBuffer(str);
+ *
+ *		free()s both the data buffer and the PQExpBufferData.
+ *		This is the inverse of createGTMPQExpBuffer().
+ *		A NULL str is accepted and ignored.
+ */
+void
+destroyGTMPQExpBuffer(PQExpBuffer str)
+{
+	if (str == NULL)
+		return;
+
+	termGTMPQExpBuffer(str);
+	free(str);
+}
+
+/*
+ * termGTMPQExpBuffer(str)
+ *		free()s the data buffer but not the PQExpBufferData itself.
+ *		This is the inverse of initGTMPQExpBuffer().
+ *		Afterwards str describes an empty buffer whose data points at
+ *		oom_buffer.
+ */
+void
+termGTMPQExpBuffer(PQExpBuffer str)
+{
+	char	   *old = str->data;
+
+	/* just for luck, make the buffer validly empty. */
+	str->data = (char *) oom_buffer;	/* const cast away on purpose */
+	str->len = 0;
+	str->maxlen = 0;
+
+	if (old != oom_buffer)
+		free(old);
+}
+
+/*
+ * resetGTMPQExpBuffer
+ *		Reset a PQExpBuffer to empty
+ *
+ * A NULL str is ignored.  A buffer with real storage is simply truncated to
+ * the empty string; a "broken" one (data == oom_buffer) is re-initialized
+ * with initGTMPQExpBuffer(), which returns it to normal if memory can be
+ * obtained.
+ */
+void
+resetGTMPQExpBuffer(PQExpBuffer str)
+{
+	if (str == NULL)
+		return;
+
+	if (str->data == oom_buffer)
+	{
+		/* try to reinitialize to valid state */
+		initGTMPQExpBuffer(str);
+		return;
+	}
+
+	str->len = 0;
+	str->data[0] = '\0';
+}
+
+/*
+ * enlargeGTMPQExpBuffer
+ * Make sure there is enough space for 'needed' more bytes in the buffer
+ * ('needed' does not include the terminating null).
+ *
+ * The buffer grows by doubling, so the new size is the first doubling of
+ * the current size that fits the request, capped at INT_MAX.
+ *
+ * Returns 1 if OK, 0 if failed to enlarge buffer.  (In the latter case
+ * the buffer is left in "broken" state.)
+ */
+int
+enlargeGTMPQExpBuffer(PQExpBuffer str, size_t needed)
+{
+	size_t		newlen;
+	char	   *newdata;
+
+	if (PQExpBufferBroken(str))
+		return 0;				/* already failed */
+
+	/*
+	 * Guard against ridiculous "needed" values, which can occur if we're fed
+	 * bogus data.  Without this, we can get an overflow or infinite loop in
+	 * the following.
+	 */
+	if (needed >= ((size_t) INT_MAX - str->len))
+	{
+		markPQExpBufferBroken(str);
+		return 0;
+	}
+
+	needed += str->len + 1;		/* total space required now */
+
+	/* Because of the above test, we now have needed <= INT_MAX */
+
+	if (needed <= str->maxlen)
+		return 1;				/* got enough space already */
+
+	/*
+	 * We don't want to allocate just a little more space with each append;
+	 * for efficiency, double the buffer size each time it overflows.
+	 * Actually, we might need to more than double it if 'needed' is big...
+	 */
+	newlen = (str->maxlen > 0) ? (2 * str->maxlen) : 64;
+	while (needed > newlen)
+		newlen = 2 * newlen;
+
+	/*
+	 * Clamp to INT_MAX in case we went past it.  Note we are assuming here
+	 * that INT_MAX <= UINT_MAX/2, else the above loop could overflow.  We
+	 * will still have newlen >= needed.
+	 */
+	if (newlen > (size_t) INT_MAX)
+		newlen = (size_t) INT_MAX;
+
+	newdata = (char *) realloc(str->data, newlen);
+	if (newdata != NULL)
+	{
+		str->data = newdata;
+		str->maxlen = newlen;
+		return 1;
+	}
+
+	/* realloc failed; the old block is still ours and is freed here */
+	markPQExpBufferBroken(str);
+	return 0;
+}
+
+/*
+ * printfGTMPQExpBuffer
+ * Format text data under the control of fmt (an sprintf-like format string)
+ * and insert it into str.  More space is allocated to str if necessary.
+ * This is a convenience routine that does the same thing as
+ * resetGTMPQExpBuffer() followed by appendGTMPQExpBuffer().
+ *
+ * Nothing is returned; if memory runs out, str is left in "broken" state.
+ */
+void
+printfGTMPQExpBuffer(PQExpBuffer str, const char *fmt,...)
+{
+	va_list		args;
+	size_t		avail;
+	int			nprinted;
+
+	resetGTMPQExpBuffer(str);
+
+	if (PQExpBufferBroken(str))
+		return;					/* already failed */
+
+	for (;;)
+	{
+		/*
+		 * Try to format the given string into the available space; but if
+		 * there's hardly any space, don't bother trying, just fall through to
+		 * enlarge the buffer first.
+		 */
+		if (str->maxlen > str->len + 16)
+		{
+			avail = str->maxlen - str->len - 1;
+			/* args is set up afresh for every formatting attempt */
+			va_start(args, fmt);
+			nprinted = vsnprintf(str->data + str->len, avail,
+								 fmt, args);
+			va_end(args);
+
+			/*
+			 * Note: some versions of vsnprintf return the number of chars
+			 * actually stored, but at least one returns -1 on failure. Be
+			 * conservative about believing whether the print worked.
+			 */
+			if (nprinted >= 0 && nprinted < (int) avail - 1)
+			{
+				/* Success.  Note nprinted does not include trailing null. */
+				str->len += nprinted;
+				break;
+			}
+		}
+		/* Double the buffer size and try again. */
+		if (!enlargeGTMPQExpBuffer(str, str->maxlen))
+			return;				/* oops, out of memory */
+	}
+}
+
+/*
+ * appendGTMPQExpBuffer
+ *
+ * Format text data under the control of fmt (an sprintf-like format string)
+ * and append it to whatever is already in str.  More space is allocated
+ * to str if necessary.  This is sort of like a combination of sprintf and
+ * strcat.
+ *
+ * Nothing is returned; if str is already "broken" nothing is done, and if
+ * memory runs out str is left in "broken" state.
+ */
+void
+appendGTMPQExpBuffer(PQExpBuffer str, const char *fmt,...)
+{
+	va_list		args;
+	size_t		avail;
+	int			nprinted;
+
+	if (PQExpBufferBroken(str))
+		return;					/* already failed */
+
+	for (;;)
+	{
+		/*
+		 * Try to format the given string into the available space; but if
+		 * there's hardly any space, don't bother trying, just fall through to
+		 * enlarge the buffer first.
+		 */
+		if (str->maxlen > str->len + 16)
+		{
+			avail = str->maxlen - str->len - 1;
+			/* args is set up afresh for every formatting attempt */
+			va_start(args, fmt);
+			nprinted = vsnprintf(str->data + str->len, avail,
+								 fmt, args);
+			va_end(args);
+
+			/*
+			 * Note: some versions of vsnprintf return the number of chars
+			 * actually stored, but at least one returns -1 on failure. Be
+			 * conservative about believing whether the print worked.
+			 */
+			if (nprinted >= 0 && nprinted < (int) avail - 1)
+			{
+				/* Success.  Note nprinted does not include trailing null. */
+				str->len += nprinted;
+				break;
+			}
+		}
+		/* Double the buffer size and try again. */
+		if (!enlargeGTMPQExpBuffer(str, str->maxlen))
+			return;				/* oops, out of memory */
+	}
+}
+
+/*
+ * appendGTMPQExpBufferStr
+ * Append the given string to a PQExpBuffer, allocating more space
+ * if necessary.
+ *
+ * data must be a NUL-terminated string (its length is taken with strlen());
+ * the work is done by appendBinaryGTMPQExpBuffer().
+ */
+void
+appendGTMPQExpBufferStr(PQExpBuffer str, const char *data)
+{
+	appendBinaryGTMPQExpBuffer(str, data, strlen(data));
+}
+
+/*
+ * appendGTMPQExpBufferChar
+ * Append a single byte to str.
+ * Like appendGTMPQExpBuffer(str, "%c", ch) but much faster.
+ *
+ * Nothing is appended if the buffer cannot be enlarged.
+ */
+void
+appendGTMPQExpBufferChar(PQExpBuffer str, char ch)
+{
+	/* Only touch the buffer once room for one more byte is secured */
+	if (enlargeGTMPQExpBuffer(str, 1))
+	{
+		str->data[str->len++] = ch;
+		str->data[str->len] = '\0';
+	}
+}
+
+/*
+ * appendBinaryGTMPQExpBuffer
+ *
+ * Append datalen bytes of arbitrary binary data to a PQExpBuffer,
+ * allocating more space if necessary.  Nothing is appended if the buffer
+ * cannot be enlarged.
+ */
+void
+appendBinaryGTMPQExpBuffer(PQExpBuffer str, const char *data, size_t datalen)
+{
+	char	   *tail;
+
+	/* Make more room if needed */
+	if (!enlargeGTMPQExpBuffer(str, datalen))
+		return;
+
+	/* OK, append the data right behind the current contents */
+	tail = str->data + str->len;
+	memcpy(tail, data, datalen);
+
+	/*
+	 * Keep a trailing null in place, even though it's probably useless for
+	 * binary data...
+	 */
+	tail[datalen] = '\0';
+	str->len += datalen;
+}
diff --git a/src/gtm/client/strlcpy.c b/src/gtm/client/strlcpy.c
new file mode 100644
index 0000000000..ae031e244c
--- /dev/null
+++ b/src/gtm/client/strlcpy.c
@@ -0,0 +1,72 @@
+/*-------------------------------------------------------------------------
+ *
+ * strlcpy.c
+ *	  strncpy done right
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/src/port/strlcpy.c,v 1.5 2008/01/01 19:46:00 momjian Exp $
+ *
+ * This file was taken from OpenBSD and is used on platforms that don't
+ * provide strlcpy().  The OpenBSD copyright terms follow.
+ *-------------------------------------------------------------------------
+ */
+
+/*	$OpenBSD: strlcpy.c,v 1.11 2006/05/05 15:27:38 millert Exp $	*/
+
+/*
+ * Copyright (c) 1998 Todd C. Miller <Todd.Miller@courtesan.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "gtm/gtm_c.h"
+
+
+/*
+ * Copy src to string dst of size siz.	At most siz-1 characters
+ * will be copied.	Always NUL terminates (unless siz == 0).
+ * Returns strlen(src); if retval >= siz, truncation occurred.
+ * Function creation history:  http://www.gratisoft.us/todd/papers/strlcpy.html
+ */
+size_t
+strlcpy(char *dst, const char *src, size_t siz)
+{
+	const char *in = src;
+	size_t		room;
+
+	/*
+	 * Copy bytes, NUL included, while more than one slot of dst remains.
+	 */
+	for (room = siz; room > 1; room--)
+	{
+		char		c = *in++;
+
+		*dst++ = c;
+		if (c == '\0')
+			return (in - src - 1);	/* the whole string fit */
+	}
+
+	/* dst is used up: terminate it, provided it has any room at all ... */
+	if (room == 1)
+		*dst = '\0';
+
+	/* ... and walk the rest of src so its full length can be reported */
+	while (*in != '\0')
+		in++;
+
+	return (in - src);			/* count does not include NUL */
+}
diff --git a/src/gtm/client/test/Makefile b/src/gtm/client/test/Makefile
new file mode 100644
index 0000000000..46ddbe9a6a
--- /dev/null
+++ b/src/gtm/client/test/Makefile
@@ -0,0 +1,31 @@
+# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+
+top_build_dir=../../../
+include $(top_build_dir)/gtm/Makefile.global
+
+override CPPFLAGS := -I$(top_build_dir)/gtm/client $(CPPFLAGS)
+
+OBJS=test_seq.o test_txn.o test_snap.o test_txnperf.o test_snapperf.o
+LIBS =-lpthread
+LOADLIBES=-lpthread
+# The test programs are compiled with debug info and without optimization.
+CFLAGS=-g -O0
+
+# None of these names a real file; never let a stray file of that name mask them.
+.PHONY: all clean distclean maintainer-clean
+
+all:test_txn test_seq test_snap test_txnperf test_snapperf
+
+test_txn:test_txn.o $(top_build_dir)/gtm/client/libgtmclient.a
+test_seq:test_seq.o $(top_build_dir)/gtm/client/libgtmclient.a
+test_snap:test_snap.o $(top_build_dir)/gtm/client/libgtmclient.a
+test_txnperf:test_txnperf.o $(top_build_dir)/gtm/client/libgtmclient.a
+test_snapperf:test_snapperf.o $(top_build_dir)/gtm/client/libgtmclient.a
+
+clean:
+	rm -f $(OBJS)
+	rm -f test_txn test_seq test_snap test_txnperf test_snapperf
+
+distclean: clean
+
+maintainer-clean: distclean
diff --git a/src/gtm/client/test/test_proxy.sh b/src/gtm/client/test/test_proxy.sh
new file mode 100644
index 0000000000..c0d3caec61
--- /dev/null
+++ b/src/gtm/client/test/test_proxy.sh
@@ -0,0 +1,196 @@
+#!/bin/bash
+# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+# Restarts the GTM server and proxies over ssh, runs the test clients, collects results.
+GTM_SERVER_HOSTNAME=gtm
+GTM_SERVER_PORT=16667
+
+GTM_PROXY_HOSTNAMES=(coordinator1 coordinator2 coordinator3 coordinator4 coordinator5)
+GTM_PROXY_PORTS=(16666 16666 16666 16666 16666)
+GTM_PROXY_COUNT=${#GTM_PROXY_HOSTNAMES[*]}
+
+PGXC_BASE=$HOME/pgsql_pgxc
+
+GTM_SERVER_PROCESS=gtm
+GTM_PROXY_PROCESS=gtm_proxy
+GTM_TEST_CLIENT_PROCESS=test_txnperf
+
+GTM_SERVER=$PGXC_BASE/src/gtm/main/$GTM_SERVER_PROCESS
+GTM_PROXY=$PGXC_BASE/src/gtm/proxy/$GTM_PROXY_PROCESS
+GTM_TEST_CLIENT=$PGXC_BASE/src/gtm/client/test/$GTM_TEST_CLIENT_PROCESS
+
+GTM_SERVER_LOG_FILE=/tmp/gtmlog
+GTM_SERVER_CONTROL_FILE=/tmp/gtmcontrol
+GTM_PROXY_LOG_FILE=/tmp/gtmptoxylog
+
+# All five positional arguments are required; a usage error exits non-zero
+if [ "$#" -ne 5 ];
+then
+	echo "Usage: test_proxy.sh <test_gtm_proxy> <num_clients> <num_xacts> <num_stmts> <num_worker_threads>" >&2
+	exit 1
+fi
+
+TEST_GTM_PROXY=$1
+NUM_CLIENTS=$2
+NUM_XACTS=$3
+NUM_STMTS=$4
+NUM_THREADS=$5
+
+
+# Stop and kill any gtm server or proxy processes
+#
+ssh "$GTM_SERVER_HOSTNAME" "killall -9 $GTM_SERVER_PROCESS"
+
+for index in "${!GTM_PROXY_HOSTNAMES[@]}"
+do
+	ssh "${GTM_PROXY_HOSTNAMES[$index]}" "killall -9 $GTM_PROXY_PROCESS" > /dev/null 2>&1
+done
+
+echo "Killed stale server and proxies - sleeping for 5 seconds"
+sleep 5
+
+# Remove any stale log and control files
+#
+ssh "$GTM_SERVER_HOSTNAME" "rm -f $GTM_SERVER_LOG_FILE $GTM_SERVER_CONTROL_FILE"
+for index in "${!GTM_PROXY_HOSTNAMES[@]}"
+do
+	ssh "${GTM_PROXY_HOSTNAMES[$index]}" "rm -f ${GTM_PROXY_LOG_FILE}_$index"
+done
+
+# Create an output directory to store all test related data
+# (named after the current time); nothing below works without it
+OUTPUT_DIR=output
+dir=$(date "+%F-%H-%M-%S")
+echo "Creating output directory $OUTPUT_DIR/$dir"
+mkdir -p "$OUTPUT_DIR/$dir" || exit 1
+
+
+# Start the GTM server
+#
+echo "Starting GTM server at $GTM_SERVER_HOSTNAME on port $GTM_SERVER_PORT"
+ssh "$GTM_SERVER_HOSTNAME" "$GTM_SERVER -h $GTM_SERVER_HOSTNAME -p $GTM_SERVER_PORT -l $GTM_SERVER_LOG_FILE&"&
+
+echo "Sleeping for 3 seconds"
+sleep 3
+
+# Start the GTM proxy on all nodes
+#
+for index in "${!GTM_PROXY_HOSTNAMES[@]}"
+do
+	echo "Starting GTM proxy at ${GTM_PROXY_HOSTNAMES[$index]} on port ${GTM_PROXY_PORTS[$index]} - $NUM_THREADS worker threads"
+	ssh "${GTM_PROXY_HOSTNAMES[$index]}" "$GTM_PROXY -h ${GTM_PROXY_HOSTNAMES[$index]} -p ${GTM_PROXY_PORTS[$index]} -s $GTM_SERVER_HOSTNAME -t $GTM_SERVER_PORT -n $NUM_THREADS -l ${GTM_PROXY_LOG_FILE}_$index&"&
+done
+
+echo "Sleeping for 3 seconds"
+sleep 3
+
+# Kill all clients
+#
+for index in "${!GTM_PROXY_HOSTNAMES[@]}"
+do
+	ssh "${GTM_PROXY_HOSTNAMES[$index]}" "killall -9 $GTM_TEST_CLIENT_PROCESS" > /dev/null 2>&1
+done
+
+echo "Killed all stale clients -- sleeping for 5 seconds"
+sleep 5
+
+# Remove any stale result files
+#
+for index in "${!GTM_PROXY_HOSTNAMES[@]}"
+do
+	ssh "${GTM_PROXY_HOSTNAMES[$index]}" "rm -f TEST_OUTPUT_$index TEST_OUTPUT_$index.CSV TEST_END_$index"
+done
+
+# Write out some information about the test configuration
+# (any first argument other than the literal "true" selects the server run)
+if [ "$TEST_GTM_PROXY" = "true" ];
+then
+	echo "Testing GTM Proxy Configuration" >> "$OUTPUT_DIR/$dir/TEST_SUMMARY"
+	echo "" >> "$OUTPUT_DIR/$dir/TEST_SUMMARY"
+	echo "Number of GTM Proxy Worker Threads $NUM_THREADS" >> "$OUTPUT_DIR/$dir/TEST_SUMMARY"
+	echo "" >> "$OUTPUT_DIR/$dir/TEST_SUMMARY"
+	echo "" >> "$OUTPUT_DIR/$dir/TEST_SUMMARY"
+else
+	echo "Testing GTM Server Configuration" >> "$OUTPUT_DIR/$dir/TEST_SUMMARY"
+	echo "" >> "$OUTPUT_DIR/$dir/TEST_SUMMARY"
+	echo "" >> "$OUTPUT_DIR/$dir/TEST_SUMMARY"
+fi
+
+# Start the stats collection scripts . Kill any stale commands and remove the old files first
+#
+ssh "$GTM_SERVER_HOSTNAME" "killall -9 vmstat" > /dev/null 2>&1
+ssh "$GTM_SERVER_HOSTNAME" "rm -f TEST_VMSTATS_GTM" > /dev/null 2>&1
+ssh "$GTM_SERVER_HOSTNAME" "vmstat 1 > TEST_VMSTATS_GTM&"&
+
+for index in "${!GTM_PROXY_HOSTNAMES[@]}"
+do
+	ssh "${GTM_PROXY_HOSTNAMES[$index]}" "killall -9 vmstat" > /dev/null 2>&1
+	ssh "${GTM_PROXY_HOSTNAMES[$index]}" "rm -f TEST_VMSTATS_$index" > /dev/null 2>&1
+	ssh "${GTM_PROXY_HOSTNAMES[$index]}" "vmstat 1 > TEST_VMSTATS_$index&"&
+done
+
+# Start the clients
+# (local TEST_END_* markers from an earlier run would end the wait loop at once)
+rm -f TEST_END*
+
+echo "Starting clients"
+for index in "${!GTM_PROXY_HOSTNAMES[@]}"
+do
+	if [ "$TEST_GTM_PROXY" = "true" ];
+	then
+		SERVER_HOSTNAME=${GTM_PROXY_HOSTNAMES[$index]};
+		SERVER_PORT=${GTM_PROXY_PORTS[$index]};
+	else
+		SERVER_HOSTNAME=$GTM_SERVER_HOSTNAME;
+		SERVER_PORT=$GTM_SERVER_PORT;
+	fi
+	ssh "${GTM_PROXY_HOSTNAMES[$index]}" "$GTM_TEST_CLIENT -h $SERVER_HOSTNAME -p $SERVER_PORT -c $NUM_CLIENTS -n $NUM_XACTS -s $NUM_STMTS -i $index &"&
+done
+
+# Wait for all the clients to finish
+# (a host is done once its TEST_END_<index> marker could be copied back)
+while true
+do
+	all_done=true
+	for index in "${!GTM_PROXY_HOSTNAMES[@]}"
+	do
+		scp "${GTM_PROXY_HOSTNAMES[$index]}:TEST_END_$index" . > /dev/null 2>&1
+		if ! [ -f "TEST_END_$index" ];
+		then
+			all_done=false;
+		fi;
+	done
+
+	if [ "$all_done" = "true" ]; then break; fi
+	sleep 5;
+done
+
+echo "All clients finished"
+
+# Copy GTM server log files
+#
+scp "$GTM_SERVER_HOSTNAME:$GTM_SERVER_LOG_FILE" "$OUTPUT_DIR/$dir" > /dev/null 2>&1
+
+# Copy GTM server vmstat file
+scp "$GTM_SERVER_HOSTNAME:TEST_VMSTATS_GTM" "$OUTPUT_DIR/$dir" > /dev/null 2>&1
+ssh "$GTM_SERVER_HOSTNAME" "killall -9 vmstat" > /dev/null 2>&1
+ssh "$GTM_SERVER_HOSTNAME" "rm -f TEST_VMSTATS_GTM" > /dev/null 2>&1
+
+# Copy GTM Proxy log file and the results
+for index in "${!GTM_PROXY_HOSTNAMES[@]}"
+do
+	scp "${GTM_PROXY_HOSTNAMES[$index]}:TEST_OUTPUT_$index" "$OUTPUT_DIR/$dir/" > /dev/null 2>&1
+	scp "${GTM_PROXY_HOSTNAMES[$index]}:TEST_OUTPUT_$index.CSV" "$OUTPUT_DIR/$dir/" > /dev/null 2>&1
+	scp "${GTM_PROXY_HOSTNAMES[$index]}:${GTM_PROXY_LOG_FILE}_$index" "$OUTPUT_DIR/$dir/" > /dev/null 2>&1
+	scp "${GTM_PROXY_HOSTNAMES[$index]}:TEST_VMSTATS_$index" "$OUTPUT_DIR/$dir/" > /dev/null 2>&1
+	ssh "${GTM_PROXY_HOSTNAMES[$index]}" "killall -9 vmstat" > /dev/null 2>&1
+	ssh "${GTM_PROXY_HOSTNAMES[$index]}" "rm -f TEST_VMSTATS_$index" > /dev/null 2>&1
+done
+
+# Paste the result in the summary file
+#
+for index in "${!GTM_PROXY_HOSTNAMES[@]}"
+do
+	cat "$OUTPUT_DIR/$dir/TEST_OUTPUT_$index" >> "$OUTPUT_DIR/$dir/TEST_SUMMARY"
+done
+
+echo "Done"
diff --git a/src/gtm/client/test/test_seq.c b/src/gtm/client/test/test_seq.c
new file mode 100644
index 0000000000..da0ed91ee2
--- /dev/null
+++ b/src/gtm/client/test/test_seq.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ */
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtm/gtm_c.h"
+#include "gtm/libpq-fe.h"
+#include "gtm/gtm_client.h"
+
+#define client_log(x)	printf x
+
+/*
+ * Sequence smoke test: create 20 sequences keyed "N:N", read and advance them
+ * from several forked processes, then close them from the original process.
+ * Exits with 1 if either GTM connection cannot be established.
+ */
+int
+main(int argc, char *argv[])
+{
+	int ii;
+	pid_t parent_pid;
+
+	GTM_Conn *conn = PQconnectGTM("host=localhost port=6666 coordinator_id=1");
+	if (conn == NULL)
+	{
+		client_log(("Error in connection\n"));
+		exit(1);
+	}
+
+	parent_pid = getpid();
+
+	/* Create sequences */
+	for (ii = 0; ii < 20; ii++)
+	{
+		char buf[100];
+		GTM_SequenceKeyData seqkey;
+		sprintf(buf, "%d:%d", ii, ii);
+		seqkey.gsk_keylen = strlen(buf);
+		seqkey.gsk_key = buf;
+		if (open_sequence(conn, &seqkey, 10, 1, 10000, 100, false))
+			client_log(("Open seq failed\n"));
+		else
+			client_log(("Opened Sequence %s\n", seqkey.gsk_key));
+	}
+
+	/* Close the GTM connection */
+	GTMPQfinish(conn);
+
+	/*
+	 * Three rounds of fork(): each process forks every round, up to 8 in all
+	 */
+	for (ii = 0; ii < 3; ii++)
+		fork();
+
+	/* Each process now opens its own GTM connection; give up if that fails */
+	conn = PQconnectGTM("host=localhost port=6666 coordinator_id=1");
+	if (conn == NULL)
+	{
+		client_log(("Error in connection\n"));
+		exit(1);
+	}
+
+	/* Try to read/increment the sequence */
+	for (ii = 0; ii < 20; ii++)
+	{
+		char buf[100];
+		GTM_SequenceKeyData seqkey;
+		GTM_Sequence seqval;
+		int jj;
+
+		sprintf(buf, "%d:%d", ii, ii);
+		seqkey.gsk_keylen = strlen(buf);
+		seqkey.gsk_key = buf;
+		if ((seqval = get_current(conn, &seqkey)) == InvalidSequenceValue)
+			client_log(("get_current seq failed for sequene %s\n", seqkey.gsk_key));
+		else
+			client_log(("CURRENT SEQVAL(%s): %lld\n", seqkey.gsk_key, (long long) seqval));
+
+		for (jj = 0; jj < 5; jj++)
+		{
+			if ((seqval = get_next(conn, &seqkey)) == InvalidSequenceValue)
+				client_log(("get_next seq failed for sequence %s\n", seqkey.gsk_key));
+			else
+				client_log(("NEXT SEQVAL(%s): %lld ", seqkey.gsk_key, (long long) seqval));
+		}
+		client_log(("\n"));
+	}
+
+	/*
+	 * The main process now closes the sequences. We want to call close only
+	 * once, hence this approach
+	 */
+	if (getpid() == parent_pid)
+	{
+		/* Wait long enough so that all other processes are done */
+		sleep(20);
+		for (ii = 0; ii < 20; ii++)
+		{
+			char buf[100];
+			GTM_SequenceKeyData seqkey;
+			sprintf(buf, "%d:%d", ii, ii);
+			seqkey.gsk_keylen = strlen(buf);
+			seqkey.gsk_key = buf;
+			if (close_sequence(conn, &seqkey))
+				client_log(("Close seq failed for sequence %s\n", seqkey.gsk_key));
+			else
+				client_log(("Sequene closed %s\n", seqkey.gsk_key));
+		}
+	}
+	GTMPQfinish(conn);
+	return 0;
+}
diff --git a/src/gtm/client/test/test_snap.c b/src/gtm/client/test/test_snap.c
new file mode 100644
index 0000000000..a2ce2f965a
--- /dev/null
+++ b/src/gtm/client/test/test_snap.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ */
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtm/gtm_c.h"
+#include "gtm/libpq-fe.h"
+#include "gtm/gtm_client.h"
+
+#define client_log(x)	printf x
+
+int
+main(int argc, char *argv[])
+{
+	GTM_Conn *conn;
+	GlobalTransactionId gxid[4000];
+	int txn;
+
+	for (txn = 0; txn < 3; txn++)
+		fork();
+
+	conn = PQconnectGTM("host=localhost port=6666 coordinator_id=1");
+	if (!conn)
+	{
+		client_log(("Error in connection\n"));
+		exit(1);
+	}
+
+	/* Open 20 transactions and remember the GXID handed back for each */
+	for (txn = 0; txn < 20; txn++)
+	{
+		gxid[txn] = begin_transaction(conn, GTM_ISOLATION_RC);
+		if (gxid[txn] == InvalidGlobalTransactionId)
+			client_log(("BEGIN transaction failed for ii=%d\n", txn));
+		else
+			client_log(("Started a new transaction (GXID:%u)\n", gxid[txn]));
+	}
+
+	/* Ask for a snapshot on behalf of the first five and print what comes back */
+	for (txn = 0; txn < 5; txn++)
+	{
+		GTM_Snapshot snap = get_snapshot(conn, gxid[txn], true);
+		int slot;
+
+		if (snap == NULL)
+			continue;			/* nothing to print for this one */
+		client_log(("Snapshot: GXID %u, xmin=%u, xmax=%u\n", gxid[txn],
+				snap->sn_xmin, snap->sn_xmax));
+		client_log(("xcnt=%d %s", snap->sn_xcnt,
+				snap->sn_xcnt > 0 ? "xip=(" : ""));
+		for (slot = 0; slot < snap->sn_xcnt; slot++)
+			client_log(("%d%c ", snap->sn_xip[slot],
+					(slot + 1 == snap->sn_xcnt) ? ')' : ','));
+		client_log(("\n"));
+	}
+
+	/* Prepare each one, passing a node list whose two entries are both 1 */
+	for (txn = 0; txn < 20; txn++)
+	{
+		PGXC_NodeId nodes[5];
+		nodes[0] = 1;
+		nodes[1] = 1;
+		if (prepare_transaction(conn, gxid[txn], 2, nodes))
+			client_log(("PREPARE failed (GXID:%u)\n", gxid[txn]));
+		else
+			client_log(("PREPARE successful (GXID:%u)\n", gxid[txn]));
+	}
+
+	/* Finish them: odd slots commit, even slots roll back */
+	for (txn = 0; txn < 20; txn++)
+	{
+		if (txn % 2 != 0)
+		{
+			if (commit_transaction(conn, gxid[txn]))
+				client_log(("COMMIT failed (GXID:%u)\n", gxid[txn]));
+			else
+				client_log(("COMMIT successful (GXID:%u)\n", gxid[txn]));
+		}
+		else if (abort_transaction(conn, gxid[txn]))
+			client_log(("ROLLBACK failed (GXID:%u)\n", gxid[txn]));
+		else
+			client_log(("ROLLBACK successful (GXID:%u)\n", gxid[txn]));
+	}
+
+	GTMPQfinish(conn);
+	return 0;
+}
diff --git a/src/gtm/client/test/test_snapperf.c b/src/gtm/client/test/test_snapperf.c
new file mode 100644
index 0000000000..bc0e511e2b
--- /dev/null
+++ b/src/gtm/client/test/test_snapperf.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ */
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtm/gtm_c.h"
+#include "gtm/libpq-fe.h"
+#include "gtm/gtm_client.h"
+
+#define client_log(x)
+
+/*
+ * Snapshot load loop: LOOP_COUNT passes of TXN_COUNT transactions, each doing
+ * begin, one get_snapshot, then rollback (even ii) or commit (odd ii).
+ */
+int
+main(int argc, char *argv[])
+{
+	int ii;
+	int jj;
+
+#define TXN_COUNT		10000
+#define LOOP_COUNT		10
+
+	GlobalTransactionId gxid[TXN_COUNT];
+	GTM_Conn *conn;
+
+	conn = PQconnectGTM("host=localhost port=6666 coordinator_id=1");
+	if (conn == NULL)
+	{
+		fprintf(stderr, "Error in connection\n");	/* client_log() is empty here */
+		exit(1);
+	}
+
+	for (jj = 0; jj < LOOP_COUNT; jj++)
+	{
+		for (ii = 0; ii < TXN_COUNT; ii++)
+		{
+			gxid[ii] = begin_transaction(conn, GTM_ISOLATION_RC);
+			if (gxid[ii] != InvalidGlobalTransactionId)
+				client_log(("Started a new transaction (GXID:%u)\n", gxid[ii]));
+			else
+				client_log(("BEGIN transaction failed for ii=%d\n", ii));
+			(void) get_snapshot(conn, gxid[ii], true);	/* result is not used */
+
+			if (ii % 2 == 0)
+			{
+				if (!abort_transaction(conn, gxid[ii]))
+					client_log(("ROLLBACK successful (GXID:%u)\n", gxid[ii]));
+				else
+					client_log(("ROLLBACK failed (GXID:%u)\n", gxid[ii]));
+			}
+			else
+			{
+				if (!commit_transaction(conn, gxid[ii]))
+					client_log(("COMMIT successful (GXID:%u)\n", gxid[ii]));
+				else
+					client_log(("COMMIT failed (GXID:%u)\n", gxid[ii]));
+			}
+		}
+	}
+
+	GTMPQfinish(conn);
+	return 0;
+}
diff --git a/src/gtm/client/test/test_txn.c b/src/gtm/client/test/test_txn.c
new file mode 100644
index 0000000000..01ed3decbd
--- /dev/null
+++ b/src/gtm/client/test/test_txn.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ */
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtm/gtm_c.h"
+#include "gtm/libpq-fe.h"
+#include "gtm/gtm_client.h"
+
+#define client_log(x)	printf x
+
+int
+main(int argc, char *argv[])
+{
+	GTM_Conn *conn;
+	GlobalTransactionId gxid[4000];
+	int n;
+
+	/* Three consecutive fork() calls, made before any connection exists */
+	for (n = 0; n < 3; n++)
+		fork();
+
+	conn = PQconnectGTM("host=localhost port=6666 coordinator_id=1");
+	if (!conn)
+	{
+		client_log(("Error in connection\n"));
+		exit(1);
+	}
+
+	/* Open 20 transactions and remember the GXID handed back for each */
+	for (n = 0; n < 20; n++)
+	{
+		gxid[n] = begin_transaction(conn, GTM_ISOLATION_SERIALIZABLE);
+		if (gxid[n] == InvalidGlobalTransactionId)
+			client_log(("BEGIN transaction failed for ii=%d\n", n));
+		else
+			client_log(("Started a new transaction (GXID:%u)\n", gxid[n]));
+	}
+
+	/* Prepare each one, passing a node list whose two entries are both 1 */
+	for (n = 0; n < 20; n++)
+	{
+		PGXC_NodeId nodes[5];
+		nodes[0] = 1;
+		nodes[1] = 1;
+		if (prepare_transaction(conn, gxid[n], 2, nodes))
+			client_log(("PREPARE failed (GXID:%u)\n", gxid[n]));
+		else
+			client_log(("PREPARE successful (GXID:%u)\n", gxid[n]));
+	}
+
+	/* Finish them: odd slots commit, even slots roll back */
+	for (n = 0; n < 20; n++)
+	{
+		if (n % 2 != 0)
+		{
+			if (commit_transaction(conn, gxid[n]))
+				client_log(("COMMIT failed (GXID:%u)\n", gxid[n]));
+			else
+				client_log(("COMMIT successful (GXID:%u)\n", gxid[n]));
+		}
+		else if (abort_transaction(conn, gxid[n]))
+			client_log(("ROLLBACK failed (GXID:%u)\n", gxid[n]));
+		else
+			client_log(("ROLLBACK successful (GXID:%u)\n", gxid[n]));
+	}
+
+	GTMPQfinish(conn);
+	return 0;
+}
diff --git a/src/gtm/client/test/test_txnperf.c b/src/gtm/client/test/test_txnperf.c
new file mode 100644
index 0000000000..174f0a8bab
--- /dev/null
+++ b/src/gtm/client/test/test_txnperf.c
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ */
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtm/gtm_c.h"
+#include "gtm/libpq-fe.h"
+#include "gtm/gtm_client.h"
+#include <sys/time.h>
+#include <sys/file.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#define client_log(x)
+
+extern int      optind;
+extern char *optarg;
+
+/* Calculate time difference */
+static void
+diffTime(struct timeval *t1, struct timeval *t2, struct timeval *result)
+{
+	/* result = t1 - t2, borrowing one second when the usec part goes negative */
+	int delta_sec = t1->tv_sec - t2->tv_sec;
+	int delta_usec = t1->tv_usec - t2->tv_usec;
+	int borrow = (delta_usec < 0) ? 1 : 0;
+
+	delta_usec += borrow * 1000000;
+	delta_sec -= borrow;
+	result->tv_sec = delta_sec;
+	result->tv_usec = delta_usec;
+}
+
+/*
+ * Print usage for progname; keep this list in step with main()'s getopt() string
+ */
+static void
+help(const char *progname)
+{
+	printf(_("Usage:\n  %s [OPTION]...\n\n"), progname);
+	printf(_("Options:\n"));
+	printf(_("  -h hostname     GTM proxy/server hostname/IP\n"));
+	printf(_("  -p port         GTM proxy/serevr port number\n"));
+	printf(_("  -c count        Number of clients\n"));
+	printf(_("  -n count        Number of transactions per client\n"));
+	printf(_("  -s count        Number of statements per transaction\n"));
+	printf(_("  -i id           Coordinator ID\n"));
+}
+
+/*
+ * GTM transaction benchmark client: forks nclients - 1 children, and every
+ * process (parent included) runs ntxns_per_cli transactions against the GTM.
+ * Exits with 1 on bad options or when a file or connection cannot be opened.
+ */
+int
+main(int argc, char *argv[])
+{
+	int ii, jj, kk;
+	char connect_string[1024];
+	int gtmport = 0;
+	int coordinator_id = -1;
+	int nclients = 0;
+	int ntxns_per_cli = -1;
+	int nstmts_per_txn = -1;
+	char *gtmhost = NULL;
+	int opt;					/* int, not char: getopt() ends with -1 */
+	struct timeval starttime, endtime, diff;
+	FILE *fp, *fp2;
+	char buf[1024];
+	int testid, this_testid, max_testid;
+	int snapsize = 0;
+	int total_stmts;
+	float avg_sanpsize = 0;
+	pid_t parent_pid;
+
+#define TXN_COUNT		1000
+
+	GlobalTransactionId gxid[TXN_COUNT];
+	GTM_Conn *conn;
+	char test_output[256], test_end[256], test_output_csv[256];
+	char system_cmd[1024];
+
+	/* Catch standard options before doing much else */
+	if (argc > 1 &&
+		(strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0))
+	{
+		help(argv[0]);
+		exit(0);
+	}
+
+	/* Parse the command line options and set variables */
+	while ((opt = getopt(argc, argv, "h:p:c:n:s:i:")) != -1)
+	{
+		switch (opt)
+		{
+			case 'h':
+				gtmhost = strdup(optarg);
+				break;
+			case 'p':
+				gtmport = atoi(optarg);
+				break;
+			case 'c':
+				nclients = atoi(optarg);
+				break;
+			case 'n':
+				ntxns_per_cli = atoi(optarg);
+				break;
+			case 's':
+				nstmts_per_txn = atoi(optarg);
+				break;
+			case 'i':
+				coordinator_id = atoi(optarg);
+				break;
+			default:
+				fprintf(stderr, "Unrecognized option %c\n", opt);
+				help(argv[0]);
+				exit(1);
+		}
+	}
+
+	/* Every option is required; the values used below are all built from them */
+	if (gtmhost == NULL || gtmport <= 0 || nclients <= 0 ||
+		ntxns_per_cli < 0 || nstmts_per_txn < 0 || coordinator_id < 0)
+	{
+		fprintf(stderr, "Missing or invalid option value\n");
+		help(argv[0]);
+		exit(1);
+	}
+
+	snprintf(test_output, sizeof(test_output), "TEST_OUTPUT_%d", coordinator_id);
+	snprintf(test_end, sizeof(test_end), "TEST_END_%d", coordinator_id);
+	snprintf(test_output_csv, sizeof(test_output_csv), "TEST_OUTPUT_%d.CSV", coordinator_id);
+	snprintf(connect_string, sizeof(connect_string), "host=%s port=%d coordinator_id=%d", gtmhost, gtmport, coordinator_id);
+
+	snprintf(system_cmd, sizeof(system_cmd), "echo -------------------------------------------------------- >> %s", test_output);
+	system(system_cmd);
+	snprintf(system_cmd, sizeof(system_cmd), "date >> %s", test_output);
+	system(system_cmd);
+	snprintf(system_cmd, sizeof(system_cmd), "echo -------------------------------------------------------- >> %s", test_output);
+	system(system_cmd);
+
+	fp = fopen(test_output, "a+");
+	if (fp == NULL)
+	{
+		perror(test_output);
+		exit(1);
+	}
+
+	/* The new test id is one more than the largest id already in the file */
+	max_testid = 0;
+	while (fgets(buf, sizeof(buf), fp) != NULL)
+	{
+		if (sscanf(buf, "TEST-ID: %d", &testid) == 1 && max_testid < testid)
+			max_testid = testid;
+	}
+	this_testid = max_testid + 1;
+
+	fprintf(fp, "TEST-ID: %d\n\n", this_testid);
+	fflush(fp);					/* leave nothing buffered for fork() to copy */
+
+	parent_pid = getpid();
+	gettimeofday(&starttime, NULL);
+
+	/* Start the other nclients - 1 clients; the parent is a client as well */
+	for (ii = 1; ii < nclients; ii++)
+	{
+		pid_t cpid = fork();
+
+		if (cpid == 0)
+			break;				/* child: go and run the workload */
+		if (cpid < 0)
+		{
+			perror("fork");
+			break;				/* carry on with the clients we have */
+		}
+	}
+
+	if (getpid() == parent_pid)
+		fprintf(stderr, "started %d clients\n", nclients);
+
+	conn = PQconnectGTM(connect_string);
+	if (conn == NULL)
+	{
+		fprintf(stderr, "Error in connection\n");	/* client_log() is empty here */
+		exit(1);
+	}
+
+	if (getpid() != parent_pid)
+		gettimeofday(&starttime, NULL);
+
+	for (jj = 0; jj <= ntxns_per_cli / TXN_COUNT; jj++)
+	{
+		for (ii = 0; ii < TXN_COUNT; ii++)
+		{
+			PGXC_NodeId nodes[5];
+
+			if ((jj * TXN_COUNT) + ii >= ntxns_per_cli)
+				break;
+
+			gxid[ii] = begin_transaction(conn, GTM_ISOLATION_RC);
+			if (gxid[ii] != InvalidGlobalTransactionId)
+				client_log(("Started a new transaction (GXID:%u)\n", gxid[ii]));
+			else
+				client_log(("BEGIN transaction failed for ii=%d\n", ii));
+
+			for (kk = 0; kk < nstmts_per_txn; kk++)
+			{
+				GTM_Snapshot snapshot = get_snapshot(conn, gxid[ii], true);
+
+				/* Skip a missing snapshot rather than dereference NULL */
+				if (snapshot != NULL)
+					snapsize += snapshot->sn_xcnt;
+			}
+
+			nodes[0] = 1;
+			nodes[1] = 1;
+			if (!prepare_transaction(conn, gxid[ii], 2, nodes))
+				client_log(("PREPARE successful (GXID:%u)\n", gxid[ii]));
+			else
+				client_log(("PREPARE failed (GXID:%u)\n", gxid[ii]));
+
+			if (ii % 2 == 0)
+			{
+				if (!abort_transaction(conn, gxid[ii]))
+					client_log(("ROLLBACK successful (GXID:%u)\n", gxid[ii]));
+				else
+					client_log(("ROLLBACK failed (GXID:%u)\n", gxid[ii]));
+			}
+			else if (!commit_transaction(conn, gxid[ii]))
+				client_log(("COMMIT successful (GXID:%u)\n", gxid[ii]));
+			else
+				client_log(("COMMIT failed (GXID:%u)\n", gxid[ii]));
+		}
+		fprintf(stderr, "client [%d] finished %d transactions\n", (int) getpid(), (jj * TXN_COUNT) + ii);
+	}
+
+	GTMPQfinish(conn);
+
+	/* The parent's clock stops only after every child has been reaped */
+	if (parent_pid == getpid())
+		for (ii = 1; ii < nclients; ii++)
+			wait(NULL);
+
+	gettimeofday(&endtime, NULL);
+	diffTime(&endtime, &starttime, &diff);
+	total_stmts = ntxns_per_cli * nstmts_per_txn;
+	avg_sanpsize = (total_stmts > 0) ? ((float) snapsize) / total_stmts : 0;
+
+	if (parent_pid == getpid())
+	{
+		fprintf(fp, "\n");
+		fprintf(fp, "Num of client: %d\n", nclients);
+		fprintf(fp, "Num of txns/client: %d\n", ntxns_per_cli);
+		fprintf(fp, "Num of statements/txn: %d\n", nstmts_per_txn);
+		fprintf(fp, "TPS: %2f\n", (ntxns_per_cli * nclients) / ((float)((diff.tv_sec * 1000000) + diff.tv_usec)/1000000));
+		fprintf(fp, "Total snapshot size: %d\n", snapsize);
+		fprintf(fp, "Average snapshot size: %f\n", avg_sanpsize);
+		fprintf(fp, "Time: %ld.%06ld\n", (long) diff.tv_sec, (long) diff.tv_usec);
+		fprintf(fp, "\n");
+	}
+	fclose(fp);
+
+	/* Open the CSV here, separately in every process: a descriptor inherited */
+	/* across fork() would share a single flock() lock and exclude nobody. */
+	fp2 = fopen(test_output_csv, "a+");
+	if (fp2 == NULL)
+	{
+		perror(test_output_csv);
+		exit(1);
+	}
+	flock(fileno(fp2), LOCK_EX);
+	fprintf(fp2, "%d,%d,%d,%d,%ld,%ld,%d,%f,%s\n", this_testid, nclients, ntxns_per_cli, nstmts_per_txn,
+			(long) diff.tv_sec, (long) diff.tv_usec, snapsize, avg_sanpsize, (parent_pid == getpid()) ? "true" : "false");
+	fflush(fp2);				/* the row must be written out before unlocking */
+	flock(fileno(fp2), LOCK_UN);
+	fclose(fp2);
+
+	/* Signal completion last, after all results have been written out */
+	if (parent_pid == getpid())
+	{
+		snprintf(system_cmd, sizeof(system_cmd), "touch %s", test_end);
+		system(system_cmd);
+	}
+	return 0;
+}
diff --git a/src/gtm/common/Makefile b/src/gtm/common/Makefile
new file mode 100644
index 0000000000..104382c9c9
--- /dev/null
+++ b/src/gtm/common/Makefile
@@ -0,0 +1,25 @@
+# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+
+top_build_dir=../..
+include $(top_build_dir)/gtm/Makefile.global
+
+NAME=gtm
+
+SO_MAJOR_VERSION= 1
+SO_MINOR_VERSION= 0
+
+OBJS=aset.o mcxt.o elog.o assert.o stringinfo.o gtm_lock.o gtm_list.o
+
+# None of these names a real file; never let a stray file of that name mask them.
+.PHONY: all clean distclean maintainer-clean
+all:all-lib
+
+include $(top_build_dir)/Makefile.shlib
+
+clean:
+	rm -f $(OBJS)
+	rm -f libgtm.so libgtm.so.1 libgtm.so.1.0
+
+distclean: clean
+
+maintainer-clean: distclean
diff --git a/src/gtm/common/aset.c b/src/gtm/common/aset.c
new file mode 100644
index 0000000000..aa9533009a
--- /dev/null
+++ b/src/gtm/common/aset.c
@@ -0,0 +1,1261 @@
+/*-------------------------------------------------------------------------
+ *
+ * aset.c
+ *	  Allocation set definitions.
+ *
+ * AllocSet is our standard implementation of the abstract MemoryContext
+ * type.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/src/backend/utils/mmgr/aset.c,v 1.77 2008/04/11 22:54:23 tgl Exp $
+ *
+ * NOTE:
+ *	This is a new (Feb. 05, 1999) implementation of the allocation set
+ *	routines. AllocSet...() does not use OrderedSet...() any more.
+ *	Instead it manages allocations in a block pool by itself, combining
+ *	many small allocations in a few bigger blocks. AllocSetFree() normally
+ *	doesn't really free() memory. It just adds the freed area to some
+ *	list for later reuse by AllocSetAlloc(). All memory blocks are free()'d
+ *	at once on AllocSetReset(), which happens when the memory context gets
+ *	destroyed.
+ *				Jan Wieck
+ *
+ *	Performance improvement from Tom Lane, 8/99: for extremely large request
+ *	sizes, we do want to be able to give the memory back to free() as soon
+ *	as it is pfree()'d.  Otherwise we risk tying up a lot of memory in
+ *	freelist entries that might never be usable.  This is specially needed
+ *	when the caller is repeatedly repalloc()'ing a block bigger and bigger;
+ *	the previous instances of the block were guaranteed to be wasted until
+ *	AllocSetReset() under the old way.
+ *
+ *	Further improvement 12/00: as the code stood, request sizes in the
+ *	midrange between "small" and "large" were handled very inefficiently,
+ *	because any sufficiently large free chunk would be used to satisfy a
+ *	request, even if it was much larger than necessary.  This led to more
+ *	and more wasted space in allocated chunks over time.  To fix, get rid
+ *	of the midrange behavior: we now handle only "small" power-of-2-size
+ *	chunks as chunks.  Anything "large" is passed off to malloc().	Change
+ *	the number of freelists to change the small/large boundary.
+ *
+ *
+ *	About CLOBBER_FREED_MEMORY:
+ *
+ *	If this symbol is defined, all freed memory is overwritten with 0x7F's.
+ *	This is useful for catching places that reference already-freed memory.
+ *
+ *	About MEMORY_CONTEXT_CHECKING:
+ *
+ *	Since we usually round request sizes up to the next power of 2, there
+ *	is often some unused space immediately after a requested data area.
+ *	Thus, if someone makes the common error of writing past what they've
+ *	requested, the problem is likely to go unnoticed ... until the day when
+ *	there *isn't* any wasted space, perhaps because of different memory
+ *	alignment on a new platform, or some other effect.	To catch this sort
+ *	of problem, the MEMORY_CONTEXT_CHECKING option stores 0x7E just beyond
+ *	the requested space whenever the request is less than the actual chunk
+ *	size, and verifies that the byte is undamaged when the chunk is freed.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "gtm/gtm_c.h"
+#include "gtm/memutils.h"
+#include "gtm/elog.h"
+#include "gtm/assert.h"
+#include "gtm/gtm.h"
+
+/* Define this to detail debug alloc information */
+/* #define HAVE_ALLOCINFO */
+
+/*--------------------
+ * Chunk freelist k holds chunks of size 1 << (k + ALLOC_MINBITS),
+ * for k = 0 .. ALLOCSET_NUM_FREELISTS-1.
+ *
+ * Note that all chunks in the freelists have power-of-2 sizes.  This
+ * improves recyclability: we may waste some space, but the wasted space
+ * should stay pretty constant as requests are made and released.
+ *
+ * A request too large for the last freelist is handled by allocating a
+ * dedicated block from malloc().  The block still has a block header and
+ * chunk header, but when the chunk is freed we'll return the whole block
+ * to malloc(), not put it on our freelists.
+ *
+ * CAUTION: ALLOC_MINBITS must be large enough so that
+ * 1<<ALLOC_MINBITS is at least MAXALIGN,
+ * or we may fail to align the smallest chunks adequately.
+ * 8-byte alignment is enough on all currently known machines.
+ *
+ * With the current parameters, request sizes up to 8K are treated as chunks,
+ * larger requests go into dedicated blocks.  Change ALLOCSET_NUM_FREELISTS
+ * to adjust the boundary point.
+ *--------------------
+ */
+
+#define ALLOC_MINBITS		3	/* smallest chunk size is 8 bytes */
+#define ALLOCSET_NUM_FREELISTS	11	/* one freelist per power-of-2 chunk size */
+#define ALLOC_CHUNK_LIMIT	(1 << (ALLOCSET_NUM_FREELISTS-1+ALLOC_MINBITS))
+/* Size of largest chunk that we use a fixed size for (1 << 13 = 8K with the values above) */
+
+/*--------------------
+ * The first block allocated for an allocset has size initBlockSize.
+ * Each time we have to allocate another block, we double the block size
+ * (if possible, and without exceeding maxBlockSize), so as to reduce
+ * the bookkeeping load on malloc().
+ *
+ * Blocks allocated to hold oversize chunks do not follow this rule, however;
+ * they are just however big they need to be to hold that single chunk.
+ *--------------------
+ */
+
+#define ALLOC_BLOCKHDRSZ	MAXALIGN(sizeof(AllocBlockData))	/* block header size, rounded up by MAXALIGN */
+#define ALLOC_CHUNKHDRSZ	MAXALIGN(sizeof(AllocChunkData))	/* chunk header size, rounded up by MAXALIGN */
+
+typedef struct AllocBlockData *AllocBlock;		/* forward reference */
+typedef struct AllocChunkData *AllocChunk;		/* forward reference */
+
+/*
+ * AllocPointer
+ *		Aligned pointer which may be a member of an allocation set.
+ */
+typedef void *AllocPointer;
+
+/*
+ * AllocSetContext is our standard implementation of MemoryContext.
+ *
+ * Note: isReset means there is nothing for AllocSetReset to do.  This is
+ * different from the aset being physically empty (empty blocks list) because
+ * we may still have a keeper block.  It's also different from the set being
+ * logically empty, because we don't attempt to detect pfree'ing the last
+ * active chunk.
+ */
+typedef struct AllocSetContext
+{
+	MemoryContextData header;	/* Standard memory-context fields (incl. is_shared and lock) */
+	/* Info about storage allocated in this context: */
+	AllocBlock	blocks;			/* head of list of blocks in this set */
+	AllocChunk	freelist[ALLOCSET_NUM_FREELISTS];		/* free chunk lists */
+	bool		isReset;		/* T = no space alloced since last reset */
+	/* Allocation parameters for this context: */
+	Size		initBlockSize;	/* initial block size */
+	Size		maxBlockSize;	/* maximum block size */
+	Size		nextBlockSize;	/* next block size to allocate */
+	Size		allocChunkLimit;	/* effective chunk size limit */
+	AllocBlock	keeper;			/* if not NULL, keep this block over resets */
+} AllocSetContext;
+
+typedef AllocSetContext *AllocSet;	/* pointer form used by the routines in this file */
+
+/*
+ * AllocBlock
+ *		An AllocBlock is the unit of memory that is obtained by aset.c
+ *		from malloc().	It contains one or more AllocChunks, which are
+ *		the units requested by palloc() and freed by pfree().  AllocChunks
+ *		cannot be returned to malloc() individually, instead they are put
+ *		on freelists by pfree() and re-used by the next palloc() that has
+ *		a matching request size.
+ *
+ *		AllocBlockData is the header data for a block --- the usable space
+ *		within the block begins at the next alignment boundary.
+ */
+typedef struct AllocBlockData
+{
+	AllocSet	aset;			/* aset that owns this block */
+	AllocBlock	next;			/* next block in aset's blocks list */
+	char	   *freeptr;		/* start of free space in this block; set equal
+								 * to endptr for a dedicated single-chunk block */
+	char	   *endptr;			/* end of space in this block */
+} AllocBlockData;
+
+/*
+ * AllocChunk
+ *		The prefix of each piece of memory in an AllocBlock
+ *
+ * NB: this MUST match StandardChunkHeader.  The original comment names
+ * utils/memutils.h; this file includes gtm/memutils.h instead, so that is
+ * presumably where the matching definition lives -- TODO confirm.
+ */
+typedef struct AllocChunkData
+{
+	/* aset is the owning aset if allocated, or the freelist link if free */
+	void	   *aset;
+	/* size is always the size of the usable space in the chunk */
+	Size		size;
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* when debugging memory usage, also store actual requested size */
+	/* this is zero in a free chunk */
+	Size		requested_size;
+#endif
+} AllocChunkData;
+
+/*
+ * AllocPointerIsValid
+ *		True iff pointer is valid allocation pointer.
+ */
+#define AllocPointerIsValid(pointer) PointerIsValid(pointer)
+
+/*
+ * AllocSetIsValid
+ *		True iff set is valid allocation set.
+ */
+#define AllocSetIsValid(set) PointerIsValid(set)
+
+/*
+ * AllocPointerGetChunk / AllocChunkGetPointer
+ *		Convert between the data pointer handed to callers and the chunk
+ *		header, which sits ALLOC_CHUNKHDRSZ bytes in front of the data.
+ */
+#define AllocPointerGetChunk(ptr)	\
+					((AllocChunk)(((char *)(ptr)) - ALLOC_CHUNKHDRSZ))
+#define AllocChunkGetPointer(chk)	\
+					((AllocPointer)(((char *)(chk)) + ALLOC_CHUNKHDRSZ))
+
+/*
+ * These functions implement the MemoryContext API for AllocSet contexts.
+ */
+static void *AllocSetAlloc(MemoryContext context, Size size);
+static void AllocSetFree(MemoryContext context, void *pointer);
+static void *AllocSetRealloc(MemoryContext context, void *pointer, Size size);
+static void AllocSetInit(MemoryContext context);
+static void AllocSetReset(MemoryContext context);
+static void AllocSetDelete(MemoryContext context);
+static Size AllocSetGetChunkSpace(MemoryContext context, void *pointer);
+static bool AllocSetIsEmpty(MemoryContext context);
+static void AllocSetStats(MemoryContext context, int level);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+static void AllocSetCheck(MemoryContext context);
+#endif
+
+/*
+ * This is the virtual function table for AllocSet contexts.
+ *
+ * The initializer is positional, so the entries must stay in the member
+ * order of MemoryContextMethods (declared outside this file).  The check
+ * routine is only part of the table when MEMORY_CONTEXT_CHECKING is defined.
+ */
+static MemoryContextMethods AllocSetMethods = {
+	AllocSetAlloc,
+	AllocSetFree,
+	AllocSetRealloc,
+	AllocSetInit,
+	AllocSetReset,
+	AllocSetDelete,
+	AllocSetGetChunkSpace,
+	AllocSetIsEmpty,
+	AllocSetStats
+#ifdef MEMORY_CONTEXT_CHECKING
+	,AllocSetCheck
+#endif
+};
+
+
+/* ----------
+ * Debug macros
+ * ----------
+ */
+#ifdef HAVE_ALLOCINFO
+/*
+ * Trace helpers, active only when HAVE_ALLOCINFO is defined.  Each prints
+ * the context name, the chunk address and the chunk's usable size to stderr.
+ *
+ * _cxt:   an AllocSet (its header.name is printed)
+ * _chunk: an AllocChunk
+ *
+ * The size field is a Size, so it is cast to unsigned long and printed with
+ * %lu, matching the out-of-memory messages elsewhere in this file.  The
+ * chunk pointer is cast to void * because that is what %p expects.
+ */
+#define AllocFreeInfo(_cxt, _chunk) \
+			fprintf(stderr, "AllocFree: %s: %p, %lu\n", \
+				(_cxt)->header.name, (void *) (_chunk), \
+				(unsigned long) (_chunk)->size)
+#define AllocAllocInfo(_cxt, _chunk) \
+			fprintf(stderr, "AllocAlloc: %s: %p, %lu\n", \
+				(_cxt)->header.name, (void *) (_chunk), \
+				(unsigned long) (_chunk)->size)
+#else
+/* Tracing disabled: both helpers expand to nothing */
+#define AllocFreeInfo(_cxt, _chunk)
+#define AllocAllocInfo(_cxt, _chunk)
+#endif
+
+/* ----------
+ * AllocSetFreeIndex -
+ *
+ *		Map an allocation size to the index of the freelist that holds
+ *		chunks of the matching power-of-2 size.  A size of zero maps to
+ *		index 0.  The caller is expected to have checked that
+ *		size <= ALLOC_CHUNK_LIMIT.
+ * ----------
+ */
+static inline int
+AllocSetFreeIndex(Size size)
+{
+	int			slot = 0;
+	Size		remaining;
+
+	/* Nothing to compute for an empty request */
+	if (size == 0)
+		return slot;
+
+	/* Count how often the scaled size can be halved before it hits zero */
+	for (remaining = (size - 1) >> ALLOC_MINBITS;
+		 remaining != 0;
+		 remaining >>= 1)
+		slot++;
+
+	Assert(slot < ALLOCSET_NUM_FREELISTS);
+
+	return slot;
+}
+
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+
+/*
+ * Overwrite a freshly allocated area with pseudo-"random" bytes.  The fill
+ * is simply the sequence 1..251 repeated (251 is prime), continued from
+ * wherever the previous call stopped, so that two allocations of the same
+ * length are likely to start out with different contents.
+ *
+ * ptr:  start of the area to fill
+ * size: number of bytes to fill
+ */
+static void
+randomize_mem(char *ptr, size_t size)
+{
+	static int	save_ctr = 1;
+	int			fill = save_ctr;
+	size_t		i;
+
+	for (i = 0; i < size; i++)
+	{
+		ptr[i] = fill;
+		/* advance, wrapping from 251 back to 1 */
+		fill = (fill >= 251) ? 1 : fill + 1;
+	}
+
+	/* remember the position for the next call */
+	save_ctr = fill;
+}
+
+#endif /* RANDOMIZE_ALLOCATED_MEMORY */
+
+
+/*
+ * Public routines
+ */
+
+
+/*
+ * AllocSetContextCreate
+ *		Build a new AllocSet memory context and return it.
+ *
+ * parent: parent context, or NULL if top-level context
+ * name: name of context (for debugging --- string will be copied)
+ * minContextSize: if larger than the block + chunk header overhead, a block
+ *		of this (MAXALIGN'ed) size is malloc'ed immediately and kept as the
+ *		context's keeper block
+ * initBlockSize: initial allocation block size (raised to at least 1K)
+ * maxBlockSize: maximum allocation block size (raised to at least
+ *		initBlockSize)
+ * isShared: stored in the context header; when true, the context's lock is
+ *		initialized as well
+ *
+ * Reports ERROR if the initial block cannot be allocated.
+ */
+MemoryContext
+AllocSetContextCreate(MemoryContext parent,
+					  const char *name,
+					  Size minContextSize,
+					  Size initBlockSize,
+					  Size maxBlockSize,
+					  bool isShared)
+{
+	AllocSet	set;
+	Size		chunkLimit;
+
+	/* The type-independent part of context creation comes first */
+	set = (AllocSet) MemoryContextCreate(sizeof(AllocSetContext),
+										 &AllocSetMethods,
+										 parent,
+										 name);
+
+	/*
+	 * Sanitize the block-size parameters: align both, enforce the (somewhat
+	 * arbitrary) 1K minimum on the initial size, and never let the maximum
+	 * fall below the initial size.
+	 */
+	initBlockSize = MAXALIGN(initBlockSize);
+	maxBlockSize = MAXALIGN(maxBlockSize);
+	if (initBlockSize < 1024)
+		initBlockSize = 1024;
+	if (maxBlockSize < initBlockSize)
+		maxBlockSize = initBlockSize;
+
+	set->initBlockSize = initBlockSize;
+	set->nextBlockSize = initBlockSize;
+	set->maxBlockSize = maxBlockSize;
+
+	/*
+	 * Work out the chunk size limit.  Start from ALLOC_CHUNK_LIMIT (the
+	 * most the fixed number of freelists can serve) and halve it until a
+	 * chunk of that size, plus headers, fits into a maximum-size block.
+	 * Halving keeps the limit a power of two, which is required so that a
+	 * chunk's requested and actual sizes always fall on the same side of
+	 * the limit.
+	 */
+	chunkLimit = ALLOC_CHUNK_LIMIT;
+	while (chunkLimit >
+		   (Size) (maxBlockSize - ALLOC_BLOCKHDRSZ - ALLOC_CHUNKHDRSZ))
+		chunkLimit >>= 1;
+	set->allocChunkLimit = chunkLimit;
+
+	/*
+	 * Pre-allocate the always-present block when one was asked for
+	 */
+	if (minContextSize > ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ)
+	{
+		Size		keeperSize = MAXALIGN(minContextSize);
+		AllocBlock	keeperBlock = (AllocBlock) malloc(keeperSize);
+
+		if (keeperBlock == NULL)
+		{
+			MemoryContextStats(TopMemoryContext);
+			ereport(ERROR,
+					(ENOMEM,
+					 errmsg("out of memory"),
+					 errdetail("Failed while creating memory context \"%s\".",
+							   name)));
+		}
+
+		keeperBlock->aset = set;
+		keeperBlock->freeptr = ((char *) keeperBlock) + ALLOC_BLOCKHDRSZ;
+		keeperBlock->endptr = ((char *) keeperBlock) + keeperSize;
+
+		/* Push it onto the block list and exempt it from release at reset */
+		keeperBlock->next = set->blocks;
+		set->blocks = keeperBlock;
+		set->keeper = keeperBlock;
+	}
+
+	set->isReset = true;
+
+	/* Record the sharing mode; shared contexts need a usable lock */
+	set->header.is_shared = isShared;
+	if (isShared)
+		GTM_RWLockInit(&set->header.lock);
+
+	return (MemoryContext) set;
+}
+
+/*
+ * AllocSetInit
+ *		Context-type-specific initialization routine.
+ *
+ * This is called by MemoryContextCreate() after setting up the
+ * generic MemoryContext fields and before linking the new context
+ * into the context tree.  We must do whatever is needed to make the
+ * new context minimally valid for deletion.  We must *not* risk
+ * failure --- thus, for example, allocating more memory is not cool.
+ * (AllocSetContextCreate can allocate memory when it gets control
+ * back, however.)
+ */
+static void
+AllocSetInit(MemoryContext context)
+{
+	/*
+	 * Since MemoryContextCreate already zeroed the context node, we don't
+	 * have to do anything here: it's already OK.  The context argument is
+	 * therefore deliberately left unused.
+	 */
+}
+
+/*
+ * AllocSetReset
+ *		Release everything that was allocated in the given set.
+ *
+ * context: the AllocSet to reset
+ *
+ * All chunks become invalid, but not every resource has to be given back:
+ * the set's keeper block (if any) is merely rewound and stays attached,
+ * while every other block is returned to malloc.  That avoids thrashing
+ * malloc() for contexts that are reset over and over after small
+ * allocations.  If the set is already marked reset, nothing is done.
+ * For a shared context the whole operation runs under the context lock.
+ */
+static void
+AllocSetReset(MemoryContext context)
+{
+	AllocSet	set = (AllocSet) context;
+	AllocBlock	cur;
+	AllocBlock	following;
+
+	AssertArg(AllocSetIsValid(set));
+
+	if (MemoryContextIsShared(context))
+		MemoryContextLock(context);
+
+	/* Only do work if something was palloc'ed since startup or last reset */
+	if (!set->isReset)
+	{
+#ifdef MEMORY_CONTEXT_CHECKING
+		/* Look for corruption and leaks before anything is freed */
+		AllocSetCheck(context);
+#endif
+
+		/* Empty all chunk freelists */
+		MemSetAligned(set->freelist, 0, sizeof(set->freelist));
+
+		/*
+		 * Detach the old block list; the new list is either empty or
+		 * consists of just the keeper block.
+		 */
+		cur = set->blocks;
+		set->blocks = set->keeper;
+
+		for (; cur != NULL; cur = following)
+		{
+			/* fetch the link first, the block may be freed below */
+			following = cur->next;
+
+			if (cur != set->keeper)
+			{
+				/* Ordinary block: hand it back to malloc */
+#ifdef CLOBBER_FREED_MEMORY
+				/* Wipe freed memory for debugging purposes */
+				memset(cur, 0x7F, cur->freeptr - ((char *) cur));
+#endif
+				free(cur);
+			}
+			else
+			{
+				/* Keeper block: rewind it instead of freeing it */
+				char	   *datastart = ((char *) cur) + ALLOC_BLOCKHDRSZ;
+
+#ifdef CLOBBER_FREED_MEMORY
+				/* Wipe freed memory for debugging purposes */
+				memset(datastart, 0x7F, cur->freeptr - datastart);
+#endif
+				cur->freeptr = datastart;
+				cur->next = NULL;
+			}
+		}
+
+		/* Restart the block size progression as well */
+		set->nextBlockSize = set->initBlockSize;
+
+		set->isReset = true;
+	}
+
+	if (MemoryContextIsShared(context))
+		MemoryContextUnlock(context);
+}
+
+/*
+ * AllocSetDelete
+ *		Frees all memory which is allocated in the given set,
+ *		in preparation for deletion of the set.
+ *
+ * context: the AllocSet whose blocks are to be released
+ *
+ * Unlike AllocSetReset, this *must* free all resources of the set.
+ * But note we are not responsible for deleting the context node itself.
+ *
+ * For a shared context the block list is read only after the context lock
+ * has been taken (as AllocSetReset does), so a block linked in by another
+ * thread just before the lock was acquired cannot be missed.
+ */
+static void
+AllocSetDelete(MemoryContext context)
+{
+	AllocSet	set = (AllocSet) context;
+	AllocBlock	block;
+
+	AssertArg(AllocSetIsValid(set));
+
+	if (MemoryContextIsShared(context))
+		MemoryContextLock(context);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Check for corruption and leaks before freeing */
+	AllocSetCheck(context);
+#endif
+
+	/* Pick up the list head now that we hold the lock (if any) */
+	block = set->blocks;
+
+	/* Make it look empty, just in case... */
+	MemSetAligned(set->freelist, 0, sizeof(set->freelist));
+	set->blocks = NULL;
+	set->keeper = NULL;
+
+	while (block != NULL)
+	{
+		/* save the link before the block goes away */
+		AllocBlock	next = block->next;
+
+#ifdef CLOBBER_FREED_MEMORY
+		/* Wipe freed memory for debugging purposes */
+		memset(block, 0x7F, block->freeptr - ((char *) block));
+#endif
+		free(block);
+		block = next;
+	}
+
+	if (MemoryContextIsShared(context))
+		MemoryContextUnlock(context);
+}
+
+/*
+ * AllocSetAlloc
+ *		Returns pointer to allocated memory of given size; memory is added
+ *		to the set.
+ *
+ * context: the AllocSet to allocate from
+ * size: number of bytes requested
+ *
+ * Requests larger than set->allocChunkLimit get a dedicated malloc'ed block.
+ * Smaller requests are rounded up to a power-of-2 chunk size and served
+ * from the matching freelist or, failing that, carved out of the active
+ * (first) block, creating a new block when the active one is too full.
+ * An ERROR is reported when malloc fails.
+ *
+ * For a shared context the context lock is taken on entry and released on
+ * every exit path, including the error paths.
+ */
+static void *
+AllocSetAlloc(MemoryContext context, Size size)
+{
+	AllocSet	set = (AllocSet) context;
+	AllocBlock	block;
+	AllocChunk	chunk;
+	int			fidx;
+	Size		chunk_size;
+	Size		blksize;
+
+	AssertArg(AllocSetIsValid(set));
+
+	/*
+	 * If this is a shared context, make it thread safe by acquiring
+	 * appropriate lock
+	 */
+	if (MemoryContextIsShared(context))
+		MemoryContextLock(context);
+
+	/*
+	 * If requested size exceeds maximum for chunks, allocate an entire block
+	 * for this request.
+	 */
+	if (size > set->allocChunkLimit)
+	{
+		chunk_size = MAXALIGN(size);
+		blksize = chunk_size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ;
+		block = (AllocBlock) malloc(blksize);
+		if (block == NULL)
+		{
+			/*
+			 * NOTE(review): MemoryContextStats() runs while this context's
+			 * lock is still held -- confirm the stats code never tries to
+			 * take the same lock.
+			 */
+			MemoryContextStats(TopMemoryContext);
+			if (MemoryContextIsShared(context))
+				MemoryContextUnlock(context);
+			ereport(ERROR,
+					(ENOMEM,
+					 errmsg("out of memory"),
+					 errdetail("Failed on request of size %lu.",
+							   (unsigned long) size)));
+		}
+		block->aset = set;
+		/* freeptr == endptr: a dedicated block has no free space left */
+		block->freeptr = block->endptr = ((char *) block) + blksize;
+
+		chunk = (AllocChunk) (((char *) block) + ALLOC_BLOCKHDRSZ);
+		chunk->aset = set;
+		chunk->size = chunk_size;
+#ifdef MEMORY_CONTEXT_CHECKING
+		chunk->requested_size = size;
+		/* set mark to catch clobber of "unused" space */
+		if (size < chunk_size)
+			((char *) AllocChunkGetPointer(chunk))[size] = 0x7E;
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+		/* fill the allocated space with junk */
+		randomize_mem((char *) AllocChunkGetPointer(chunk), size);
+#endif
+
+		/*
+		 * Stick the new block underneath the active allocation block, so that
+		 * we don't lose the use of the space remaining therein.
+		 */
+		if (set->blocks != NULL)
+		{
+			block->next = set->blocks->next;
+			set->blocks->next = block;
+		}
+		else
+		{
+			block->next = NULL;
+			set->blocks = block;
+		}
+
+		set->isReset = false;
+
+		AllocAllocInfo(set, chunk);
+		if (MemoryContextIsShared(context))
+			MemoryContextUnlock(context);
+		return AllocChunkGetPointer(chunk);
+	}
+
+	/*
+	 * Request is small enough to be treated as a chunk.  Look in the
+	 * corresponding free list to see if there is a free chunk we could reuse.
+	 * If one is found, remove it from the free list, make it again a member
+	 * of the alloc set and return its data address.
+	 */
+	fidx = AllocSetFreeIndex(size);
+	chunk = set->freelist[fidx];
+	if (chunk != NULL)
+	{
+		Assert(chunk->size >= size);
+
+		/* a free chunk's aset field holds the link to the next free chunk */
+		set->freelist[fidx] = (AllocChunk) chunk->aset;
+
+		chunk->aset = (void *) set;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+		chunk->requested_size = size;
+		/* set mark to catch clobber of "unused" space */
+		if (size < chunk->size)
+			((char *) AllocChunkGetPointer(chunk))[size] = 0x7E;
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+		/* fill the allocated space with junk */
+		randomize_mem((char *) AllocChunkGetPointer(chunk), size);
+#endif
+
+		/* isReset must be false already */
+		Assert(!set->isReset);
+
+		AllocAllocInfo(set, chunk);
+		if (MemoryContextIsShared(context))
+			MemoryContextUnlock(context);
+		return AllocChunkGetPointer(chunk);
+	}
+
+	/*
+	 * Choose the actual chunk size to allocate.
+	 */
+	chunk_size = (1 << ALLOC_MINBITS) << fidx;
+	Assert(chunk_size >= size);
+
+	/*
+	 * If there is enough room in the active allocation block, we will put the
+	 * chunk into that block.  Else must start a new one.
+	 */
+	if ((block = set->blocks) != NULL)
+	{
+		Size		availspace = block->endptr - block->freeptr;
+
+		if (availspace < (chunk_size + ALLOC_CHUNKHDRSZ))
+		{
+			/*
+			 * The existing active (top) block does not have enough room for
+			 * the requested allocation, but it might still have a useful
+			 * amount of space in it.  Once we push it down in the block list,
+			 * we'll never try to allocate more space from it. So, before we
+			 * do that, carve up its free space into chunks that we can put on
+			 * the set's freelists.
+			 *
+			 * Because we can only get here when there's less than
+			 * ALLOC_CHUNK_LIMIT left in the block, this loop cannot iterate
+			 * more than ALLOCSET_NUM_FREELISTS-1 times.
+			 */
+			while (availspace >= ((1 << ALLOC_MINBITS) + ALLOC_CHUNKHDRSZ))
+			{
+				Size		availchunk = availspace - ALLOC_CHUNKHDRSZ;
+				int			a_fidx = AllocSetFreeIndex(availchunk);
+
+				/*
+				 * In most cases, we'll get back the index of the next larger
+				 * freelist than the one we need to put this chunk on.	The
+				 * exception is when availchunk is exactly a power of 2.
+				 */
+				if (availchunk != (1 << (a_fidx + ALLOC_MINBITS)))
+				{
+					a_fidx--;
+					Assert(a_fidx >= 0);
+					availchunk = (1 << (a_fidx + ALLOC_MINBITS));
+				}
+
+				chunk = (AllocChunk) (block->freeptr);
+
+				block->freeptr += (availchunk + ALLOC_CHUNKHDRSZ);
+				availspace -= (availchunk + ALLOC_CHUNKHDRSZ);
+
+				chunk->size = availchunk;
+#ifdef MEMORY_CONTEXT_CHECKING
+				chunk->requested_size = 0;		/* mark it free */
+#endif
+				/* push the carved-out chunk onto its freelist */
+				chunk->aset = (void *) set->freelist[a_fidx];
+				set->freelist[a_fidx] = chunk;
+			}
+
+			/* Mark that we need to create a new block */
+			block = NULL;
+		}
+	}
+
+	/*
+	 * Time to create a new regular (multi-chunk) block?
+	 */
+	if (block == NULL)
+	{
+		Size		required_size;
+
+		/*
+		 * The first such block has size initBlockSize, and we double the
+		 * space in each succeeding block, but not more than maxBlockSize.
+		 */
+		blksize = set->nextBlockSize;
+		set->nextBlockSize <<= 1;
+		if (set->nextBlockSize > set->maxBlockSize)
+			set->nextBlockSize = set->maxBlockSize;
+
+		/*
+		 * If initBlockSize is less than ALLOC_CHUNK_LIMIT, we could need more
+		 * space... but try to keep it a power of 2.
+		 */
+		required_size = chunk_size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ;
+		while (blksize < required_size)
+			blksize <<= 1;
+
+		/* Try to allocate it */
+		block = (AllocBlock) malloc(blksize);
+
+		/*
+		 * We could be asking for pretty big blocks here, so cope if malloc
+		 * fails.  But give up if there's less than a meg or so available...
+		 */
+		while (block == NULL && blksize > 1024 * 1024)
+		{
+			blksize >>= 1;
+			if (blksize < required_size)
+				break;
+			block = (AllocBlock) malloc(blksize);
+		}
+
+		if (block == NULL)
+		{
+			/*
+			 * NOTE(review): as in the large-request path above, the stats
+			 * dump happens with the context lock still held -- confirm
+			 * that is safe.
+			 */
+			MemoryContextStats(TopMemoryContext);
+			if (MemoryContextIsShared(context))
+				MemoryContextUnlock(context);
+			ereport(ERROR,
+					(ENOMEM,
+					 errmsg("out of memory"),
+					 errdetail("Failed on request of size %lu.",
+							   (unsigned long) size)));
+		}
+
+		block->aset = set;
+		block->freeptr = ((char *) block) + ALLOC_BLOCKHDRSZ;
+		block->endptr = ((char *) block) + blksize;
+
+		/*
+		 * If this is the first block of the set, make it the "keeper" block.
+		 * Formerly, a keeper block could only be created during context
+		 * creation, but allowing it to happen here lets us have fast reset
+		 * cycling even for contexts created with minContextSize = 0; that way
+		 * we don't have to force space to be allocated in contexts that might
+		 * never need any space.  Don't mark an oversize block as a keeper,
+		 * however.
+		 */
+		if (set->keeper == NULL && blksize == set->initBlockSize)
+			set->keeper = block;
+
+		/* the new block becomes the active (first) block of the set */
+		block->next = set->blocks;
+		set->blocks = block;
+	}
+
+	/*
+	 * OK, do the allocation
+	 */
+	chunk = (AllocChunk) (block->freeptr);
+
+	block->freeptr += (chunk_size + ALLOC_CHUNKHDRSZ);
+	Assert(block->freeptr <= block->endptr);
+
+	chunk->aset = (void *) set;
+	chunk->size = chunk_size;
+#ifdef MEMORY_CONTEXT_CHECKING
+	chunk->requested_size = size;
+	/* set mark to catch clobber of "unused" space */
+	if (size < chunk->size)
+		((char *) AllocChunkGetPointer(chunk))[size] = 0x7E;
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+	/* fill the allocated space with junk */
+	randomize_mem((char *) AllocChunkGetPointer(chunk), size);
+#endif
+
+	set->isReset = false;
+
+	AllocAllocInfo(set, chunk);
+	if (MemoryContextIsShared(context))
+		MemoryContextUnlock(context);
+	return AllocChunkGetPointer(chunk);
+}
+
+/*
+ * AllocSetFree
+ *		Give a previously allocated piece of memory back to its set.
+ *
+ * context: the AllocSet the memory was allocated from
+ * pointer: data pointer of the chunk to release
+ *
+ * Chunks no larger than set->allocChunkLimit are pushed onto the freelist
+ * for their size.  Larger chunks live in a dedicated block, which is
+ * unlinked from the set and returned to malloc; an ERROR is raised if no
+ * such block can be found.  For a shared context the work is done under
+ * the context lock.
+ */
+static void
+AllocSetFree(MemoryContext context, void *pointer)
+{
+	AllocSet	set = (AllocSet) context;
+	AllocChunk	chunk = AllocPointerGetChunk(pointer);
+
+	/* Shared contexts are protected by their lock */
+	if (MemoryContextIsShared(context))
+		MemoryContextLock(context);
+
+	AllocFreeInfo(set, chunk);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Was the sentinel byte behind the requested space overwritten? */
+	if (chunk->requested_size < chunk->size &&
+		((char *) pointer)[chunk->requested_size] != 0x7E)
+		elog(WARNING, "detected write past chunk end in %s %p",
+			 set->header.name, chunk);
+#endif
+
+	if (chunk->size <= set->allocChunkLimit)
+	{
+		/* Ordinary chunk: link it into the freelist for its size */
+		int			slot = AllocSetFreeIndex(chunk->size);
+
+		chunk->aset = (void *) set->freelist[slot];
+
+#ifdef CLOBBER_FREED_MEMORY
+		/* Wipe freed memory for debugging purposes */
+		memset(pointer, 0x7F, chunk->size);
+#endif
+
+#ifdef MEMORY_CONTEXT_CHECKING
+		/* Chunks sitting on a freelist carry requested_size 0 */
+		chunk->requested_size = 0;
+#endif
+		set->freelist[slot] = chunk;
+	}
+	else
+	{
+		/*
+		 * Oversize chunk: it was allocated as the only chunk of its own
+		 * block.  Locate that block, remembering its predecessor so that
+		 * it can be unlinked.
+		 */
+		AllocBlock	owner;
+		AllocBlock	before = NULL;
+
+		for (owner = set->blocks; owner != NULL; owner = owner->next)
+		{
+			if (chunk == (AllocChunk) (((char *) owner) + ALLOC_BLOCKHDRSZ))
+				break;
+			before = owner;
+		}
+
+		if (owner == NULL)
+		{
+			if (MemoryContextIsShared(context))
+				MemoryContextUnlock(context);
+			elog(ERROR, "could not find block containing chunk %p", chunk);
+		}
+
+		/* the chunk must be the sole occupant of the block */
+		Assert(owner->freeptr == ((char *) owner) +
+			   (chunk->size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ));
+
+		/* Unlink the block from the set's list ... */
+		if (before != NULL)
+			before->next = owner->next;
+		else
+			set->blocks = owner->next;
+
+#ifdef CLOBBER_FREED_MEMORY
+		/* Wipe freed memory for debugging purposes */
+		memset(owner, 0x7F, owner->freeptr - ((char *) owner));
+#endif
+		/* ... and hand it back to malloc */
+		free(owner);
+	}
+
+	if (MemoryContextIsShared(context))
+		MemoryContextUnlock(context);
+}
+
+/*
+ * AllocSetRealloc
+ *		Returns new pointer to allocated memory of given size; this memory
+ *		is added to the set.  Memory associated with given pointer is copied
+ *		into the new memory, and the old memory is freed.
+ *
+ * context: the AllocSet the chunk belongs to
+ * pointer: data pointer of the existing chunk
+ * size: new requested size in bytes
+ *
+ * Three cases are handled:
+ *	- the existing chunk is already big enough: the same pointer is returned;
+ *	- the chunk is an oversize (single-chunk block) allocation: the block is
+ *	  grown with realloc() and relinked into the set;
+ *	- otherwise: a new chunk is allocated, the old contents are copied and
+ *	  the old chunk is freed.
+ *
+ * For a shared context the lock is held while the first two cases run.  In
+ * the third case it is released before calling AllocSetAlloc() and
+ * AllocSetFree(), each of which takes the lock itself.
+ */
+static void *
+AllocSetRealloc(MemoryContext context, void *pointer, Size size)
+{
+	AllocSet	set = (AllocSet) context;
+	AllocChunk	chunk = AllocPointerGetChunk(pointer);
+	Size		oldsize = chunk->size;	/* read before the lock is taken */
+
+	if (MemoryContextIsShared(context))
+		MemoryContextLock(context);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Test for someone scribbling on unused space in chunk */
+	if (chunk->requested_size < oldsize)
+		if (((char *) pointer)[chunk->requested_size] != 0x7E)
+			elog(WARNING, "detected write past chunk end in %s %p",
+				 set->header.name, chunk);
+#endif
+
+	/* isReset must be false already */
+	Assert(!set->isReset);
+
+	/*
+	 * Chunk sizes are aligned to power of 2 in AllocSetAlloc(). Maybe the
+	 * allocated area already is >= the new size.  (In particular, we always
+	 * fall out here if the requested size is a decrease.)
+	 */
+	if (oldsize >= size)
+	{
+#ifdef MEMORY_CONTEXT_CHECKING
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+		/* We can only fill the extra space if we know the prior request */
+		if (size > chunk->requested_size)
+			randomize_mem((char *) AllocChunkGetPointer(chunk) + chunk->requested_size,
+						  size - chunk->requested_size);
+#endif
+
+		chunk->requested_size = size;
+		/* set mark to catch clobber of "unused" space */
+		if (size < oldsize)
+			((char *) pointer)[size] = 0x7E;
+#endif
+		if (MemoryContextIsShared(context))
+			MemoryContextUnlock(context);
+		return pointer;
+	}
+
+	if (oldsize > set->allocChunkLimit)
+	{
+		/*
+		 * The chunk must have been allocated as a single-chunk block.	Find
+		 * the containing block and use realloc() to make it bigger with
+		 * minimum space wastage.
+		 */
+		AllocBlock	block = set->blocks;
+		AllocBlock	prevblock = NULL;
+		Size		chksize;
+		Size		blksize;
+
+		/* walk the block list, tracking the predecessor for relinking */
+		while (block != NULL)
+		{
+			if (chunk == (AllocChunk) (((char *) block) + ALLOC_BLOCKHDRSZ))
+				break;
+			prevblock = block;
+			block = block->next;
+		}
+		if (block == NULL)
+		{
+			if (MemoryContextIsShared(context))
+				MemoryContextUnlock(context);
+			elog(ERROR, "could not find block containing chunk %p", chunk);
+		}
+		/* let's just make sure chunk is the only one in the block */
+		Assert(block->freeptr == ((char *) block) +
+			   (chunk->size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ));
+
+		/* Do the realloc */
+		chksize = MAXALIGN(size);
+		blksize = chksize + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ;
+		block = (AllocBlock) realloc(block, blksize);
+		if (block == NULL)
+		{
+			/*
+			 * The old block is still linked into the set at this point,
+			 * since the list has not been modified yet.
+			 */
+			MemoryContextStats(TopMemoryContext);
+			if (MemoryContextIsShared(context))
+				MemoryContextUnlock(context);
+			ereport(ERROR,
+					(ENOMEM,
+					 errmsg("out of memory"),
+					 errdetail("Failed on request of size %lu.",
+							   (unsigned long) size)));
+		}
+		block->freeptr = block->endptr = ((char *) block) + blksize;
+
+		/* Update pointers since block has likely been moved */
+		chunk = (AllocChunk) (((char *) block) + ALLOC_BLOCKHDRSZ);
+		if (prevblock == NULL)
+			set->blocks = block;
+		else
+			prevblock->next = block;
+		chunk->size = chksize;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+		/* We can only fill the extra space if we know the prior request */
+		randomize_mem((char *) AllocChunkGetPointer(chunk) + chunk->requested_size,
+					  size - chunk->requested_size);
+#endif
+
+		chunk->requested_size = size;
+		/* set mark to catch clobber of "unused" space */
+		if (size < chunk->size)
+			((char *) AllocChunkGetPointer(chunk))[size] = 0x7E;
+#endif
+
+		if (MemoryContextIsShared(context))
+			MemoryContextUnlock(context);
+		return AllocChunkGetPointer(chunk);
+	}
+	else
+	{
+		/*
+		 * Small-chunk case.  We just do this by brute force, ie, allocate a
+		 * new chunk and copy the data.  Since we know the existing data isn't
+		 * huge, this won't involve any great memcpy expense, so it's not
+		 * worth being smarter.  (At one time we tried to avoid memcpy when it
+		 * was possible to enlarge the chunk in-place, but that turns out to
+		 * misbehave unpleasantly for repeated cycles of
+		 * palloc/repalloc/pfree: the eventually freed chunks go into the
+		 * wrong freelist for the next initial palloc request, and so we leak
+		 * memory indefinitely.  See pgsql-hackers archives for 2007-08-11.)
+		 */
+		AllocPointer newPointer;
+
+		/*
+		 * Drop the lock first: AllocSetAlloc() and AllocSetFree() below each
+		 * acquire it themselves for a shared context.
+		 */
+		if (MemoryContextIsShared(context))
+			MemoryContextUnlock(context);
+		/* allocate new chunk */
+		newPointer = AllocSetAlloc((MemoryContext) set, size);
+
+		/* transfer existing data (certain to fit) */
+		memcpy(newPointer, pointer, oldsize);
+
+		/* free old chunk */
+		AllocSetFree((MemoryContext) set, pointer);
+
+		return newPointer;
+	}
+}
+
+/*
+ * AllocSetGetChunkSpace
+ *		Report the full footprint of a currently-allocated chunk: the size
+ *		recorded in its header plus the chunk header itself.
+ */
+static Size
+AllocSetGetChunkSpace(MemoryContext context, void *pointer)
+{
+	AllocChunk	hdr = AllocPointerGetChunk(pointer);
+	Size		total = ALLOC_CHUNKHDRSZ + hdr->size;
+	return total;
+}
+
+/*
+ * AllocSetIsEmpty
+ *		Report whether an allocset currently holds no allocated space.
+ */
+static bool
+AllocSetIsEmpty(MemoryContext context)
+{
+	AllocSet	aset = (AllocSet) context;
+	bool		is_empty;
+
+	if (MemoryContextIsShared(context))
+		MemoryContextLock(context);
+
+	/*
+	 * "Empty" only means "new or just reset" for now.  Walking the
+	 * freelists could prove that all space has been freed, but present
+	 * uses of this function do not need that precision.
+	 */
+	is_empty = aset->isReset ? true : false;
+
+	if (MemoryContextIsShared(context))
+		MemoryContextUnlock(context);
+
+	return is_empty;
+}
+
+/*
+ * AllocSetStats
+ *		Print a one-line memory usage summary for an allocset on stderr.
+ */
+static void
+AllocSetStats(MemoryContext context, int level)
+{
+	AllocSet	aset = (AllocSet) context;
+	long		block_count = 0;
+	long		free_chunk_count = 0;
+	long		bytes_total = 0;
+	long		bytes_free = 0;
+	AllocBlock	blk = aset->blocks;
+	AllocChunk	fchunk;
+	int			list_no = 0;
+	int			indent = level;
+
+	/*
+	 * XXX No locking here: for shared contexts the caller is most likely
+	 * holding the lock already.  This might cause a problem some time, so
+	 * revisit this later.
+	 */
+	while (blk != NULL)
+	{
+		block_count++;
+		bytes_total += blk->endptr - ((char *) blk);
+		bytes_free += blk->endptr - blk->freeptr;
+		blk = blk->next;
+	}
+	/* free chunks are chained through their aset field */
+	for (; list_no < ALLOCSET_NUM_FREELISTS; list_no++)
+		for (fchunk = aset->freelist[list_no]; fchunk != NULL;
+			 fchunk = (AllocChunk) fchunk->aset)
+		{
+			free_chunk_count++;
+			bytes_free += fchunk->size + ALLOC_CHUNKHDRSZ;
+		}
+
+	while (indent-- > 0)
+		fprintf(stderr, "  ");
+
+	fprintf(stderr,
+			"%s: %lu total in %ld blocks; %lu free (%ld chunks); %lu used\n",
+			aset->header.name, bytes_total, block_count, bytes_free,
+			free_chunk_count, bytes_total - bytes_free);
+}
+
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+/*
+ * AllocSetCheck
+ *		Walk every chunk of every block and warn about inconsistencies.
+ *
+ * NOTE: report errors as WARNING, *not* ERROR or FATAL.  Otherwise you'll
+ * find yourself in an infinite loop when trouble occurs, because this
+ * routine will be entered again when elog cleanup tries to release memory!
+ */
+static void
+AllocSetCheck(MemoryContext context)
+{
+	AllocSet	set = (AllocSet) context;
+	char	   *name = set->header.name;
+	AllocBlock	block;
+
+	/*
+	 * XXX The caller is most likely holding a lock for shared contexts. So
+	 * don't bother to lock it again (this might cause a problem some time,
+	 * so revisit this later)
+	 */
+	for (block = set->blocks; block != NULL; block = block->next)
+	{
+		char	   *bpoz = ((char *) block) + ALLOC_BLOCKHDRSZ;
+		long		blk_used = block->freeptr - bpoz;
+		long		blk_data = 0;
+		long		nchunks = 0;
+
+		/*
+		 * Only the keeper block is allowed to be completely unused
+		 */
+		if (!blk_used)
+		{
+			if (set->keeper != block)
+				elog(WARNING, "problem in alloc set %s: empty block %p",
+					 name, block);
+		}
+
+		/*
+		 * Chunk walker: step from chunk header to chunk header up to freeptr
+		 */
+		while (bpoz < block->freeptr)
+		{
+			AllocChunk	chunk = (AllocChunk) bpoz;
+			Size		chsize,
+						dsize;
+			char	   *chdata_end;
+
+			chsize = chunk->size;		/* aligned chunk size */
+			dsize = chunk->requested_size;		/* real data */
+			chdata_end = ((char *) chunk) + (ALLOC_CHUNKHDRSZ + dsize);
+
+			/*
+			 * Check chunk size: request must fit, and size must reach the minimum
+			 */
+			if (dsize > chsize)
+				elog(WARNING, "problem in alloc set %s: req size > alloc size for chunk %p in block %p",
+					 name, chunk, block);
+			if (chsize < (1 << ALLOC_MINBITS))
+				elog(WARNING, "problem in alloc set %s: bad size %lu for chunk %p in block %p",
+					 name, (unsigned long) chsize, chunk, block);
+
+			/* a chunk above allocChunkLimit must be alone in its block */
+			if (chsize > set->allocChunkLimit &&
+				chsize + ALLOC_CHUNKHDRSZ != blk_used)
+				elog(WARNING, "problem in alloc set %s: bad single-chunk %p in block %p",
+					 name, chunk, block);
+
+			/*
+			 * If chunk is allocated, check for correct aset pointer. (If it's
+			 * free, the aset is the freelist pointer, which we can't check as
+			 * easily...)
+			 */
+			if (dsize > 0 && chunk->aset != (void *) set)
+				elog(WARNING, "problem in alloc set %s: bogus aset link in block %p, chunk %p",
+					 name, block, chunk);
+
+			/*
+			 * Check for overwrite of "unallocated" space: expect the 0x7E mark
+			 */
+			if (dsize > 0 && dsize < chsize && *chdata_end != 0x7E)
+				elog(WARNING, "problem in alloc set %s: detected write past chunk end in block %p, chunk %p",
+					 name, block, chunk);
+
+			blk_data += chsize;
+			nchunks++;
+
+			bpoz += ALLOC_CHUNKHDRSZ + chsize;
+		}
+
+		if ((blk_data + (nchunks * ALLOC_CHUNKHDRSZ)) != blk_used)
+			elog(WARNING, "problem in alloc set %s: found inconsistent memory block %p",
+				 name, block);
+	}
+}
+
+#endif   /* MEMORY_CONTEXT_CHECKING */
diff --git a/src/gtm/common/assert.c b/src/gtm/common/assert.c
new file mode 100644
index 0000000000..58b94481b3
--- /dev/null
+++ b/src/gtm/common/assert.c
@@ -0,0 +1,54 @@
+/*-------------------------------------------------------------------------
+ *
+ * assert.c
+ *	  Assert code.
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/src/backend/utils/error/assert.c,v 1.35 2008/01/01 19:45:53 momjian Exp $
+ *
+ * NOTE
+ *	  This should eventually work with elog()
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "gtm/gtm_c.h"
+#include "gtm/assert.h"
+
+#include <unistd.h>
+
+bool assert_enabled = false;
+
+/*
+ * ExceptionalCondition - report a failed Assert() on stderr and abort
+ *
+ * Note: this can't actually return; the int return type exists only
+ * because the TrapMacro() macro might get wonky otherwise.
+ */
+int
+ExceptionalCondition(const char *conditionName,
+					 const char *errorType,
+					 const char *fileName,
+					 int lineNumber)
+{
+	bool		args_ok = PointerIsValid(conditionName) &&
+		PointerIsValid(fileName) &&
+		PointerIsValid(errorType);
+
+	if (args_ok)
+		fprintf(stderr, "TRAP: %s(\"%s\", File: \"%s\", Line: %d)\n",
+				errorType, conditionName, fileName, lineNumber);
+	else
+		fprintf(stderr, "TRAP: ExceptionalCondition: bad arguments\n");
+
+	/* normally redundant, but make sure the message really went out */
+	fflush(stderr);
+
+	/* the return after abort() is never reached; it matches the int type */
+	abort();
+	return 0;
+}
diff --git a/src/gtm/common/elog.c b/src/gtm/common/elog.c
new file mode 100644
index 0000000000..626dc36925
--- /dev/null
+++ b/src/gtm/common/elog.c
@@ -0,0 +1,1117 @@
+/*-------------------------------------------------------------------------
+ *
+ * elog.c
+ *	  error logging and reporting
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/src/backend/utils/error/elog.c,v 1.212 2009/01/19 15:34:23 mha Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <fcntl.h>
+#include <time.h>
+#include <unistd.h>
+#include <signal.h>
+#include <ctype.h>
+#include "gtm/gtm_c.h"
+#include "gtm/gtm.h"
+#include "gtm/gtm_msg.h"
+#include "gtm/stringinfo.h"
+#include "gtm/memutils.h"
+#include "gtm/elog.h"
+#include "gtm/assert.h"
+#include "gtm/gtm_ext.h"
+#include "gtm/libpq.h"
+#include "gtm/pqformat.h"
+
+#undef _
+#define _(x)	x
+
+/*
+ * Change this to something which is more appropriate.
+ *
+ * XXX The GTM should take a command-line argument to set the log file
+ */
+char *GTMLogFile = NULL;
+
+/* GUC parameters */
+int			Log_destination = LOG_DESTINATION_STDERR;
+
+/* Sanity check: with no open error frame, reset the stack depth and raise ERROR */
+#define CHECK_STACK_DEPTH() \
+	do { \
+		if (errordata_stack_depth < 0) \
+		{ \
+			errordata_stack_depth = -1; \
+			ereport(ERROR, (errmsg_internal("errstart was not called"))); \
+		} \
+	} while (0)
+
+
+static void send_message_to_server_log(ErrorData *edata);
+static void send_message_to_frontend(Port *myport, ErrorData *edata);
+static char *expand_fmt_string(const char *fmt, ErrorData *edata);
+static const char *useful_strerror(int errnum);
+static const char *error_severity(int elevel);
+static void append_with_tabs(StringInfo buf, const char *str);
+static bool is_log_level_output(int elevel, int log_min_level);
+
+int	log_min_messages = WARNING;	/* threshold handed to is_log_level_output() */
+char	   *Log_line_prefix = "%l:%p:%m -";		/* prefix format; log_line_prefix() expands %l, %p and %m */
+
+#define FORMATTED_TS_LEN 128	/* size of the timestamp buffers below */
+static char formatted_start_time[FORMATTED_TS_LEN];	/* cleared by log_line_prefix() */
+static char formatted_log_time[FORMATTED_TS_LEN];	/* filled by setup_formatted_log_time() */
+
+static void log_line_prefix(StringInfo buf);
+static void setup_formatted_log_time(void);
+/*
+ * setup formatted_log_time, for consistent times between CSV and regular logs
+ */
+static void
+setup_formatted_log_time(void)
+{
+	struct timeval tv;
+	time_t	stamp_time;
+	struct tm	tm_buf;		/* localtime_r target; localtime() shares one static result */
+	char		msbuf[8];
+
+	gettimeofday(&tv, NULL);
+	stamp_time = (time_t) tv.tv_sec;
+
+	/* the five blanks after %S leave room for the ".mmm" milliseconds */
+	strftime(formatted_log_time, FORMATTED_TS_LEN, "%Y-%m-%d %H:%M:%S     %Z",
+			 localtime_r(&stamp_time, &tm_buf));
+
+	/* 'paste' milliseconds into place, starting right after the seconds */
+	sprintf(msbuf, ".%03d", (int) (tv.tv_usec / 1000));
+	strncpy(formatted_log_time + 19, msbuf, 4);
+}
+
+/*
+ * Format tag info for log lines; append to the provided buffer.
+ */
+static void
+log_line_prefix(StringInfo buf)
+{
+	/* static counter for line numbers */
+	static long log_line_number = 0;
+
+	/* thread the counter belongs to; wide enough for the "%lu" id below */
+	static unsigned long log_my_tid = 0;
+
+	int			format_len;
+	int			i;
+
+	/*
+	 * Restart the line counter whenever a different thread id shows up,
+	 * and clear the formatted start timestamp along with it.  The id is
+	 * kept as unsigned long, the same width it is printed with for %p; a
+	 * narrower int could never compare equal to a wider id.
+	 */
+	if (log_my_tid != (unsigned long) MyThreadID)
+	{
+		log_line_number = 0;
+		log_my_tid = (unsigned long) MyThreadID;
+		formatted_start_time[0] = '\0';
+	}
+	log_line_number++;
+
+	if (Log_line_prefix == NULL)
+		return;					/* in case guc hasn't run yet */
+
+	format_len = strlen(Log_line_prefix);
+
+	for (i = 0; i < format_len; i++)
+	{
+		if (Log_line_prefix[i] != '%')
+		{
+			/* literal char, just copy */
+			appendStringInfoChar(buf, Log_line_prefix[i]);
+			continue;
+		}
+		/* go to char after '%' */
+		i++;
+		if (i >= format_len)
+			break;				/* format error - ignore it */
+
+		/* process the option: %p thread id, %l line number, %m timestamp */
+		switch (Log_line_prefix[i])
+		{
+			case 'p':
+				appendStringInfo(buf, "%lu", (unsigned long) MyThreadID);
+				break;
+			case 'l':
+				appendStringInfo(buf, "%ld", log_line_number);
+				break;
+			case 'm':
+				setup_formatted_log_time();
+				appendStringInfoString(buf, formatted_log_time);
+				break;
+			default:
+				/* format error - ignore it */
+				break;
+		}
+	}
+}
+
+/*
+ * errstart --- begin an error-reporting cycle
+ *
+ * Create a stack entry and store the given parameters in it.  Subsequently,
+ * errmsg() and perhaps other routines will be called to further populate
+ * the stack entry.  Finally, errfinish() will be called to actually process
+ * the error report.  The domain argument is not referenced by this function.
+ *
+ * Returns TRUE in normal case.  Returns FALSE to short-circuit the error
+ * report (if it's a warning or lower and not to be reported anywhere).
+ */
+bool
+errstart(int elevel, const char *filename, int lineno,
+		 const char *funcname, const char *domain)
+{
+	ErrorData	*edata;
+	bool		output_to_server;
+	bool		output_to_client = false;
+	int			i;
+
+	/*
+	 * Check some cases in which we want to promote an error into a more
+	 * severe error.  None of this logic applies for non-error messages.
+	 */
+	if (elevel >= ERROR)
+	{
+		/*
+		 * If we are inside a critical section, all errors become PANIC
+		 * errors.
+		 */
+		if (CritSectionCount > 0)
+			elevel = PANIC;
+
+		/*
+		 * Check reasons for treating ERROR as FATAL.
+		 *
+		 * The only reason tested here is that we have no handler to pass
+		 * the error to, i.e. PG_exception_stack is NULL.
+		 *
+		 * NOTE(review): earlier wording of this comment listed two further
+		 * reasons (an ExitOnAnyError mode switch, and errors occurring
+		 * after proc_exit has begun to run).  Neither is checked by the
+		 * code below; confirm that leaving them out is intentional.
+		 */
+		if (elevel == ERROR)
+		{
+			if (PG_exception_stack == NULL)
+				elevel = FATAL;
+		}
+
+		/*
+		 * If the error level is ERROR or more, errfinish is not going to
+		 * return to caller; therefore, if there is any stacked error already
+		 * in progress it will be lost.  This is more or less okay, except we
+		 * do not want to have a FATAL or PANIC error downgraded because the
+		 * reporting process was interrupted by a lower-grade error.  So check
+		 * the stack and make sure we panic if panic is warranted.
+		 */
+		for (i = 0; i <= errordata_stack_depth; i++)
+			elevel = Max(elevel, errordata[i].elevel);
+	}
+
+	output_to_server = is_log_level_output(elevel, log_min_messages);
+	output_to_client = (elevel >= ERROR);	/* only ERROR and above reach the client */
+
+	/* Skip processing effort if non-error message will not be output */
+	if (elevel < ERROR && !output_to_server && !output_to_client)
+		return false;
+
+	/*
+	 * Okay, crank up a stack entry to store the info in.
+	 */
+
+	if (recursion_depth++ > 0 && elevel >= ERROR)
+	{
+		/*
+		 * Ooops, error during error processing.  Clear ErrorContext so
+		 * that there is room to report this one.  We will not return to the
+		 * original error's reporter or handler, so we don't need it.
+		 */
+		MemoryContextReset(ErrorContext);
+	}
+
+	if (++errordata_stack_depth >= ERRORDATA_STACK_SIZE)
+	{
+		/*
+		 * Wups, stack not big enough.	We treat this as a PANIC condition
+		 * because it suggests an infinite loop of errors during error
+		 * recovery.
+		 */
+		errordata_stack_depth = -1;		/* make room on stack */
+		ereport(PANIC, (errmsg_internal("ERRORDATA_STACK_SIZE exceeded")));
+	}
+	/* Initialize data for this error frame */
+	edata = &errordata[errordata_stack_depth];
+	MemSet(edata, 0, sizeof(ErrorData));
+	edata->elevel = elevel;
+	edata->output_to_server = output_to_server;
+	edata->output_to_client = output_to_client;
+	edata->filename = filename;
+	edata->lineno = lineno;
+	edata->funcname = funcname;
+	/* errno is saved here so that error parameter eval can't change it */
+	edata->saved_errno = errno;
+
+	recursion_depth--;
+	return true;
+}
+
+/*
+ * errfinish --- end an error-reporting cycle
+ *
+ * Produce the appropriate error report(s) and pop the error stack.
+ *
+ * If elevel is ERROR or worse, control does not return to the caller.
+ * See elog.h for the error level definitions.
+ */
+void
+errfinish(int dummy,...)
+{
+	ErrorData  *edata = &errordata[errordata_stack_depth];
+	int			elevel;
+	MemoryContext oldcontext;
+
+	recursion_depth++;
+	CHECK_STACK_DEPTH();
+	elevel = edata->elevel;		/* read only after the depth check passed */
+
+	/*
+	 * Do processing in ErrorContext, which we hope has enough reserved space
+	 * to report an error.
+	 */
+	oldcontext = MemoryContextSwitchTo(ErrorContext);
+
+	/*
+	 * If ERROR (not more nor less) we pass it off to the current handler.
+	 * Printing it and popping the stack is the responsibility of the handler.
+	 */
+	if (elevel == ERROR)
+	{
+		/*
+		 * We do some minimal cleanup before longjmp'ing so that handlers can
+		 * execute in a reasonably sane state.
+		 */
+		CritSectionCount = 0;	/* should be unnecessary, but... */
+
+		/*
+		 * Note that we leave CurrentMemoryContext set to ErrorContext. The
+		 * handler should reset it to something else soon.
+		 */
+
+		recursion_depth--;
+		PG_RE_THROW();
+	}
+
+	/* Emit the message to the right places */
+	EmitErrorReport(MyPort);
+
+	/* Now free up subsidiary data attached to stack entry, and release it */
+	if (edata->message)
+		pfree(edata->message);
+	if (edata->detail)
+		pfree(edata->detail);
+	if (edata->detail_log)
+		pfree(edata->detail_log);
+	if (edata->hint)
+		pfree(edata->hint);
+	if (edata->context)
+		pfree(edata->context);
+	errordata_stack_depth--;
+
+	/* Exit error-handling context */
+	MemoryContextSwitchTo(oldcontext);
+	recursion_depth--;
+
+	/*
+	 * Perform error recovery action as specified by elevel.
+	 */
+	if (elevel == FATAL)
+	{
+		/*
+		 * fflush here is just to improve the odds that we get to see the
+		 * error message, in case things are so hosed that the thread exit
+		 * below misbehaves.  Keep this branch minimal: nothing after the
+		 * pthread_exit() call is ever executed.
+		 */
+		fflush(stdout);
+		fflush(stderr);
+
+		/*
+		 * FATAL ends the calling thread only: pthread_exit() does not
+		 * return, and no process-wide exit is requested here.
+		 * NOTE(review): confirm that per-thread cleanup (locks, the
+		 * client connection) is handled elsewhere on this path.
+		 */
+		pthread_exit(NULL);
+	}
+
+	if (elevel >= PANIC)
+	{
+		fflush(stdout);
+		fflush(stderr);
+		abort();
+	}
+
+	/*
+	 * We reach here if elevel <= WARNING. OK to return to caller.
+	 */
+}
+
+/*
+ * This macro handles expansion of a format string and associated parameters;
+ * it's common code for errmsg(), errdetail(), etc.  Must be called inside
+ * a routine that is declared like "const char *fmt, ..." and has an edata
+ * pointer set up.  The message is assigned to edata->targetfield, or
+ * appended to it (after a newline) if appendval is true.  The translateit
+ * argument is accepted but not referenced by the macro body.
+ *
+ * Note: we pstrdup the buffer rather than just transferring its storage
+ * to the edata field because the buffer might be considerably larger than
+ * really necessary.
+ */
+#define EVALUATE_MESSAGE(targetfield, appendval, translateit)  \
+	{ \
+		char		   *fmtbuf; \
+		StringInfoData	buf; \
+		/* Expand %m in format string */ \
+		fmtbuf = expand_fmt_string(fmt, edata); \
+		initStringInfo(&buf); \
+		if ((appendval) && edata->targetfield) \
+			appendStringInfo(&buf, "%s\n", edata->targetfield); \
+		/* Generate actual output --- have to use appendStringInfoVA */ \
+		for (;;) \
+		{ \
+			va_list		args; \
+			bool		success; \
+			va_start(args, fmt); \
+			success = appendStringInfoVA(&buf, fmtbuf, args); \
+			va_end(args); \
+			if (success) \
+				break; \
+			enlargeStringInfo(&buf, buf.maxlen); \
+		} \
+		/* Done with expanded fmt */ \
+		pfree(fmtbuf); \
+		/* Save the completed message into the stack item */ \
+		if (edata->targetfield) \
+			pfree(edata->targetfield); \
+		edata->targetfield = pstrdup(buf.data); \
+		pfree(buf.data); \
+	}
+
+
+/*
+ * errmsg --- add a primary error message text to the current error
+ *
+ * In addition to the usual %-escapes recognized by printf, "%m" in
+ * fmt is replaced by the error message for the caller's value of errno.
+ *
+ * Note: no newline is needed at the end of the fmt string, since
+ * ereport will provide one for the output methods that need it.
+ */
+int
+errmsg(const char *fmt,...)
+{
+	MemoryContext caller_cxt;
+	ErrorData  *edata = &errordata[errordata_stack_depth];
+
+	recursion_depth++;
+	CHECK_STACK_DEPTH();
+
+	/* format the primary message inside ErrorContext, then switch back */
+	caller_cxt = MemoryContextSwitchTo(ErrorContext);
+	EVALUATE_MESSAGE(message, false, true);
+	MemoryContextSwitchTo(caller_cxt);
+	recursion_depth--;
+	return 0;					/* the value itself is irrelevant */
+}
+
+
+/*
+ * errmsg_internal --- add a primary error message text to the current error
+ *
+ * This is exactly like errmsg() except that strings passed to errmsg_internal
+ * are not translated, and are customarily left out of the
+ * internationalization message dictionary.  This should be used for "can't
+ * happen" cases that are probably not worth spending translation effort on.
+ * We also use this for certain cases where we *must* not try to translate
+ * the message because the translation would fail and result in infinite
+ * error recursion.
+ */
+int
+errmsg_internal(const char *fmt,...)
+{
+	MemoryContext caller_cxt;
+	ErrorData  *edata = &errordata[errordata_stack_depth];
+
+	recursion_depth++;
+	CHECK_STACK_DEPTH();
+
+	/* same as errmsg(), but passes translateit = false to the macro */
+	caller_cxt = MemoryContextSwitchTo(ErrorContext);
+	EVALUATE_MESSAGE(message, false, false);
+	MemoryContextSwitchTo(caller_cxt);
+	recursion_depth--;
+	return 0;					/* the value itself is irrelevant */
+}
+
+
+/*
+ * errdetail --- add a detail error message text to the current error
+ */
+int
+errdetail(const char *fmt,...)
+{
+	MemoryContext caller_cxt;
+	ErrorData  *edata = &errordata[errordata_stack_depth];
+
+	recursion_depth++;
+	CHECK_STACK_DEPTH();
+
+	/* format the detail text inside ErrorContext, then switch back */
+	caller_cxt = MemoryContextSwitchTo(ErrorContext);
+	EVALUATE_MESSAGE(detail, false, true);
+	MemoryContextSwitchTo(caller_cxt);
+	recursion_depth--;
+	return 0;					/* the value itself is irrelevant */
+}
+
+
+/*
+ * errdetail_log --- add a detail_log error message text to the current error
+ */
+int
+errdetail_log(const char *fmt,...)
+{
+	MemoryContext caller_cxt;
+	ErrorData  *edata = &errordata[errordata_stack_depth];
+
+	recursion_depth++;
+	CHECK_STACK_DEPTH();
+
+	/* format the detail_log text inside ErrorContext, then switch back */
+	caller_cxt = MemoryContextSwitchTo(ErrorContext);
+	EVALUATE_MESSAGE(detail_log, false, true);
+	MemoryContextSwitchTo(caller_cxt);
+	recursion_depth--;
+	return 0;					/* the value itself is irrelevant */
+}
+
+
+/*
+ * errhint --- add a hint error message text to the current error
+ */
+int
+errhint(const char *fmt,...)
+{
+	MemoryContext caller_cxt;
+	ErrorData  *edata = &errordata[errordata_stack_depth];
+
+	recursion_depth++;
+	CHECK_STACK_DEPTH();
+
+	/* format the hint text inside ErrorContext, then switch back */
+	caller_cxt = MemoryContextSwitchTo(ErrorContext);
+	EVALUATE_MESSAGE(hint, false, true);
+	MemoryContextSwitchTo(caller_cxt);
+	recursion_depth--;
+	return 0;					/* the value itself is irrelevant */
+}
+
+
+
+/*
+ * errfunction --- add reporting function name to the current error
+ *
+ * Stores funcname in the top error frame and sets its show_funcname flag.
+ * The string is stored by pointer, not copied, so it must outlive the
+ * report.  Raises an error via CHECK_STACK_DEPTH() if no frame is open.
+ */
+int
+errfunction(const char *funcname)
+{
+	ErrorData  *edata = &errordata[errordata_stack_depth];
+
+	CHECK_STACK_DEPTH();		/* never write into errordata[-1] */
+	edata->funcname = funcname;
+	edata->show_funcname = true;
+
+	return 0;					/* return value does not matter */
+}
+
+
+/*
+ * elog_start --- startup for old-style API
+ *
+ * All that we do here is stash the hidden filename/lineno/funcname
+ * arguments into a stack entry.
+ *
+ * We need this to be separate from elog_finish because there's no other
+ * portable way to deal with inserting extra arguments into the elog call.
+ * (If macros with variable numbers of arguments were portable, it'd be
+ * easy, but they aren't.)
+ */
+void
+elog_start(const char *filename, int lineno, const char *funcname)
+{
+	ErrorData  *frame;
+
+	/* claim the next slot of the error stack */
+	errordata_stack_depth++;
+	if (errordata_stack_depth >= ERRORDATA_STACK_SIZE)
+	{
+		/*
+		 * Out of slots.  That suggests errors looping during error
+		 * recovery, so treat it as a PANIC condition.  The message is
+		 * intentionally not localized, to avoid provoking more recursion.
+		 */
+		errordata_stack_depth = -1;		/* make room on stack */
+		ereport(PANIC, (errmsg_internal("ERRORDATA_STACK_SIZE exceeded")));
+	}
+
+	frame = &errordata[errordata_stack_depth];
+	/* save errno now, before evaluating the elog arguments can change it */
+	frame->saved_errno = errno;
+	frame->funcname = funcname;
+	frame->lineno = lineno;
+	frame->filename = filename;
+}
+
+/*
+ * elog_finish --- finish up for old-style API: format fmt, then call errfinish()
+ */
+void
+elog_finish(int elevel, const char *fmt,...)
+{
+	ErrorData  *edata = &errordata[errordata_stack_depth];
+	MemoryContext oldcontext;
+
+	CHECK_STACK_DEPTH();
+
+	/*
+	 * Pop elog_start()'s frame; errstart() re-pushes it if we should report.
+	 */
+	errordata_stack_depth--;
+	errno = edata->saved_errno;	/* errstart() records errno into the new frame */
+	if (!errstart(elevel, edata->filename, edata->lineno, edata->funcname, NULL))
+		return;					/* nothing to do */
+
+	/*
+	 * Format error message just like errmsg_internal().
+	 */
+	recursion_depth++;
+	oldcontext = MemoryContextSwitchTo(ErrorContext);
+
+	EVALUATE_MESSAGE(message, false, false);
+
+	MemoryContextSwitchTo(oldcontext);
+	recursion_depth--;
+
+	/*
+	 * And let errfinish() finish up.
+	 */
+	errfinish(0);
+}
+
+/*
+ * EmitErrorReport --- deliver the top-of-stack error message
+ *
+ * errfinish calls this for every severity except ERROR; per the original
+ * note, ERROR reports are emitted from GTM_ThreadMain (or not at all, if
+ * somebody catches the error).  argp is the Port to notify, or NULL.
+ */
+void
+EmitErrorReport(void *argp)
+{
+	Port	   *client_port = (Port *) argp;
+	ErrorData  *frame = &errordata[errordata_stack_depth];
+	MemoryContext saved_cxt;
+	bool		notify_client;
+
+	recursion_depth++;
+	CHECK_STACK_DEPTH();
+	saved_cxt = MemoryContextSwitchTo(ErrorContext);
+
+	if (frame->output_to_server)
+		send_message_to_server_log(frame);
+
+	notify_client = frame->output_to_client && client_port != NULL;
+	if (notify_client)
+		send_message_to_frontend(client_port, frame);
+
+	MemoryContextSwitchTo(saved_cxt);
+	recursion_depth--;
+}
+
+/*
+ * FlushErrorState --- flush the error state after error recovery
+ *
+ * An error handler calls this once it has finished with the error, or
+ * right after CopyErrorData if it is about to do something that could
+ * raise another one.  Until this has run, the caller is still "inside"
+ * the error subsystem.
+ */
+void
+FlushErrorState(void)
+{
+	recursion_depth = 0;
+
+	/*
+	 * Empty the stack.  It can only be more than one deep if an error
+	 * interrupted the construction of another message; that construction
+	 * is assumed to have been abandoned for good.
+	 */
+	errordata_stack_depth = -1;
+	/* Delete all data in ErrorContext */
+	MemoryContextResetAndDeleteChildren(ErrorContext);
+}
+
+
+
+/*
+ * pg_re_throw --- out-of-line implementation of PG_RE_THROW() macro
+ */
+void
+pg_re_throw(void)
+{
+	/* If possible, throw the error to the next outer setjmp handler */
+	if (PG_exception_stack != NULL)
+		siglongjmp(*PG_exception_stack, 1);
+	else
+	{
+		/*
+		 * If we get here, elog(ERROR) was thrown inside a PG_TRY block, which
+		 * we have now exited only to discover that there is no outer setjmp
+		 * handler to pass the error to.  Had the error been thrown outside
+		 * the block to begin with, we'd have promoted the error to FATAL, so
+		 * the correct behavior is to make it FATAL now; that is, emit it and
+		 * let errfinish() terminate the thread (its FATAL path).
+		 */
+		ErrorData  *edata = &errordata[errordata_stack_depth];
+
+		Assert(errordata_stack_depth >= 0);
+		Assert(edata->elevel == ERROR);
+		edata->elevel = FATAL;
+
+		/*
+		 * At least in principle, the increase in severity could have changed
+		 * where-to-output decisions, so recalculate.  This should stay in
+		 * sync with errstart(), which see for comments.
+		 */
+		edata->output_to_server = is_log_level_output(FATAL,
+													  log_min_messages);
+		edata->output_to_client = true;
+		errfinish(0);
+	}
+
+	/* We mustn't return... */
+	ExceptionalCondition("pg_re_throw tried to return", "FailedAssertion",
+						 __FILE__, __LINE__);
+
+	/*
+	 * Since ExceptionalCondition isn't declared noreturn because of
+	 * TrapMacro(), we need this to keep gcc from complaining.
+	 */
+	abort();
+}
+
+
+/*
+ * Initialization of error output file: redirect stderr to GTMLogFile, if set
+ */
+void
+DebugFileOpen(void)
+{
+	int			fd,
+				istty;
+
+	if (GTMLogFile != NULL && GTMLogFile[0] != '\0')
+	{
+		/*
+		 * A debug-output file name was given.  (GTMLogFile is NULL until
+		 * somebody sets it, hence the pointer test above.)  Make sure we
+		 * can write the file, and find out if it's a tty.
+		 */
+		if ((fd = open(GTMLogFile, O_CREAT | O_APPEND | O_WRONLY,
+					   0666)) < 0)
+			ereport(FATAL,
+					(errno,
+				  errmsg("could not open file \"%s\": %m", GTMLogFile)));
+		istty = isatty(fd);
+		close(fd);
+
+		/*
+		 * Redirect our stderr to the debug output file.
+		 */
+		if (!freopen(GTMLogFile, "a", stderr))
+			ereport(FATAL,
+					(errno,
+					 errmsg("could not reopen file \"%s\" as stderr: %m",
+							GTMLogFile)));
+
+		/*
+		 * If the file is a tty, try to send stdout there as well (if it
+		 * isn't a tty then stderr will block out stdout, so we may as well
+		 * let stdout go wherever it was going before).  A failure here is
+		 * FATAL, like the stderr redirection above.
+		 */
+		if (istty)
+			if (!freopen(GTMLogFile, "a", stdout))
+				ereport(FATAL,
+						(errno,
+						 errmsg("could not reopen file \"%s\" as stdout: %m",
+								GTMLogFile)));
+	}
+}
+
+/*
+ * send_message_to_server_log --- format one report for the server log
+ *
+ * The report is assembled in a StringInfo and written to stderr in one
+ * write() call when Log_destination includes LOG_DESTINATION_STDERR.
+ */
+static void
+send_message_to_server_log(ErrorData *edata)
+{
+	StringInfoData line;
+	const char *primary;
+	const char *detail;
+
+	/* stock text stands in for a missing message; detail_log beats detail */
+	primary = edata->message ? edata->message : _("missing error text");
+	detail = edata->detail_log ? edata->detail_log : edata->detail;
+
+	initStringInfo(&line);
+	formatted_log_time[0] = '\0';
+
+	/* first line: prefix, severity, primary message */
+	log_line_prefix(&line);
+	appendStringInfo(&line, "%s:  ", error_severity(edata->elevel));
+	append_with_tabs(&line, primary);
+	appendStringInfoChar(&line, '\n');
+
+	/* each optional field gets a prefixed line of its own */
+	if (detail != NULL)
+	{
+		log_line_prefix(&line);
+		appendStringInfoString(&line, _("DETAIL:  "));
+		append_with_tabs(&line, detail);
+		appendStringInfoChar(&line, '\n');
+	}
+	if (edata->hint != NULL)
+	{
+		log_line_prefix(&line);
+		appendStringInfoString(&line, _("HINT:  "));
+		append_with_tabs(&line, edata->hint);
+		appendStringInfoChar(&line, '\n');
+	}
+	if (edata->context != NULL)
+	{
+		log_line_prefix(&line);
+		appendStringInfoString(&line, _("CONTEXT:  "));
+		append_with_tabs(&line, edata->context);
+		appendStringInfoChar(&line, '\n');
+	}
+
+	/*
+	 * Source location, written without a line prefix.  Assumes funcname
+	 * and filename contain no newlines.
+	 */
+	if (edata->filename != NULL)
+	{
+		if (edata->funcname != NULL)
+			appendStringInfo(&line, _("LOCATION:  %s, %s:%d\n"),
+							 edata->funcname, edata->filename,
+							 edata->lineno);
+		else
+			appendStringInfo(&line, _("LOCATION:  %s:%d\n"),
+							 edata->filename, edata->lineno);
+	}
+
+	/* Write to stderr, if enabled */
+	if ((Log_destination & LOG_DESTINATION_STDERR) != 0)
+		write(fileno(stderr), line.data, line.len);
+
+	pfree(line.data);
+}
+
+/*
+ * send_message_to_frontend --- ship an error or notice to the client
+ */
+static void
+send_message_to_frontend(Port *myport, ErrorData *edata)
+{
+	StringInfoData out;
+	char		msgtype;
+	const char *primary;
+
+	/* errors travel as 'E'; anything below ERROR is a notice, 'N' */
+	msgtype = (edata->elevel >= ERROR) ? 'E' : 'N';
+	pq_beginmessage(&out, msgtype);
+
+	/* a proxy connection gets the GTM proxy header ahead of the fields */
+	if (myport->is_proxy)
+	{
+		GTM_ProxyMsgHeader proxyhdr;
+
+		/* NOTE(review): only ph_conid is set -- confirm there are no other members */
+		proxyhdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&out, (char *) &proxyhdr, sizeof(GTM_ProxyMsgHeader));
+	}
+
+	pq_sendbyte(&out, PG_DIAG_SEVERITY);
+	pq_sendstring(&out, error_severity(edata->elevel));
+
+	/* M field is required per protocol, so fall back to a stock text */
+	primary = edata->message ? edata->message : _("missing error text");
+	pq_sendbyte(&out, PG_DIAG_MESSAGE_PRIMARY);
+	pq_sendstring(&out, primary);
+
+	/* optional fields; detail_log is intentionally not sent to the client */
+	if (edata->detail != NULL)
+	{
+		pq_sendbyte(&out, PG_DIAG_MESSAGE_DETAIL);
+		pq_sendstring(&out, edata->detail);
+	}
+	if (edata->hint != NULL)
+	{
+		pq_sendbyte(&out, PG_DIAG_MESSAGE_HINT);
+		pq_sendstring(&out, edata->hint);
+	}
+
+	pq_sendbyte(&out, '\0');		/* terminator */
+	pq_endmessage(myport, &out);
+
+	/*
+	 * Flush right away.  This may be redundant when the caller flushes
+	 * pending output itself (the original note says the main loop
+	 * normally does), but it gives the client some clue what happened if
+	 * the server dies before getting that far.  Error and notice messages
+	 * are not a performance-critical path, so an extra flush won't hurt
+	 * much.
+	 */
+	pq_flush(myport);
+}
+
+/*
+ * Support routines for formatting error messages.
+ */
+
+
+/*
+ * expand_fmt_string --- expand %m in a format string
+ *
+ * vsnprintf has no notion of %m, so each %m is replaced here by the
+ * strerror text for edata->saved_errno.  Everything else, including every
+ * other % sequence, is copied through untouched.  The result is a palloc'd
+ * string.
+ */
+static char *
+expand_fmt_string(const char *fmt, ErrorData *edata)
+{
+	StringInfoData out;
+	const char *src = fmt;
+
+	initStringInfo(&out);
+
+	while (*src != '\0')
+	{
+		char		ch = *src++;
+		const char *errtext;
+
+		/* Ordinary character, or a lone '%' ending the string: copy it */
+		if (ch != '%' || *src == '\0')
+		{
+			appendStringInfoCharMacro(&out, ch);
+			continue;
+		}
+
+		ch = *src++;
+		if (ch != 'm')
+		{
+			/* copy % and next char --- this avoids trouble with %%m */
+			appendStringInfoCharMacro(&out, '%');
+			appendStringInfoCharMacro(&out, ch);
+			continue;
+		}
+
+		/*
+		 * Substitute the system error string, doubling any % it contains so
+		 * that vsnprintf won't misinterpret it.
+		 */
+		for (errtext = useful_strerror(edata->saved_errno); *errtext; errtext++)
+		{
+			if (*errtext == '%')
+				appendStringInfoCharMacro(&out, '%');
+			appendStringInfoCharMacro(&out, *errtext);
+		}
+	}
+
+	return out.data;
+}
+
+
+/*
+ * useful_strerror --- strerror() that never yields an empty description
+ *
+ * Returns either the C library's message or a generic one naming errnum.
+ * NOTE(review): the generic text lives in a single static buffer, so a
+ * later fallback overwrites an earlier result; confirm that is acceptable
+ * for concurrent callers.
+ */
+static const char *
+useful_strerror(int errnum)
+{
+	/* this buffer is only used if errno has a bogus value */
+	static char errorstr_buf[48];
+	const char *msg = strerror(errnum);
+
+	/* A non-empty description from the C library is returned unchanged */
+	if (msg != NULL && *msg != '\0')
+		return msg;
+
+	/* Otherwise substitute a generic message carrying the error number */
+	snprintf(errorstr_buf, sizeof(errorstr_buf),
+	/*------
+	  translator: This string will be truncated at 47
+	  characters expanded. */
+			 _("operating system error %d"), errnum);
+
+	return errorstr_buf;
+}
+
+
+/*
+ * error_severity --- get localized string representing elevel
+ *
+ * All five DEBUG levels share one label, as do LOG and COMMERROR.  An
+ * unrecognized level yields the untranslated placeholder "???".
+ */
+static const char *
+error_severity(int elevel)
+{
+	switch (elevel)
+	{
+		case DEBUG1:
+		case DEBUG2:
+		case DEBUG3:
+		case DEBUG4:
+		case DEBUG5:
+			return _("DEBUG");
+
+		case LOG:
+		case COMMERROR:
+			return _("LOG");
+
+		case INFO:
+			return _("INFO");
+
+		case NOTICE:
+			return _("NOTICE");
+
+		case WARNING:
+			return _("WARNING");
+
+		case ERROR:
+			return _("ERROR");
+
+		case ERROR2:
+			return _("ERROR2");
+
+		case FATAL:
+			return _("FATAL");
+
+		case PANIC:
+			return _("PANIC");
+
+		default:
+			break;
+	}
+
+	return "???";
+}
+
+
+/*
+ *	append_with_tabs
+ *
+ *	Copy 'str' onto the end of 'buf', emitting a tab character immediately
+ *	after every newline that is copied.
+ */
+static void
+append_with_tabs(StringInfo buf, const char *str)
+{
+	const char *p;
+
+	for (p = str; *p != '\0'; p++)
+	{
+		appendStringInfoCharMacro(buf, *p);
+		if (*p == '\n')
+			appendStringInfoCharMacro(buf, '\t');
+	}
+}
+
+
+/*
+ * write_stderr --- printf-style output straight to stderr
+ *
+ * Used before ereport/elog can be used safely (memory context, GUC load
+ * etc).  The format string is passed through _() before formatting.
+ */
+void
+write_stderr(const char *fmt,...)
+{
+	va_list		args;
+	const char *localized = _(fmt);
+
+	va_start(args, fmt);
+	/* On Unix, we just fprintf to stderr */
+	vfprintf(stderr, localized, args);
+	va_end(args);
+
+	fflush(stderr);
+}
+
+
+/*
+ * is_log_level_output -- is elevel logically >= log_min_level?
+ *
+ * For this test LOG (and COMMERROR, which is handled identically) sorts
+ * out-of-order, between ERROR and FATAL.  Generally this is the right thing
+ * for testing whether a message should go to the server log, whereas a
+ * simple >= test is correct for testing whether the message should go to
+ * the client.
+ *
+ *	elevel:			level of the message being considered
+ *	log_min_level:	threshold the message is tested against
+ *
+ * Returns true when the message qualifies for output.
+ */
+static bool
+is_log_level_output(int elevel, int log_min_level)
+{
+	/* A LOG-class message passes at threshold LOG or anything up to ERROR */
+	if (elevel == LOG || elevel == COMMERROR)
+		return (log_min_level == LOG || log_min_level <= ERROR);
+
+	/* Threshold is LOG, message is not: only FATAL and above get through */
+	if (log_min_level == LOG)
+		return (elevel >= FATAL);
+
+	/* Neither is LOG: ordinary numeric comparison */
+	return (elevel >= log_min_level);
+}
diff --git a/src/gtm/common/gtm_list.c b/src/gtm/common/gtm_list.c
new file mode 100644
index 0000000000..3ea2ce76cb
--- /dev/null
+++ b/src/gtm/common/gtm_list.c
@@ -0,0 +1,863 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_list.c
+ *	  implementation for PostgreSQL generic linked list package
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/src/backend/nodes/list.c,v 1.70 2008/08/14 18:47:58 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "gtm/gtm_c.h"
+#include "gtm/gtm.h"
+#include "gtm/gtm_list.h"
+#include "gtm/memutils.h"
+#include "gtm/assert.h"
+
+#define equal(a, b)		((a) == (b))
+
+
+#ifdef USE_ASSERT_CHECKING
+/*
+ * Sanity-check a List's header fields; NIL is always considered valid.
+ */
+static void
+check_list_invariants(List *list)
+{
+	if (list != NIL)
+	{
+		Assert(list->length > 0);
+		Assert(list->head != NULL);
+		Assert(list->tail != NULL);
+		/* lists of one or two cells can be verified completely */
+		if (list->length == 1)
+			Assert(list->head == list->tail);
+		if (list->length == 2)
+			Assert(list->head->next == list->tail);
+		Assert(list->tail->next == NULL);
+	}
+}
+#else
+#define check_list_invariants(l)
+#endif   /* USE_ASSERT_CHECKING */
+
+/*
+ * Allocate a one-cell List and return it.  An empty non-NIL list is not a
+ * valid representation, so the head cell is allocated here as well; the
+ * caller must fill in that cell's data, which is left undefined.
+ */
+static List *
+new_list(void)
+{
+	ListCell   *cell;
+	List	   *result;
+
+	cell = (ListCell *) palloc(sizeof(*cell));
+	cell->next = NULL;
+	/* cell->data is left undefined! */
+
+	result = (List *) palloc(sizeof(*result));
+	result->length = 1;
+	result->head = cell;
+	result->tail = cell;
+
+	return result;
+}
+
+/*
+ * Push a freshly allocated cell onto the front of 'list', which must not
+ * be NIL.
+ *
+ * The new head cell's data is undefined; the caller must fill it in.
+ */
+static void
+new_head_cell(List *list)
+{
+	ListCell   *cell;
+
+	cell = (ListCell *) palloc(sizeof(*cell));
+	/* the old head goes behind the new cell, which then becomes the head */
+	cell->next = list->head;
+	list->head = cell;
+
+	list->length++;
+}
+
+/*
+ * Hang a freshly allocated cell off the end of 'list', which must not be
+ * NIL.
+ *
+ * The new tail cell's data is undefined; the caller must fill it in.
+ */
+static void
+new_tail_cell(List *list)
+{
+	ListCell   *cell;
+
+	cell = (ListCell *) palloc(sizeof(*cell));
+	cell->next = NULL;
+
+	/* chain it after the current tail, then make it the tail */
+	list->tail->next = cell;
+	list->tail = cell;
+	list->length++;
+}
+
+/*
+ * Append a pointer to the list and return the resulting list.
+ *
+ * A NIL input produces a brand-new list; otherwise the input list is
+ * extended in place.  Callers should always use the return value rather
+ * than continuing to use the pointer passed as the first argument.
+ */
+List *
+lappend(List *list, void *datum)
+{
+	if (list != NIL)
+		new_tail_cell(list);
+	else
+		list = new_list();
+
+	lfirst(list->tail) = datum;
+	check_list_invariants(list);
+	return list;
+}
+
+/*
+ * Insert a new, data-less cell into 'list' directly after 'prev_cell' and
+ * return it.  'list' must be non-NIL and 'prev_cell' must be a non-NULL
+ * member of it.  The caller is responsible for filling in the cell's data.
+ */
+static ListCell *
+add_new_cell(List *list, ListCell *prev_cell)
+{
+	ListCell   *inserted;
+
+	inserted = (ListCell *) palloc(sizeof(*inserted));
+	/* inserted->data is left undefined! */
+	inserted->next = prev_cell->next;
+	prev_cell->next = inserted;
+
+	/* inserting after the old tail makes the new cell the tail */
+	if (prev_cell == list->tail)
+		list->tail = inserted;
+
+	list->length++;
+
+	return inserted;
+}
+
+/*
+ * Insert 'datum' into 'list' (which must be non-NIL) in a new cell placed
+ * right after 'prev' (which must be non-NULL and a member of 'list').
+ *
+ * Returns the newly constructed cell.
+ */
+ListCell *
+lappend_cell(List *list, ListCell *prev, void *datum)
+{
+	ListCell   *inserted = add_new_cell(list, prev);
+
+	lfirst(inserted) = datum;
+	check_list_invariants(list);
+
+	return inserted;
+}
+
+/*
+ * Prepend a new element to the list and return the resulting list.
+ *
+ * A NIL input produces a brand-new list; otherwise the input list is
+ * modified in place.  Callers should always use the return value rather
+ * than continuing to use the pointer passed as the second argument.
+ */
+List *
+lcons(void *datum, List *list)
+{
+	if (list != NIL)
+		new_head_cell(list);
+	else
+		list = new_list();
+
+	lfirst(list->head) = datum;
+	check_list_invariants(list);
+	return list;
+}
+
+/*
+ * Concatenate list2 to the end of list1, and return the result.  list1 is
+ * destructively changed, and callers must use the return value: it is
+ * list2 when list1 is NIL, and list1 otherwise.
+ *
+ * The cells of list2 are linked onto list1 in place rather than copied, so
+ * afterwards the two lists share storage.  Consequently, invoking
+ * list_free() on list2 will also invalidate a portion of list1.
+ */
+List *
+list_concat(List *list1, List *list2)
+{
+	/* with one side empty, the other side is already the answer */
+	if (list1 == NIL)
+		return list2;
+	if (list2 == NIL)
+		return list1;
+
+	if (list1 == list2)
+		elog(ERROR, "cannot list_concat() a list to itself");
+
+	/* splice list2's cells after list1's tail and adopt list2's tail */
+	list1->tail->next = list2->head;
+	list1->tail = list2->tail;
+	list1->length += list2->length;
+
+	check_list_invariants(list1);
+	return list1;
+}
+
+/*
+ * Truncate 'list' to contain no more than 'new_size' elements.  The list
+ * is modified in place, but callers should still use the returned pointer:
+ * a non-positive 'new_size' yields NIL.  Asking for at least as many
+ * elements as the list already has leaves it unchanged.
+ *
+ * Note that any cells removed by list_truncate() are NOT pfree'd.
+ */
+List *
+list_truncate(List *list, int new_size)
+{
+	ListCell   *cell;
+	int			pos;
+
+	if (new_size <= 0)
+		return NIL;				/* truncate to zero length */
+
+	/* If asked to effectively extend the list, do nothing */
+	if (new_size >= list_length(list))
+		return list;
+
+	pos = 0;
+	foreach(cell, list)
+	{
+		/* pos is the 1-based position of 'cell' */
+		if (++pos != new_size)
+			continue;
+
+		/* 'cell' becomes the last cell; whatever followed is unlinked */
+		cell->next = NULL;
+		list->tail = cell;
+		list->length = new_size;
+		check_list_invariants(list);
+		return list;
+	}
+
+	/* keep the compiler quiet; never reached */
+	Assert(false);
+	return list;
+}
+
+/*
+ * Locate the n'th cell (counting from 0) of the list.  It is an assertion
+ * failure if there is no such cell.
+ */
+static ListCell *
+list_nth_cell(List *list, int n)
+{
+	ListCell   *cursor;
+
+	Assert(list != NIL);
+	Assert(n >= 0);
+	Assert(n < list->length);
+	check_list_invariants(list);
+
+	/* The last cell is reachable directly, without walking the list */
+	if (n == list->length - 1)
+		return list->tail;
+
+	cursor = list->head;
+	while (n-- > 0)
+		cursor = cursor->next;
+	return cursor;
+}
+
+/*
+ * Fetch the data pointer held in the n'th cell of 'list' (0-based).
+ */
+void *
+list_nth(List *list, int n)
+{
+	ListCell   *cell = list_nth_cell(list, n);
+	return lfirst(cell);
+}
+
+/*
+ * Return true iff 'datum' is a member of the list.  Membership is tested
+ * with equal(), which this file defines as plain pointer comparison, so
+ * the result is the same as list_member_ptr().
+ */
+bool
+list_member(List *list, void *datum)
+{
+	ListCell   *cell;
+
+	check_list_invariants(list);
+
+	foreach(cell, list)
+	{
+		void	   *member = lfirst(cell);
+		if (equal(member, datum))
+			return true;
+	}
+	return false;
+}
+
+/*
+ * Return true iff some cell of the list holds exactly the pointer 'datum'
+ * (simple pointer comparison).
+ */
+bool
+list_member_ptr(List *list, void *datum)
+{
+	ListCell   *cell;
+
+	check_list_invariants(list);
+
+	foreach(cell, list)
+	{
+		if (datum != lfirst(cell))
+			continue;
+		return true;
+	}
+	return false;
+}
+
+/*
+ * Unlink 'cell' from 'list' and pfree it.  'prev' must be the cell that
+ * precedes 'cell' in 'list', or NULL iff 'cell' is the list head.
+ *
+ * Removing the only cell frees the whole list and returns NIL; otherwise
+ * the (modified) input list is returned.
+ *
+ * Only the cell itself is freed, not the object its data points to.
+ */
+List *
+list_delete_cell(List *list, ListCell *cell, ListCell *prev)
+{
+	check_list_invariants(list);
+	Assert(prev != NULL ? lnext(prev) == cell : list_head(list) == cell);
+
+	/*
+	 * NIL is the only valid representation of a zero-length list, so when
+	 * the last remaining cell goes, the List header goes with it.
+	 */
+	if (list->length == 1)
+	{
+		list_free(list);
+		return NIL;
+	}
+
+	/* Bypass the doomed cell: either via its predecessor or via the head */
+	if (prev == NULL)
+		list->head = cell->next;
+	else
+		prev->next = cell->next;
+
+	/* If it was the tail, the predecessor becomes the tail */
+	if (cell == list->tail)
+		list->tail = prev;
+
+	list->length--;
+
+	pfree(cell);
+	return list;
+}
+
+/*
+ * Delete the first cell in list whose data matches datum, if any.  Matching
+ * uses equal(), which this file defines as pointer comparison.
+ */
+List *
+list_delete(List *list, void *datum)
+{
+	ListCell   *prev = NULL;
+	ListCell   *cell;
+
+	check_list_invariants(list);
+
+	foreach(cell, list)
+	{
+		if (!equal(lfirst(cell), datum))
+		{
+			prev = cell;
+			continue;
+		}
+		return list_delete_cell(list, cell, prev);
+	}
+	/* Didn't find a match: return the list unmodified */
+	return list;
+}
+
+/* Delete the first cell whose data pointer is 'datum', if there is one */
+List *
+list_delete_ptr(List *list, void *datum)
+{
+	ListCell   *prev = NULL;
+	ListCell   *cell;
+
+	check_list_invariants(list);
+
+	foreach(cell, list)
+	{
+		if (lfirst(cell) != datum)
+		{
+			prev = cell;
+			continue;
+		}
+		return list_delete_cell(list, cell, prev);
+	}
+	/* Didn't find a match: return the list unmodified */
+	return list;
+}
+
+
+/*
+ * Delete the first element of the list and return the resulting list.
+ * A NIL input is returned as NIL.
+ *
+ * This replaces the Lisp-y idiom "list = lnext(list);" where the intent is
+ * to alter the list rather than just traverse it.  Beware that the removed
+ * cell is freed here, whereas the lnext() coding leaves it intact.
+ */
+List *
+list_delete_first(List *list)
+{
+	check_list_invariants(list);
+
+	/* would an error be better? */
+	if (list != NIL)
+		list = list_delete_cell(list, list_head(list), NULL);
+	return list;
+}
+
+/*
+ * Generate the union of two lists: a copy of list1 (via list_copy())
+ * extended with every member of list2 that is not already present in the
+ * result.
+ *
+ * Membership is tested with equal(), which this file defines as pointer
+ * comparison.
+ *
+ * The returned list is newly-allocated, although the content of the
+ * cells is the same (i.e. any pointed-to objects are not copied).
+ *
+ * NB: this function will NOT remove any duplicates that are present
+ * in list1 (so it only performs a "union" if list1 is known unique to
+ * start with).  Also, if you are about to write "x = list_union(x, y)"
+ * you probably want to use list_concat_unique() instead to avoid wasting
+ * the list cells of the old x list.
+ *
+ * Each candidate is checked with a linear scan of the result, so the cost
+ * grows with the product of the list lengths.
+ */
+List *
+list_union(List *list1, List *list2)
+{
+	List	   *merged = list_copy(list1);
+	ListCell   *cell;
+
+	foreach(cell, list2)
+	{
+		void	   *candidate = lfirst(cell);
+
+		if (!list_member(merged, candidate))
+			merged = lappend(merged, candidate);
+	}
+	check_list_invariants(merged);
+	return merged;
+}
+
+/*
+ * This variant of list_union() determines duplicates via simple
+ * pointer comparison (list_member_ptr()).
+ */
+List *
+list_union_ptr(List *list1, List *list2)
+{
+	List	   *merged = list_copy(list1);
+	ListCell   *cell;
+
+	foreach(cell, list2)
+	{
+		void	   *candidate = lfirst(cell);
+
+		if (list_member_ptr(merged, candidate))
+			continue;
+		merged = lappend(merged, candidate);
+	}
+	check_list_invariants(merged);
+	return merged;
+}
+
+/*
+ * Return a list of the members of list1 that also appear in list2.  The
+ * returned list is freshly allocated via palloc(), but its cells point to
+ * the same objects as the cells of list1.
+ *
+ * Duplicate entries in list1 will not be suppressed, so it's only a true
+ * "intersection" if list1 is known unique beforehand.
+ *
+ * Membership is tested with equal(), which this file defines as pointer
+ * comparison.
+ *
+ * The result is NIL when either input is NIL or when nothing is shared.
+ */
+List *
+list_intersection(List *list1, List *list2)
+{
+	List	   *common = NIL;
+	ListCell   *cell;
+
+	if (list1 == NIL || list2 == NIL)
+		return NIL;
+
+	foreach(cell, list1)
+	{
+		void	   *candidate = lfirst(cell);
+
+		if (list_member(list2, candidate))
+			common = lappend(common, candidate);
+	}
+	check_list_invariants(common);
+	return common;
+}
+
+/*
+ * Return a list of the members of list1 that do not appear in list2.  The
+ * returned list is freshly allocated via palloc(), but its cells point to
+ * the same objects as the cells of list1.
+ *
+ * Membership is tested with equal(), which this file defines as pointer
+ * comparison.  With list2 empty the result is simply a copy of
+ * list1.
+ */
+List *
+list_difference(List *list1, List *list2)
+{
+	List	   *remaining = NIL;
+	ListCell   *cell;
+
+	if (list2 == NIL)
+		return list_copy(list1);
+
+	foreach(cell, list1)
+	{
+		void	   *candidate = lfirst(cell);
+		if (!list_member(list2, candidate))
+			remaining = lappend(remaining, candidate);
+	}
+	check_list_invariants(remaining);
+	return remaining;
+}
+
+/*
+ * This variant of list_difference() determines list membership via
+ * simple pointer equality (list_member_ptr()).
+ */
+List *
+list_difference_ptr(List *list1, List *list2)
+{
+	List	   *remaining = NIL;
+	ListCell   *cell;
+
+	if (list2 == NIL)
+		return list_copy(list1);
+
+	foreach(cell, list1)
+	{
+		if (list_member_ptr(list2, lfirst(cell)))
+			continue;
+		remaining = lappend(remaining, lfirst(cell));
+	}
+	check_list_invariants(remaining);
+	return remaining;
+}
+
+/*
+ * Append datum to list, but only if it isn't already in the list.
+ *
+ * Membership is tested with equal(), which this file defines as pointer
+ * comparison.
+ */
+List *
+list_append_unique(List *list, void *datum)
+{
+	if (!list_member(list, datum))
+		list = lappend(list, datum);
+
+	return list;
+}
+
+/*
+ * This variant of list_append_unique() determines list membership via
+ * simple pointer equality (list_member_ptr()).
+ */
+List *
+list_append_unique_ptr(List *list, void *datum)
+{
+	if (list_member_ptr(list, datum))
+		return list;
+
+	return lappend(list, datum);
+}
+
+/*
+ * Append to list1 each member of list2 that isn't already in list1.
+ *
+ * Membership is tested with equal(), which this file defines as pointer
+ * comparison.
+ *
+ * This is almost the same functionality as list_union(), but list1 is
+ * modified in-place rather than being copied.  Note also that list2's cells
+ * are not inserted in list1, so the analogy to list_concat() isn't perfect.
+ */
+List *
+list_concat_unique(List *list1, List *list2)
+{
+	ListCell   *cell;
+
+	foreach(cell, list2)
+	{
+		void	   *candidate = lfirst(cell);
+		if (!list_member(list1, candidate))
+			list1 = lappend(list1, candidate);
+	}
+	check_list_invariants(list1);
+	return list1;
+}
+
+/*
+ * This variant of list_concat_unique() determines list membership via
+ * simple pointer equality (list_member_ptr()).
+ */
+List *
+list_concat_unique_ptr(List *list1, List *list2)
+{
+	ListCell   *cell;
+
+	foreach(cell, list2)
+	{
+		if (list_member_ptr(list1, lfirst(cell)))
+			continue;
+		list1 = lappend(list1, lfirst(cell));
+	}
+	check_list_invariants(list1);
+	return list1;
+}
+
+/*
+ * Free all storage in a list, and optionally the pointed-to elements
+ */
+static void
+list_free_private(List *list, bool deep)
+{
+	ListCell   *cell;
+	ListCell   *next;
+
+	check_list_invariants(list);
+
+	for (cell = list_head(list); cell != NULL; cell = next)
+	{
+		/* fetch the successor before the cell is released */
+		next = lnext(cell);
+
+		if (deep)
+			pfree(lfirst(cell));
+		pfree(cell);
+	}
+
+	if (list)
+		pfree(list);
+}
+
+/*
+ * Free the List header and every cell of the list.  The objects that the
+ * cells point to are NOT freed.
+ * On return, the argument to this function has been freed, so the
+ * caller would be wise to set it to NIL for safety's sake.
+ */
+void
+list_free(List *list)
+{
+	const bool	deep = false;	/* leave the pointed-to objects alone */
+
+	list_free_private(list, deep);
+}
+
+/*
+ * Free the List header, every cell of the list, and every object a cell
+ * points to.  Each cell must therefore hold a pointer to a palloc()'d
+ * region of memory.
+ *
+ * On return, the argument to this function has been freed, so the
+ * caller would be wise to set it to NIL for safety's sake.
+ */
+void
+list_free_deep(List *list)
+{
+	/* A "deep" free operation only makes sense on a list of pointers */
+	const bool	deep = true;
+
+	list_free_private(list, deep);
+}
+
+/*
+ * Return a shallow copy of the specified list: the List header and all
+ * cells are newly allocated, but each cell's data is copied as-is, so the
+ * copy refers to the same objects as the original.  The copy of NIL is
+ * NIL.
+ */
+List *
+list_copy(List *oldlist)
+{
+	List	   *copy;
+	ListCell   *dst;
+	ListCell   *src;
+
+	if (oldlist == NIL)
+		return NIL;
+
+	/*
+	 * new_list() supplies the header together with the first cell, so only
+	 * that cell's data needs to be filled in here.
+	 */
+	copy = new_list();
+	copy->length = oldlist->length;
+	copy->head->data = oldlist->head->data;
+
+	/* Duplicate each remaining source cell and chain it after 'dst' */
+	dst = copy->head;
+	for (src = oldlist->head->next; src != NULL; src = src->next)
+	{
+		ListCell   *fresh = (ListCell *) palloc(sizeof(*fresh));
+
+		fresh->data = src->data;
+		dst->next = fresh;
+		dst = fresh;
+	}
+
+	/* 'dst' is now the last cell of the copy */
+	dst->next = NULL;
+	copy->tail = dst;
+
+	check_list_invariants(copy);
+	return copy;
+}
+
+/*
+ * Return a shallow copy of the specified list, without the first N elements.
+ *
+ * A negative 'nskip' is treated as zero.  The result is NIL when the list
+ * is NIL or when 'nskip' is at least the list's length.  As with
+ * list_copy(), only the List header and cells are new; the cells' data is
+ * copied as-is.
+ */
+List *
+list_copy_tail(List *oldlist, int nskip)
+{
+	List	   *copy;
+	ListCell   *dst;
+	ListCell   *src;
+
+	if (nskip < 0)
+		nskip = 0;				/* would it be better to elog? */
+
+	if (oldlist == NIL || nskip >= oldlist->length)
+		return NIL;
+
+	copy = new_list();
+	copy->length = oldlist->length - nskip;
+
+	/* Step past the cells that are not wanted */
+	for (src = oldlist->head; nskip > 0; nskip--)
+		src = src->next;
+
+	/*
+	 * new_list() supplies the header together with the first cell, so the
+	 * first surviving cell only needs its data copied.
+	 */
+	copy->head->data = src->data;
+
+	/* Duplicate each remaining source cell and chain it after 'dst' */
+	dst = copy->head;
+	for (src = src->next; src != NULL; src = src->next)
+	{
+		ListCell   *fresh = (ListCell *) palloc(sizeof(*fresh));
+
+		fresh->data = src->data;
+		dst->next = fresh;
+		dst = fresh;
+	}
+
+	/* 'dst' is now the last cell of the copy */
+	dst->next = NULL;
+	copy->tail = dst;
+
+	check_list_invariants(copy);
+	return copy;
+}
+
+/*
+ * Out-of-line versions of the basic list accessors.  When using non-GCC
+ * compilers these can't be defined as inline functions in the list header,
+ * so they are defined here.  Each one treats a NIL list as empty.
+ * TODO: investigate supporting inlining for some non-GCC compilers.
+ */
+#ifndef __GNUC__
+
+ListCell *
+list_head(List *l)
+{
+	return (l == NIL) ? NULL : l->head;
+}
+
+ListCell *
+list_tail(List *l)
+{
+	return (l == NIL) ? NULL : l->tail;
+}
+
+int
+list_length(List *l)
+{
+	return (l == NIL) ? 0 : l->length;
+}
+#endif   /* ! __GNUC__ */
+
+/*
+ * Temporary compatibility functions
+ *
+ * In order to avoid warnings for these function definitions, we need
+ * to include a prototype here as well as in the list header.  That's
+ * because list API compatibility is not enabled while compiling this
+ * file, so the header's prototypes for these functions are not seen.
+ */
+
+/*
+ * Given a list, return its length.  This exists only for backward
+ * compatibility: we can't afford to define a macro called "length", so it
+ * must be a function.  New code should use list_length() directly to
+ * avoid the overhead of the extra function call.
+ */
+int			length(List *list);
+
+int
+length(List *list)
+{
+	int			count = list_length(list);
+	return count;
+}
diff --git a/src/gtm/common/gtm_lock.c b/src/gtm/common/gtm_lock.c
new file mode 100644
index 0000000000..c919730c90
--- /dev/null
+++ b/src/gtm/common/gtm_lock.c
@@ -0,0 +1,206 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_lock.c
+ *	Handling for locks in GTM
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "gtm/gtm_c.h"
+#include "gtm/gtm_lock.h"
+#include "gtm/elog.h"
+
+/*
+ * Acquire the requested lock, blocking until it is available.  Returns true
+ * on success and false if the pthread call fails.  An invalid 'mode' is
+ * reported via elog(ERROR); should that return, the result is false.
+ *
+ * TODO We should track the locks acquired in the thread specific context. If an
+ * error is thrown and caught, we don't want to keep holding to those locks
+ * since that would lead to a deadlock. Right now, we assume that the caller
+ * will appropriately catch errors and release the locks sanely.
+ */
+bool
+GTM_RWLockAcquire(GTM_RWLock *lock, GTM_LockMode mode)
+{
+	int			status = -1;
+
+	switch (mode)
+	{
+		case GTM_LOCKMODE_WRITE:
+			status = pthread_rwlock_wrlock(&lock->lk_lock);
+			break;
+		case GTM_LOCKMODE_READ:
+			status = pthread_rwlock_rdlock(&lock->lk_lock);
+			break;
+		default:
+			elog(ERROR, "Invalid lockmode");
+			break;
+	}
+
+	return (status == 0);
+}
+
+/*
+ * Release a previously acquired read/write lock.  Returns true on success
+ * and false if pthread_rwlock_unlock() reports an error.
+ */
+bool
+GTM_RWLockRelease(GTM_RWLock *lock)
+{
+	int			status = pthread_rwlock_unlock(&lock->lk_lock);
+	return (status == 0);
+}
+
+/* Initialize a read/write lock; returns pthread_rwlock_init()'s status */
+int
+GTM_RWLockInit(GTM_RWLock *lock)
+{
+	int			rc = pthread_rwlock_init(&lock->lk_lock, NULL);
+
+	return rc;
+}
+
+/* Destroy a read/write lock; returns pthread_rwlock_destroy()'s status */
+int
+GTM_RWLockDestroy(GTM_RWLock *lock)
+{
+	int			rc = pthread_rwlock_destroy(&lock->lk_lock);
+
+	return rc;
+}
+
+/*
+ * Conditionally acquire a lock.  If the lock is not available, the function
+ * immediately returns without blocking.
+ *
+ * Returns true if the lock is successfully acquired, otherwise false.  An
+ * invalid 'mode' is reported via elog(ERROR); should that return, the
+ * result is false.
+ */
+bool
+GTM_RWLockConditionalAcquire(GTM_RWLock *lock, GTM_LockMode mode)
+{
+	int			status = -1;
+
+	switch (mode)
+	{
+		case GTM_LOCKMODE_WRITE:
+			status = pthread_rwlock_trywrlock(&lock->lk_lock);
+			break;
+		case GTM_LOCKMODE_READ:
+			status = pthread_rwlock_tryrdlock(&lock->lk_lock);
+			break;
+		default:
+			elog(ERROR, "Invalid lockmode");
+			break;
+	}
+
+	return (status == 0);
+}
+
+/* Initialize a mutex lock; returns pthread_mutex_init()'s status */
+int
+GTM_MutexLockInit(GTM_MutexLock *lock)
+{
+	int			rc = pthread_mutex_init(&lock->lk_lock, NULL);
+
+	return rc;
+}
+
+/* Destroy a mutex lock; returns pthread_mutex_destroy()'s status */
+int
+GTM_MutexLockDestroy(GTM_MutexLock *lock)
+{
+	int			rc = pthread_mutex_destroy(&lock->lk_lock);
+
+	return rc;
+}
+
+/*
+ * Acquire a mutex lock, blocking until it is available.
+ *
+ * Return true if the lock is successfully acquired, else return false.
+ */
+bool
+GTM_MutexLockAcquire(GTM_MutexLock *lock)
+{
+	if (pthread_mutex_lock(&lock->lk_lock) != 0)
+		return false;
+	return true;
+}
+
+/* Release a previously acquired mutex lock; true on success, else false */
+bool
+GTM_MutexLockRelease(GTM_MutexLock *lock)
+{
+	/* pthread reports success as 0, which must not be passed on as a bool */
+	int			status = pthread_mutex_unlock(&lock->lk_lock);
+	return (status == 0);
+}
+
+/*
+ * Conditionally acquire a mutex lock.  If the lock is not available, the
+ * function immediately returns without blocking.
+ *
+ * Returns true if lock is successfully acquired. Otherwise returns false
+ */
+bool
+GTM_MutexLockConditionalAcquire(GTM_MutexLock *lock)
+{
+	if (pthread_mutex_trylock(&lock->lk_lock) != 0)
+		return false;
+	return true;
+}
+
+/* Initialize a condition variable; returns pthread_cond_init()'s status */
+int
+GTM_CVInit(GTM_CV *cv)
+{
+	int			rc = pthread_cond_init(&cv->cv_condvar, NULL);
+
+	return rc;
+}
+
+/* Destroy a condition variable; returns pthread_cond_destroy()'s status */
+int
+GTM_CVDestroy(GTM_CV *cv)
+{
+	int			rc = pthread_cond_destroy(&cv->cv_condvar);
+
+	return rc;
+}
+
+/* Wake up all the threads waiting on this condition variable */
+int
+GTM_CVBcast(GTM_CV *cv)
+{
+	int			rc = pthread_cond_broadcast(&cv->cv_condvar);
+
+	return rc;
+}
+
+/* Signal this condition variable via pthread_cond_signal() */
+int
+GTM_CVSignal(GTM_CV *cv)
+{
+	int			rc = pthread_cond_signal(&cv->cv_condvar);
+
+	return rc;
+}
+
+/*
+ * Wait on a condition variable.  The caller must already hold 'lock'.
+ */
+int
+GTM_CVWait(GTM_CV *cv, GTM_MutexLock *lock)
+{
+	int			rc = pthread_cond_wait(&cv->cv_condvar, &lock->lk_lock);
+	return rc;
+}
+
diff --git a/src/gtm/common/mcxt.c b/src/gtm/common/mcxt.c
new file mode 100644
index 0000000000..9325ae3c82
--- /dev/null
+++ b/src/gtm/common/mcxt.c
@@ -0,0 +1,763 @@
+/*-------------------------------------------------------------------------
+ *
+ * mcxt.c
+ *	  POSTGRES memory context management code.
+ *
+ * This module handles context management operations that are independent
+ * of the particular kind of context being operated on.  It calls
+ * context-type-specific operations via the function pointers in a
+ * context's MemoryContextMethods struct.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/src/backend/utils/mmgr/mcxt.c,v 1.65 2008/06/28 16:45:22 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+
+#include "gtm/gtm_c.h"
+#include "gtm/memutils.h"
+#include "gtm/elog.h"
+#include "gtm/assert.h"
+#include "gtm/gtm.h"
+
+
+/*****************************************************************************
+ *	  GLOBAL MEMORY															 *
+ *****************************************************************************/
+
+/*
+ * Standard top-level contexts. For a description of the purpose of each
+ * of these contexts, refer to src/backend/utils/mmgr/README
+ */
+
+static void MemoryContextStatsInternal(MemoryContext context, int level);
+static void MemoryContextDeleteInternal(MemoryContext context, bool parent_locked);
+
+MemoryContext	TopMostMemoryContext;	/* set to TopMemoryContext by MemoryContextInit() */
+
+/*****************************************************************************
+ *	  EXPORTED ROUTINES														 *
+ *****************************************************************************/
+
+
+/*
+ * MemoryContextInit
+ *		Start up the memory-context subsystem.
+ *
+ * This must be called before creating contexts or allocating memory in
+ * contexts.  TopMemoryContext and ErrorContext are initialized here;
+ * other contexts must be created afterwards.
+ *
+ * In normal multi-backend operation, this is called once during
+ * postmaster startup, and not at all by individual backend startup
+ * (since the backends inherit an already-initialized context subsystem
+ * by virtue of being forked off the postmaster).
+ *
+ * In a standalone backend this must be called during backend startup.
+ */
+void
+MemoryContextInit(void)
+{
+	AssertState(TopMemoryContext == NULL);	/* must not be initialized twice */
+
+	/*
+	 * Initialize TopMemoryContext as an AllocSetContext with slow growth rate
+	 * --- we don't really expect much to be allocated in it.  Because
+	 * TopMemoryContext is still NULL during this call, MemoryContextCreate()
+	 * takes its special path and gets the context node from malloc().
+	 *
+	 * This context is shared between different threads and must be made
+	 * thread-safe.
+	 */
+	TopMemoryContext = AllocSetContextCreate((MemoryContext) NULL,
+											 "TopMemoryContext",
+											 0,
+											 8 * 1024,
+											 8 * 1024,
+											 true);	/* presumably the "shared between threads" flag -- confirm */
+
+	TopMostMemoryContext = TopMemoryContext;	/* keep a second handle on the context just created */
+
+	/*
+	 * Not having any other place to point CurrentMemoryContext, make it point
+	 * to TopMemoryContext.  Caller should change this soon!
+	 */
+	CurrentMemoryContext = TopMemoryContext;
+
+	/*
+	 * Initialize ErrorContext as an AllocSetContext with slow growth rate ---
+	 * we don't really expect much to be allocated in it. More to the point,
+	 * require it to contain at least 8K at all times. This is the only case
+	 * where retained memory in a context is *essential* --- we want to be
+	 * sure ErrorContext still has some memory even if we've run out
+	 * elsewhere!
+	 *
+	 * Similar to TopMostMemoryContext, this context may as well be shared
+	 * between threads
+	 */
+	ErrorContext = AllocSetContextCreate(TopMemoryContext,
+										 "ErrorContext",
+										 8 * 1024,	/* presumably the 8K that must be retained at all times -- confirm */
+										 8 * 1024,
+										 8 * 1024,
+										 true);	/* presumably the "shared between threads" flag -- confirm */
+}
+
+/*
+ * MemoryContextReset
+ *		Release all space allocated within a context and its descendants,
+ *		but don't delete the contexts themselves.
+ *
+ * The type-specific reset routine handles the context itself.  The children
+ * are walked here, under this context's lock when it is shared, instead of
+ * via MemoryContextResetChildren(): that routine takes the same lock itself,
+ * so calling it with the lock already held would relock and self-deadlock.
+ */
+void
+MemoryContextReset(MemoryContext context)
+{
+	MemoryContext child;
+
+	AssertArg(MemoryContextIsValid(context));
+
+	if (MemoryContextIsShared(context))
+		MemoryContextLock(context);
+	for (child = context->firstchild; child != NULL; child = child->nextchild)
+		MemoryContextReset(child);
+	if (MemoryContextIsShared(context))
+		MemoryContextUnlock(context);
+	(*context->methods->reset) (context);
+}
+
+/*
+ * MemoryContextResetChildren
+ *		Reset every descendant of the named context: the space allocated
+ *		in them is released, but the contexts themselves are kept.  The
+ *		named context's own allocations are not touched.
+ */
+void
+MemoryContextResetChildren(MemoryContext context)
+{
+	MemoryContext kid;
+
+	AssertArg(MemoryContextIsValid(context));
+
+	/* A shared parent stays locked while its child contexts are reset. */
+	if (MemoryContextIsShared(context))
+		MemoryContextLock(context);
+	kid = context->firstchild;
+	while (kid != NULL)
+	{
+		MemoryContextReset(kid);
+		kid = kid->nextchild;
+	}
+
+	if (MemoryContextIsShared(context))
+		MemoryContextUnlock(context);
+}
+
+/*
+ * MemoryContextDeleteInternal
+ *		Delete a context and its descendants, and release all space
+ *		allocated therein.  Worker routine behind MemoryContextDelete().
+ *
+ * The type-specific delete routine removes the context's subsidiary
+ * storage; here we recurse into the children, delink the node from its
+ * parent (if any) and free the node itself.  'parent_locked' says that the
+ * caller already holds the lock of a shared parent.
+ */
+static void
+MemoryContextDeleteInternal(MemoryContext context, bool parent_locked)
+{
+	AssertArg(MemoryContextIsValid(context));
+	/* We had better not be deleting TopMemoryContext ... */
+	Assert(context != TopMemoryContext);
+	/* And not CurrentMemoryContext, either */
+	Assert(context != CurrentMemoryContext);
+
+	MemoryContextDeleteChildren(context);	/* descendants go first */
+
+	/*
+	 * We delink the context from its parent before deleting it, so that if
+	 * there's an error we won't have deleted/busted contexts still attached
+	 * to the context tree.  Better a leak than a crash.
+	 */
+	if (context->parent)
+	{
+		MemoryContext parent = context->parent;
+
+		/*
+		 * If the parent context is shared and is already locked by the caller,
+		 * no need to relock again. In fact, that's not the right thing to do
+		 * since it will lead to a self-deadlock
+		 */
+		if (MemoryContextIsShared(parent) && (!parent_locked))
+			MemoryContextLock(parent);
+
+		if (context == parent->firstchild)	/* context heads the parent's child list */
+			parent->firstchild = context->nextchild;
+		else
+		{
+			MemoryContext child;
+
+			for (child = parent->firstchild; child; child = child->nextchild)	/* find the sibling linked just before context */
+			{
+				if (context == child->nextchild)
+				{
+					child->nextchild = context->nextchild;
+					break;
+				}
+			}
+		}
+
+		if (MemoryContextIsShared(parent) && (!parent_locked))
+			MemoryContextUnlock(parent);
+	}
+	(*context->methods->delete) (context);	/* type-specific removal of subsidiary storage */
+	pfree(context);	/* finally release the context node itself */
+}
+
+void
+MemoryContextDelete(MemoryContext context)
+{
+	MemoryContextDeleteInternal(context, false);	/* false: we are not holding the parent's lock */
+}
+
+/*
+ * MemoryContextDeleteChildren
+ *		Delete every descendant of the named context, releasing all space
+ *		allocated in them.  The named context itself is left untouched.
+ */
+void
+MemoryContextDeleteChildren(MemoryContext context)
+{
+	MemoryContext head;
+
+	AssertArg(MemoryContextIsValid(context));
+
+	if (MemoryContextIsShared(context))
+		MemoryContextLock(context);
+
+	/*
+	 * Deleting a child delinks it from us, so just keep taking the list head.
+	 * Our lock (if any) is already held: pass true so it is not taken again.
+	 */
+	while ((head = context->firstchild) != NULL)
+		MemoryContextDeleteInternal(head, true);
+
+	if (MemoryContextIsShared(context))
+		MemoryContextUnlock(context);
+}
+
+/*
+ * MemoryContextResetAndDeleteChildren
+ *		Delete all descendants of a context, then release the space allocated
+ *		in the context itself; the context node is preserved.
+ */
+void
+MemoryContextResetAndDeleteChildren(MemoryContext context)
+{
+	MemoryContextMethods *ops;
+
+	AssertArg(MemoryContextIsValid(context));
+
+	MemoryContextDeleteChildren(context);
+	ops = context->methods;
+	(*ops->reset) (context);
+}
+
+/*
+ * GetMemoryChunkSpace
+ *		Report the total space taken up by a currently-allocated chunk,
+ *		memory-allocation overhead included.
+ *
+ * Handy when adding up the total space occupied by a set of allocated
+ * chunks.
+ */
+Size
+GetMemoryChunkSpace(void *pointer)
+{
+	StandardChunkHeader *chunk;
+	MemoryContext owner;
+
+	/*
+	 * Weak sanity checks on the pointer: unless it is non-NULL and
+	 * MAXALIGNED it cannot be pointing at an allocated chunk.
+	 */
+	Assert(pointer != NULL);
+	Assert(pointer == (void *) MAXALIGN(pointer));
+
+	/*
+	 * The standard chunk header sits right in front of the chunk itself.
+	 */
+	chunk = (StandardChunkHeader *)
+		((char *) pointer - STANDARDCHUNKHEADERSIZE);
+	owner = chunk->context;
+
+	AssertArg(MemoryContextIsValid(owner));
+
+	return (*owner->methods->get_chunk_space) (owner, pointer);
+}
+
+/*
+ * GetMemoryChunkContext
+ *		Look up the memory context that a currently-allocated chunk
+ *		belongs to.
+ */
+MemoryContext
+GetMemoryChunkContext(void *pointer)
+{
+	StandardChunkHeader *chunk;
+	MemoryContext owner;
+
+	/*
+	 * Weak sanity checks on the pointer, poor though they are: unless it
+	 * is non-NULL and MAXALIGNED it cannot be pointing at an allocated
+	 * chunk.
+	 */
+	Assert(pointer != NULL);
+	Assert(pointer == (void *) MAXALIGN(pointer));
+
+	/* Step back over the standard chunk header to find the owner. */
+	chunk = (StandardChunkHeader *)
+		((char *) pointer - STANDARDCHUNKHEADERSIZE);
+	owner = chunk->context;
+
+	AssertArg(MemoryContextIsValid(owner));
+
+	return owner;
+}
+
+/*
+ * MemoryContextIsEmpty
+ *		Tell whether a memory context holds no allocated space.
+ */
+bool
+MemoryContextIsEmpty(MemoryContext context)
+{
+	AssertArg(MemoryContextIsValid(context));
+
+	/*
+	 * A context that has children is treated as nonempty for now (this may
+	 * be changed later).  A childless context is asked through its
+	 * type-specific is_empty method.
+	 */
+	if (context->firstchild == NULL)
+		return (*context->methods->is_empty) (context);
+	return false;
+}
+
+/*
+ * MemoryContextStats
+ *		Print statistics about the named context and all its descendants.
+ *
+ * This is just a debugging utility, so it's not fancy.  Each context is
+ * reported through its own stats method (presumably to stderr -- confirm).
+ */
+void
+MemoryContextStats(MemoryContext context)
+{
+	MemoryContextStatsInternal(context, 0);	/* the named context is reported at level 0 */
+}
+
+static void
+MemoryContextStatsInternal(MemoryContext context, int level)
+{
+	MemoryContext kid;
+	int			kidlevel = level + 1;	/* children are reported one level deeper */
+
+	AssertArg(MemoryContextIsValid(context));
+	(*context->methods->stats) (context, level);
+	for (kid = context->firstchild; kid; kid = kid->nextchild)
+		MemoryContextStatsInternal(kid, kidlevel);
+}
+
+/*
+ * MemoryContextCheck
+ *		Debugging aid: run the type-specific check method on the named
+ *		context and, recursively, on each of its descendants.  Only built
+ *		when MEMORY_CONTEXT_CHECKING is defined.
+ */
+#ifdef MEMORY_CONTEXT_CHECKING
+void
+MemoryContextCheck(MemoryContext context)
+{
+	MemoryContext kid;
+
+	AssertArg(MemoryContextIsValid(context));
+	/* Check this context first, then recurse into each child. */
+	(*context->methods->check) (context);
+	for (kid = context->firstchild; kid; kid = kid->nextchild)
+		MemoryContextCheck(kid);
+}
+#endif
+
+/*
+ * MemoryContextContains
+ *		Detect whether an allocated chunk of memory belongs to a given
+ *		context or not.
+ *
+ * Caution: this test is reliable as long as 'pointer' does point to
+ * a chunk of memory allocated from *some* context.  If 'pointer' points
+ * at memory obtained in some other way, there is a small chance of a
+ * false-positive result, since the bits right before it might look like
+ * a valid chunk header by chance.
+ */
+bool
+MemoryContextContains(MemoryContext context, void *pointer)
+{
+	StandardChunkHeader *chunk;
+
+	/*
+	 * Weed out obviously bogus pointers, poorly though we can: a NULL or
+	 * non-MAXALIGNED pointer presumably isn't pointing at an allocated
+	 * chunk.
+	 */
+	if (pointer == NULL)
+		return false;
+	if (pointer != (void *) MAXALIGN(pointer))
+		return false;
+
+	/* Probably safe now to look at the chunk header in front of it. */
+	chunk = (StandardChunkHeader *)
+		((char *) pointer - STANDARDCHUNKHEADERSIZE);
+
+	/*
+	 * A chunk whose context link doesn't match is certainly not a member.
+	 * A matching link must also come with a reasonable-looking size, as an
+	 * extra guard against being fooled by bogus pointers.
+	 */
+	if (chunk->context != context)
+		return false;
+	return AllocSizeIsValid(chunk->size) ? true : false;
+}
+
+/*--------------------
+ * MemoryContextCreate
+ *		Context-type-independent part of context creation.
+ *
+ * This is only intended to be called by context-type-specific
+ * context creation routines, not by the unwashed masses.
+ *
+ * The context creation procedure is a little bit tricky because
+ * we want to be sure that we don't leave the context tree invalid
+ * in case of failure (such as insufficient memory to allocate the
+ * context node itself).  The procedure goes like this:
+ *	1.	Context-type-specific routine first calls MemoryContextCreate(),
+ *		passing the appropriate tag/size/methods values (the methods
+ *		pointer will ordinarily point to statically allocated data).
+ *		The parent and name parameters usually come from the caller.
+ *	2.	MemoryContextCreate() attempts to allocate the context node,
+ *		plus space for the name.  If this fails we can ereport() with no
+ *		damage done.
+ *	3.	We fill in all of the type-independent MemoryContext fields.
+ *	4.	We call the type-specific init routine (using the methods pointer).
+ *		The init routine is required to make the node minimally valid
+ *		with zero chance of failure --- it can't allocate more memory,
+ *		for example.
+ *	5.	Now we have a minimally valid node that can behave correctly
+ *		when told to reset or delete itself.  We link the node to its
+ *		parent (if any), making the node part of the context tree.
+ *	6.	We return to the context-type-specific routine, which finishes
+ *		up type-specific initialization.  This routine can now do things
+ *		that might fail (like allocate more memory), so long as it's
+ *		sure the node is left in a state that delete will handle.
+ *
+ * This protocol doesn't prevent us from leaking memory if step 6 fails
+ * during creation of a top-level context, since there's no parent link
+ * in that case.  However, if you run out of memory while you're building
+ * a top-level context, you might as well go home anyway...
+ *
+ * Normally, the context node and the name are allocated from
+ * TopMemoryContext (NOT from the parent context, since the node must
+ * survive resets of its parent context!).	However, this routine is itself
+ * used to create TopMemoryContext!  If we see that TopMemoryContext is NULL,
+ * we assume we are creating TopMemoryContext and use malloc() to allocate
+ * the node.
+ *
+ * Note that the name field of a MemoryContext does not point to
+ * separately-allocated storage, so it should not be freed at context
+ * deletion.
+ *--------------------
+ */
+MemoryContext
+MemoryContextCreate(Size size,
+					MemoryContextMethods *methods,
+					MemoryContext parent,
+					const char *name)
+{
+	MemoryContext node;
+	Size		needed = size + strlen(name) + 1;	/* node, then the name and its terminating null */
+
+
+	/* Get space for node and name */
+	if (TopMemoryContext != NULL)
+	{
+		/* Normal case: allocate the node in TopMemoryContext */
+		node = (MemoryContext) MemoryContextAlloc(TopMemoryContext,
+												  needed);
+	}
+	else
+	{
+		/* Special case for startup: use good ol' malloc */
+		node = (MemoryContext) malloc(needed);
+		Assert(node != NULL);	/* NOTE(review): malloc failure is caught only by this Assert -- presumably a no-op without assert checking; confirm */
+	}
+
+	/* Initialize the node as best we can */
+	MemSet(node, 0, size);	/* zeroes the node part only; the name area is filled by strcpy below */
+	node->methods = methods;
+	node->parent = NULL;		/* for the moment */
+	node->firstchild = NULL;
+	node->nextchild = NULL;
+	node->name = ((char *) node) + size;	/* the name is stored right behind the node */
+	strcpy(node->name, name);
+
+	/* Type-specific routine finishes any other essential initialization */
+	(*node->methods->init) (node);
+
+	/*
+	 * Lock the parent context if it is shared and must be made thread-safe
+	 */
+	if ((parent != NULL) && (MemoryContextIsShared(parent)))
+		MemoryContextLock(parent);
+
+	/* OK to link node to parent (if any) */
+	if (parent)
+	{
+		node->parent = parent;
+		node->nextchild = parent->firstchild;	/* push onto the front of the parent's child list */
+		parent->firstchild = node;
+	}
+
+	if ((parent != NULL) && (MemoryContextIsShared(parent)))
+		MemoryContextUnlock(parent);
+
+	/* Return to type-specific creation routine to finish up */
+	return node;
+}
+
+/*
+ * MemoryContextAlloc
+ *		Allocate 'size' bytes within the specified context, using the
+ *		context's own alloc method.  Requests that fail AllocSizeIsValid()
+ *		are reported through elog(ERROR).
+ */
+void *
+MemoryContextAlloc(MemoryContext context, Size size)
+{
+	void	   *chunk;
+
+	AssertArg(MemoryContextIsValid(context));
+	if (!AllocSizeIsValid(size))
+		elog(ERROR, "invalid memory alloc request size %lu",
+			 (unsigned long) size);
+	chunk = (*context->methods->alloc) (context, size);
+	return chunk;
+}
+
+/*
+ * MemoryContextAllocZero
+ *		Like MemoryContextAlloc, but the allocated memory is cleared
+ *
+ *	Allocate-then-clear is a very common combination, which is why it is
+ *	offered as a single operation instead of leaving the clearing to callers.
+ */
+void *
+MemoryContextAllocZero(MemoryContext context, Size size)
+{
+	void	   *chunk;
+
+	AssertArg(MemoryContextIsValid(context));
+
+	/* Refuse requests that AllocSizeIsValid() rejects. */
+	if (!AllocSizeIsValid(size))
+		elog(ERROR, "invalid memory alloc request size %lu",
+			 (unsigned long) size);
+
+	/* Allocate from the context, then clear the new chunk. */
+	chunk = (*context->methods->alloc) (context, size);
+	MemSetAligned(chunk, 0, size);
+	return chunk;
+}
+
+/*
+ * MemoryContextAllocZeroAligned
+ *		MemoryContextAllocZero where length is suitable for MemSetLoop
+ *
+ *	The chunk is cleared with MemSetLoop() instead of MemSetAligned(), so
+ *	callers have to pass a size that MemSetLoop() can handle.
+ */
+void *
+MemoryContextAllocZeroAligned(MemoryContext context, Size size)
+{
+	void	   *chunk;
+
+	AssertArg(MemoryContextIsValid(context));
+
+	/* Oversized requests are reported before anything is allocated. */
+	if (!AllocSizeIsValid(size))
+		elog(ERROR, "invalid memory alloc request size %lu",
+			 (unsigned long) size);
+
+	/* Get the chunk from the context and zero it with MemSetLoop. */
+	chunk = (*context->methods->alloc) (context, size);
+	MemSetLoop(chunk, 0, size);
+	return chunk;
+}
+
+/*
+ * pfree
+ *		Hand an allocated chunk back to the context that owns it.
+ */
+void
+pfree(void *pointer)
+{
+	StandardChunkHeader *chunk;
+	MemoryContext owner;
+
+	/*
+	 * Weak sanity checks on the pointer, poor though they are: unless it
+	 * is non-NULL and MAXALIGNED it cannot be pointing at an allocated
+	 * chunk.
+	 */
+	Assert(pointer != NULL);
+	Assert(pointer == (void *) MAXALIGN(pointer));
+
+	/* The chunk header in front of the chunk names the owning context. */
+	chunk = (StandardChunkHeader *)
+		((char *) pointer - STANDARDCHUNKHEADERSIZE);
+	owner = chunk->context;
+
+	AssertArg(MemoryContextIsValid(owner));
+
+	(*owner->methods->free_p) (owner, pointer);
+}
+
+/*
+ * repalloc
+ *		Change the size of a chunk that was allocated earlier.
+ */
+void *
+repalloc(void *pointer, Size size)
+{
+	StandardChunkHeader *chunk;
+	MemoryContext owner;
+
+	/*
+	 * Weak sanity checks on the pointer, poor though they are: unless it
+	 * is non-NULL and MAXALIGNED it cannot be pointing at an allocated
+	 * chunk.
+	 */
+	Assert(pointer != NULL);
+	Assert(pointer == (void *) MAXALIGN(pointer));
+
+	/* The chunk header in front of the chunk names the owning context. */
+	chunk = (StandardChunkHeader *)
+		((char *) pointer - STANDARDCHUNKHEADERSIZE);
+	owner = chunk->context;
+
+	AssertArg(MemoryContextIsValid(owner));
+
+	if (!AllocSizeIsValid(size))
+		elog(ERROR, "invalid memory alloc request size %lu",
+			 (unsigned long) size);
+
+	/* The owning context's realloc method does the actual resizing. */
+	return (*owner->methods->realloc) (owner, pointer, size);
+}
+
+/*
+ * MemoryContextSwitchTo
+ *		Returns the current context; installs the given context.
+ *
+ * This is inlined when using GCC.
+ *
+ * TODO: investigate supporting inlining for some non-GCC compilers.
+ */
+MemoryContext
+MemoryContextSwitchTo(MemoryContext context)
+{
+	MemoryContext previous;
+
+	AssertArg(MemoryContextIsValid(context));
+	/* Remember the outgoing context, then install the new one. */
+	previous = CurrentMemoryContext;
+	CurrentMemoryContext = context;
+	return previous;
+}
+
+/*
+ * MemoryContextStrdup
+ *		Like strdup(), but the copy (terminating null included) is
+ *		allocated from the specified context via MemoryContextAlloc().
+ */
+char *
+MemoryContextStrdup(MemoryContext context, const char *string)
+{
+	Size		nbytes = strlen(string) + 1;	/* count the terminator too */
+	char	   *copy = (char *) MemoryContextAlloc(context, nbytes);
+
+	/* nbytes covers the terminating null, so the copy comes out terminated */
+	memcpy(copy, string, nbytes);
+
+	return copy;
+}
+
+/*
+ * pnstrdup
+ *		palloc a copy of the first 'len' bytes of 'in' and null-terminate it.
+ */
+char *
+pnstrdup(const char *in, Size len)
+{
+	char	   *copy;
+
+	copy = (char *) palloc(len + 1);	/* one extra byte for the terminator */
+	memcpy(copy, in, len);
+	copy[len] = '\0';
+	return copy;
+}
+
+
+#if defined(WIN32) || defined(__CYGWIN__)
+/*
+ *	Memory support routines for libpgport on Win32
+ *
+ *	Win32 can't load a library that PGDLLIMPORTs a variable
+ *	if the link object files also PGDLLIMPORT the same variable.
+ *	For this reason, libpgport can't reference CurrentMemoryContext
+ *	in the palloc macro calls.
+ *
+ *	To fix this, we create several functions here that allow us to
+ *	manage memory without doing the inline in libpgport.
+ */
+void *
+pgport_palloc(Size sz)
+{
+	return palloc(sz);	/* real function standing in for the palloc macro */
+}
+
+
+char *
+pgport_pstrdup(const char *str)
+{
+	return pstrdup(str);	/* same idea, for pstrdup */
+}
+
+
+/* Doesn't reference a PGDLLIMPORT variable, but here for completeness. */
+void
+pgport_pfree(void *pointer)
+{
+	pfree(pointer);
+}
+
+#endif   /* WIN32 || __CYGWIN__ */
diff --git a/src/gtm/common/stringinfo.c b/src/gtm/common/stringinfo.c
new file mode 100644
index 0000000000..5023bd9893
--- /dev/null
+++ b/src/gtm/common/stringinfo.c
@@ -0,0 +1,280 @@
+/*-------------------------------------------------------------------------
+ *
+ * stringinfo.c
+ *
+ * StringInfo provides an indefinitely-extensible string data type.
+ * It can be used to buffer either ordinary C strings (null-terminated text)
+ * or arbitrary binary data.  All storage is allocated with palloc().
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *	  $PostgreSQL: pgsql/src/backend/lib/stringinfo.c,v 1.49 2008/01/01 19:45:49 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "gtm/gtm_c.h"
+#include "gtm/stringinfo.h"
+#include "gtm/memutils.h"
+#include "gtm/elog.h"
+#include "gtm/assert.h"
+#include "gtm/gtm.h"
+
+
+/*
+ * makeStringInfo
+ *
+ * palloc a new StringInfoData, initialize it to the empty string with
+ * initStringInfo(), and return a pointer to it.
+ */
+StringInfo
+makeStringInfo(void)
+{
+	StringInfo	buf;
+
+	buf = (StringInfo) palloc(sizeof(StringInfoData));
+	initStringInfo(buf);
+
+	return buf;
+}
+
+/*
+ * initStringInfo
+ *
+ * Set up a StringInfoData struct whose contents are so far undefined:
+ * give it a freshly palloc'd buffer and make it describe an empty string.
+ */
+void
+initStringInfo(StringInfo str)
+{
+	const int	initial_size = 1024;	/* default buffer size to start with */
+
+	str->data = (char *) palloc(initial_size);
+	str->maxlen = initial_size;
+	resetStringInfo(str);
+}
+
+/*
+ * resetStringInfo
+ *
+ * Empty the StringInfo: the data buffer stays valid and allocated, but
+ * whatever it held before is discarded and the read cursor is rewound.
+ */
+void
+resetStringInfo(StringInfo str)
+{
+	str->cursor = 0;
+	str->len = 0;
+	str->data[0] = '\0';
+}
+
+/*
+ * appendStringInfo
+ *
+ * Format text data under the control of fmt (an sprintf-style format string)
+ * and append the result to whatever str already holds, allocating more
+ * space for str when necessary.  In effect a combination of sprintf and
+ * strcat.
+ */
+void
+appendStringInfo(StringInfo str, const char *fmt,...)
+{
+	va_list		ap;
+	bool		done;
+
+	do
+	{
+		/*
+		 * Every attempt needs a freshly started argument list.
+		 */
+		va_start(ap, fmt);
+		done = appendStringInfoVA(str, fmt, ap);
+		va_end(ap);
+
+		/* Out of room: double the buffer size, then go around again. */
+		if (!done)
+			enlargeStringInfo(str, str->maxlen);
+	} while (!done);
+}
+
+/*
+ * appendStringInfoVA
+ *
+ * Attempt to format text data under the control of fmt (an sprintf-style
+ * format string) and append it to whatever is already in str.	If successful
+ * return true; if not (because there's not enough space), return false
+ * without modifying str.  Typically the caller would enlarge str and retry
+ * on false return --- see appendStringInfo for standard usage pattern.
+ *
+ * XXX This API is ugly, but there seems no alternative given the C spec's
+ * restrictions on what can portably be done with va_list arguments: you have
+ * to redo va_start before you can rescan the argument list, and we can't do
+ * that from here.
+ */
+bool
+appendStringInfoVA(StringInfo str, const char *fmt, va_list args)
+{
+	int			room;
+	int			written;
+
+	Assert(str != NULL);
+
+	/*
+	 * With fewer than 16 bytes free it isn't worth trying: report failure
+	 * so that the caller enlarges the buffer first.
+	 */
+	room = str->maxlen - str->len - 1;
+	if (room < 16)
+		return false;
+
+	/*
+	 * Plant a null in the buffer's last byte; the Assert after the call then
+	 * catches a buggy vsnprintf that overruns the specified buffer length
+	 * (Solaris 7 in 64-bit mode is an example of such a platform).
+	 */
+#ifdef USE_ASSERT_CHECKING
+	str->data[str->maxlen - 1] = '\0';
+#endif
+
+	written = vsnprintf(str->data + str->len, room, fmt, args);
+
+	Assert(str->data[str->maxlen - 1] == '\0');
+
+	/*
+	 * Be conservative about believing the print worked: some vsnprintf
+	 * versions return the number of chars actually stored, and at least one
+	 * returns -1 on failure.
+	 */
+	if (written < 0 || written >= room - 1)
+	{
+		/* Failure: put the trailing null back so that str is unmodified. */
+		str->data[str->len] = '\0';
+		return false;
+	}
+
+	/* Success.  Note that written does not include the trailing null. */
+	str->len += written;
+	return true;
+}
+
+/*
+ * appendStringInfoString
+ *		Append a null-terminated string to str.  Gives the same result as
+ *		appendStringInfo(str, "%s", s), only faster.
+ */
+void
+appendStringInfoString(StringInfo str, const char *s)
+{
+	int			slen = strlen(s);	/* terminating null not counted */
+	appendBinaryStringInfo(str, s, slen);
+}
+
+/*
+ * appendStringInfoChar
+ *
+ * Append one byte to str.  Gives the same result as
+ * appendStringInfo(str, "%c", ch), only much faster.
+ */
+void
+appendStringInfoChar(StringInfo str, char ch)
+{
+	/* Enlarge the buffer unless the byte plus a trailing null still fit. */
+	if (str->maxlen <= str->len + 1)
+		enlargeStringInfo(str, 1);
+
+	/* Store the byte, then put the terminator right behind it. */
+	str->data[str->len] = ch;
+	str->len += 1;
+	str->data[str->len] = '\0';
+}
+
+/*
+ * appendBinaryStringInfo
+ *
+ * Append datalen bytes of arbitrary binary data to a StringInfo; the
+ * buffer is enlarged first when that is needed.
+ */
+void
+appendBinaryStringInfo(StringInfo str, const char *data, int datalen)
+{
+	Assert(str != NULL);
+
+	/* Grow the buffer first if datalen more bytes don't fit. */
+	enlargeStringInfo(str, datalen);
+
+	/* Copy the new bytes in behind the existing contents. */
+	memcpy(&str->data[str->len], data, datalen);
+	str->len = str->len + datalen;
+
+	/*
+	 * Put a trailing null back in place, though it is probably of no use
+	 * for binary data...
+	 */
+	str->data[str->len] = '\0';
+}
+
+/*
+ * enlargeStringInfo
+ *
+ * Make sure there is enough space for 'needed' more bytes
+ * ('needed' does not include the terminating null).
+ *
+ * External callers usually need not concern themselves with this, since
+ * all stringinfo.c routines do it automatically.  However, if a caller
+ * knows that a StringInfo will eventually become X bytes large, it
+ * can save some palloc overhead by enlarging the buffer before starting
+ * to store data in it.
+ *
+ * NB: because we use repalloc() to enlarge the buffer, the string buffer
+ * will remain allocated in the same memory context that was current when
+ * initStringInfo was called, even if another context is now current.
+ * This is the desired and indeed critical behavior!
+ */
+void
+enlargeStringInfo(StringInfo str, int needed)
+{
+	int			target;
+
+	/*
+	 * Reject out-of-range "needed" values up front; otherwise the arithmetic
+	 * below could overflow or loop forever.
+	 */
+	if (needed < 0)				/* should not happen */
+		elog(ERROR, "invalid string enlargement request size: %d", needed);
+	if (((Size) needed) >= (MaxAllocSize - (Size) str->len))
+		ereport(ERROR,
+				(ENOSPC,
+				 errmsg("out of memory"),
+				 errdetail("Cannot enlarge string buffer containing %d bytes by %d more bytes.",
+						   str->len, needed)));
+
+	/*
+	 * Total space required now; the test above keeps it <= MaxAllocSize.
+	 */
+	needed += str->len + 1;
+
+	if (str->maxlen >= needed)
+		return;					/* got enough space already */
+
+	/*
+	 * Allocating just a little more with each append would be inefficient,
+	 * so the buffer size is doubled whenever it overflows; when 'needed' is
+	 * big this can take more than one doubling.
+	 */
+	for (target = 2 * str->maxlen; target < needed; target *= 2)
+		;
+
+	/*
+	 * Clamp to MaxAllocSize in case doubling overshot it.  This assumes
+	 * MaxAllocSize <= INT_MAX/2, else the loop above could overflow; the
+	 * clamped size is still >= needed.
+	 */
+	if (target > (int) MaxAllocSize)
+		target = (int) MaxAllocSize;
+
+	/* repalloc keeps the buffer in the context it was allocated in. */
+	str->data = (char *) repalloc(str->data, target);
+	str->maxlen = target;
+}
diff --git a/src/gtm/gtm_ctl/Makefile b/src/gtm/gtm_ctl/Makefile
new file mode 100644
index 0000000000..eddcc9aebe
--- /dev/null
+++ b/src/gtm/gtm_ctl/Makefile
@@ -0,0 +1,22 @@
+# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+
+top_build_dir=../..
+include $(top_build_dir)/gtm/Makefile.global
+
+OBJS=gtm_ctl.o ../common/libgtm.a ../libpq/libpqcomm.a ../client/libgtmclient.a ../path/libgtmpath.a
+LDFLAGS=-L$(top_build_dir)/common -L$(top_build_dir)/libpq
+LIBS=-lpthread
+
+.PHONY: all clean distclean maintainer-clean
+
+# Libraries go after the objects/archives so the linker can resolve their symbols.
+gtm_ctl:$(OBJS)
+	$(CC) $(CFLAGS) $(LDFLAGS) $^ $(LIBS) -o $@
+
+all:gtm_ctl
+
+clean:
+	rm -f $(OBJS) gtm_ctl
+
+distclean: clean
+maintainer-clean: distclean
diff --git a/src/gtm/gtm_ctl/gtm_ctl.c b/src/gtm/gtm_ctl/gtm_ctl.c
new file mode 100644
index 0000000000..3b01796484
--- /dev/null
+++ b/src/gtm/gtm_ctl/gtm_ctl.c
@@ -0,0 +1,918 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_ctl --- starts/stops/restarts the GTM server/proxy
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "gtm/gtm_c.h"
+#include "gtm/libpq-fe.h"
+
+#include <locale.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#ifdef HAVE_SYS_RESOURCE_H
+#include <sys/time.h>
+#include <sys/resource.h>
+#endif
+
+#include "libpq/pqsignal.h"
+
+/*
+ * Process ID as read from the PID file.
+ * PID can be negative for standalone backend
+ */
+typedef long pgpid_t;
+
+/* Shutdown styles selectable with -m; set_mode() pairs each with a signal */
+typedef enum
+{
+	SMART_MODE,
+	FAST_MODE,
+	IMMEDIATE_MODE
+} ShutdownMode;
+
+
+/* Operation requested on the command line */
+typedef enum
+{
+	NO_COMMAND = 0,
+	START_COMMAND,
+	STOP_COMMAND,
+	RESTART_COMMAND,
+} CtlCommand;
+
+/* Default value of wait_seconds, the bound of the one-second polling loops */
+#define DEFAULT_WAIT	60
+
+/* whether to wait for the operation to complete (-w / -W, or per-command default) */
+static bool do_wait = false;
+/* true once -w or -W has been seen, so main() does not apply its default */
+static bool wait_set = false;
+/* upper bound on the number of polls in the wait loops (-t) */
+static int	wait_seconds = DEFAULT_WAIT;
+/* when true, print_msg() suppresses its output */
+static bool silent_mode = false;
+/* shutdown mode chosen by set_mode() */
+static ShutdownMode shutdown_mode = SMART_MODE;
+/* signal passed to kill() by do_stop() and do_restart() */
+static int	sig = SIGTERM;		/* default */
+/* action picked from the command line */
+static CtlCommand ctl_command = NO_COMMAND;
+/* canonicalized copy of the GTMDATA environment variable */
+static char *gtm_data = NULL;
+/* "-D "dir" " fragment built from the -D option for the server command line */
+static char *gtmdata_opt = NULL;
+/* extra server options, from -o or from the options file on restart */
+static char *gtm_opts = NULL;
+/* program name used as prefix in messages */
+static const char *progname;
+/* file the server output is appended to (-l) */
+static char *log_file = NULL;
+/* directory holding the gtm/gtm_proxy executables (-p) */
+static char *gtm_path = NULL;
+/* executable to act on, "gtm" or "gtm_proxy" (-S) */
+static char *gtm_app = NULL;
+/* saved copy of argv[0] */
+static char *argv0 = NULL;
+
+static void
+write_stderr(const char *fmt,...)
+/* This extension allows gcc to check the format string for consistency with
+   the supplied arguments. */
+__attribute__((format(printf, 1, 2)));
+static void *pg_malloc(size_t size);
+static char *xstrdup(const char *s);
+static void do_advice(void);
+static void do_help(void);
+static void set_mode(char *modeopt);
+static void do_start(void);
+static void do_stop(void);
+static void do_restart(void);
+static void print_msg(const char *msg);
+
+static pgpid_t get_pgpid(void);
+static char **readfile(const char *path);
+static int	start_gtm(void);
+static void read_gtm_opts(void);
+
+static bool test_gtm_connection();
+static bool gtm_is_alive(pid_t pid);
+
+static char gtmopts_file[MAXPGPATH];
+static char pid_file[MAXPGPATH];
+
+/*
+ * Report a message on stderr.
+ *
+ * Takes a printf-style format string plus its arguments and forwards
+ * them unchanged to vfprintf().
+ */
+static void
+write_stderr(const char *fmt,...)
+{
+	va_list		args;
+
+	/* plain stderr output is all that is needed here */
+	va_start(args, fmt);
+	vfprintf(stderr, fmt, args);
+	va_end(args);
+}
+
+/*
+ * Allocation helpers that report failure and terminate the program
+ * instead of handing NULL back to the caller.
+ */
+
+/*
+ * malloc() wrapper: returns the new chunk, or prints "out of memory"
+ * and exits with status 1.
+ */
+static void *
+pg_malloc(size_t size)
+{
+	void	   *chunk = malloc(size);
+
+	if (chunk != NULL)
+		return chunk;
+
+	/* allocation failed: complain and give up */
+	write_stderr(_("%s: out of memory\n"), progname);
+	exit(1);
+}
+
+
+/*
+ * strdup() wrapper: returns a fresh copy of s, or prints "out of memory"
+ * and exits with status 1.
+ */
+static char *
+xstrdup(const char *s)
+{
+	char	   *copy = strdup(s);
+
+	if (copy != NULL)
+		return copy;
+
+	/* allocation failed: complain and give up */
+	write_stderr(_("%s: out of memory\n"), progname);
+	exit(1);
+}
+
+/*
+ * Write an already-localized message to stdout and flush it, unless
+ * silent_mode asks for informational output to be suppressed.
+ */
+static void
+print_msg(const char *msg)
+{
+	if (silent_mode)
+		return;
+
+	fputs(msg, stdout);
+	fflush(stdout);
+}
+
+/*
+ * Read the server's process ID from pid_file.
+ *
+ * Returns 0 when the PID file does not exist.  Any other failure to open
+ * the file, or contents that do not start with a number, is reported on
+ * stderr and terminates the program.
+ */
+static pgpid_t
+get_pgpid(void)
+{
+	long		value;
+	FILE	   *fp = fopen(pid_file, "r");
+
+	if (fp == NULL)
+	{
+		/* a missing PID file is not an error on startup */
+		if (errno == ENOENT)
+			return 0;
+
+		write_stderr(_("%s: could not open PID file \"%s\": %s\n"),
+					 progname, pid_file, strerror(errno));
+		exit(1);
+	}
+
+	if (fscanf(fp, "%ld", &value) != 1)
+	{
+		write_stderr(_("%s: invalid data in PID file \"%s\"\n"),
+					 progname, pid_file);
+		exit(1);
+	}
+
+	fclose(fp);
+	return (pgpid_t) value;
+}
+
+
+/*
+ * Read a text file into a NULL-terminated array of lines.
+ *
+ * path: name of the file to read.
+ *
+ * Returns a malloc'd array of malloc'd strings (each line keeps its
+ * trailing newline, if it had one) terminated by a NULL entry, or NULL
+ * if the file can't be opened.  Running out of memory terminates the
+ * program.
+ */
+static char **
+readfile(const char *path)
+{
+	FILE	   *infile;
+	int			maxlength = 0,
+				linelen = 0;
+	int			nlines = 0;
+	int			nstored = 0;
+	char	  **result;
+	char	   *buffer;
+	int			c;
+
+	if ((infile = fopen(path, "r")) == NULL)
+		return NULL;
+
+	/* pass over the file twice - the first time to size the result */
+
+	while ((c = fgetc(infile)) != EOF)
+	{
+		linelen++;
+		if (c == '\n')
+		{
+			nlines++;
+			if (linelen > maxlength)
+				maxlength = linelen;
+			linelen = 0;
+		}
+	}
+
+	/* handle last line without a terminating newline (yuck) */
+	if (linelen)
+		nlines++;
+	if (linelen > maxlength)
+		maxlength = linelen;
+
+	/* set up the result and the line buffer */
+	result = (char **) pg_malloc((nlines + 1) * sizeof(char *));
+	buffer = (char *) pg_malloc(maxlength + 1);
+
+	/*
+	 * Now reprocess the file and store the lines.  Never store more lines
+	 * than the first pass counted: if the file grew or changed in between,
+	 * an unbounded loop would write past the end of result[].
+	 */
+	rewind(infile);
+	while (nstored < nlines &&
+		   fgets(buffer, maxlength + 1, infile) != NULL)
+		result[nstored++] = xstrdup(buffer);
+
+	fclose(infile);
+	free(buffer);
+	result[nstored] = NULL;
+
+	return result;
+}
+
+
+
+/*
+ * start/test/stop routines
+ */
+
+/*
+ * Launch the selected executable (gtm_app, optionally prefixed with the
+ * directory given by -p) in the background through the shell.
+ *
+ * Standard input is taken from DEVNULL; when a log file was given, output
+ * is appended to it.
+ *
+ * Returns the result of system().  A command line that does not fit the
+ * local buffers is reported and terminates the program.
+ */
+static int
+start_gtm(void)
+{
+	char		cmd[MAXPGPATH];
+	char		gtm_exec[MAXPGPATH];
+	/* neither option string is guaranteed to be set; never hand NULL to %s */
+	const char *data_opt = (gtmdata_opt != NULL) ? gtmdata_opt : "";
+	const char *extra_opts = (gtm_opts != NULL) ? gtm_opts : "";
+	int			len;
+
+	/*
+	 * Build the executable name in a local buffer.  gtm_path was allocated
+	 * with exactly the size of the -p argument, so nothing may be appended
+	 * to it in place.
+	 */
+	if (gtm_path != NULL)
+		len = snprintf(gtm_exec, sizeof(gtm_exec), "%s/%s", gtm_path, gtm_app);
+	else
+		len = snprintf(gtm_exec, sizeof(gtm_exec), "%s", gtm_app);
+
+	/*
+	 * Treat a completely filled buffer as truncation too, so the check
+	 * holds whether snprintf reports the needed or the stored length.
+	 */
+	if (len < 0 || len >= (int) sizeof(gtm_exec) - 1)
+	{
+		write_stderr(_("%s: path to executable \"%s\" is too long\n"),
+					 progname, gtm_app);
+		exit(1);
+	}
+
+	/*
+	 * Since there might be quotes to handle here, it is easier simply to pass
+	 * everything to a shell to process them.
+	 */
+	if (log_file != NULL)
+		len = snprintf(cmd, sizeof(cmd), SYSTEMQUOTE "\"%s\" %s%s < \"%s\" >> \"%s\" 2>&1 &" SYSTEMQUOTE,
+					   gtm_exec, data_opt, extra_opts,
+					   DEVNULL, log_file);
+	else
+		len = snprintf(cmd, sizeof(cmd), SYSTEMQUOTE "\"%s\" %s%s < \"%s\" 2>&1 &" SYSTEMQUOTE,
+					   gtm_exec, data_opt, extra_opts, DEVNULL);
+
+	/* a truncated command must not be handed to the shell */
+	if (len < 0 || len >= (int) sizeof(cmd) - 1)
+	{
+		write_stderr(_("%s: command line to start \"%s\" is too long\n"),
+					 progname, gtm_app);
+		exit(1);
+	}
+
+	return system(cmd);
+}
+
+
+
+/*
+ * Find the port in gtm_opts and poll until a connection to the server on
+ * localhost succeeds.
+ *
+ * One attempt is made per second, at most wait_seconds times; a dot is
+ * printed through print_msg() after every failed attempt.
+ *
+ * Returns true as soon as one attempt reports CONNECTION_OK, false if
+ * none of the attempts did.
+ */
+static bool
+test_gtm_connection()
+{
+	GTM_Conn	   *conn;
+	bool		success = false;
+	int			i;
+	char		portstr[32];
+	char	   *p;
+	char	   *q;
+	char		connstr[128];	/* Should be way more than enough! */
+
+	/* stays empty if no -p switch is found below */
+	*portstr = '\0';
+
+	/*
+	 * Look in gtm_opts for a -p switch.
+	 *
+	 * This parsing code is not amazingly bright; it could for instance
+	 * get fooled if ' -p' occurs within a quoted argument value.  Given
+	 * that few people pass complicated settings in gtm_opts, it's
+	 * probably good enough.
+	 */
+	for (p = gtm_opts; *p;)
+	{
+		/* advance past whitespace */
+		while (isspace((unsigned char) *p))
+			p++;
+
+		if (strncmp(p, "-p", 2) == 0)
+		{
+			p += 2;
+			/* advance past any whitespace/quoting */
+			while (isspace((unsigned char) *p) || *p == '\'' || *p == '"')
+				p++;
+			/* find end of value (not including any ending quote!) */
+			q = p;
+			while (*q &&
+				   !(isspace((unsigned char) *q) || *q == '\'' || *q == '"'))
+				q++;
+			/* and save the argument value, truncated to fit portstr */
+			strlcpy(portstr, p, Min((q - p) + 1, sizeof(portstr)));
+			/* keep looking, maybe there is another -p */
+			p = q;
+		}
+		/* Advance to next whitespace */
+		while (*p && !isspace((unsigned char) *p))
+			p++;
+	}
+
+	/*
+	 * We need to set a connect timeout otherwise on Windows the SCM will
+	 * probably timeout first
+	 *
+	 * NOTE(review): when gtm_opts carries no -p switch, portstr is empty
+	 * and the string reads "port= connect_timeout=5" -- confirm how
+	 * PQconnectGTM treats an empty port value.
+	 */
+	snprintf(connstr, sizeof(connstr),
+			 "host=localhost port=%s connect_timeout=5", portstr);
+
+	for (i = 0; i < wait_seconds; i++)
+	{
+		if ((conn = PQconnectGTM(connstr)) != NULL &&
+			(GTMPQstatus(conn) == CONNECTION_OK))
+		{
+			GTMPQfinish(conn);
+			success = true;
+			break;
+		}
+		else
+		{
+			/*
+			 * NOTE(review): conn may be NULL on this path; presumably
+			 * GTMPQfinish tolerates that -- confirm.
+			 */
+			GTMPQfinish(conn);
+			print_msg(".");
+			sleep(1); /* 1 sec */
+		}
+	}
+
+	return success;
+}
+
+/*
+ * Make sure gtm_opts is set.
+ *
+ * If no options were supplied on the command line, gtm_opts defaults to
+ * the empty string; for a restart it is instead taken from the single
+ * line stored in gtmopts_file.  A missing option file, or one that does
+ * not hold exactly one line, is reported and terminates the program.
+ */
+static void
+read_gtm_opts(void)
+{
+	if (gtm_opts == NULL)
+	{
+		gtm_opts = "";		/* default */
+		if (ctl_command == RESTART_COMMAND)
+		{
+			char	  **optlines;
+
+			optlines = readfile(gtmopts_file);
+			if (optlines == NULL)
+			{
+				write_stderr(_("%s: could not read file \"%s\"\n"), progname, gtmopts_file);
+				exit(1);
+			}
+			else if (optlines[0] == NULL || optlines[1] != NULL)
+			{
+				write_stderr(_("%s: option file \"%s\" must have exactly one line\n"),
+							 progname, gtmopts_file);
+				exit(1);
+			}
+			else
+			{
+				int			len;
+				char	   *optline;
+
+				optline = optlines[0];
+				/* trim off line endings */
+				len = strcspn(optline, "\r\n");
+				optline[len] = '\0';
+
+				/*
+				 * Use the trimmed line as the option string (this used to
+				 * assign a pointer that was never initialized).
+				 *
+				 * NOTE(review): if the server writes its own program path
+				 * at the start of this file, that prefix must be skipped
+				 * here -- confirm against the code that writes the file.
+				 */
+				gtm_opts = optline;
+			}
+		}
+	}
+}
+
+/*
+ * Start the server.
+ *
+ * Unless this is part of a restart, an existing PID file only produces a
+ * warning.  After launching, a PID file that still holds the old PID is
+ * taken as a failed start.  With do_wait set, the function then polls
+ * for a working connection before reporting success.
+ */
+static void
+do_start(void)
+{
+	pgpid_t		prev_pid = 0;
+	int			rc;
+
+	/* on a plain start, look for traces of an already running server */
+	if (ctl_command != RESTART_COMMAND)
+	{
+		prev_pid = get_pgpid();
+		if (prev_pid != 0)
+			write_stderr(_("%s: another server might be running; "
+						   "trying to start server anyway\n"),
+						 progname);
+	}
+
+	read_gtm_opts();
+
+	rc = start_gtm();
+	if (rc != 0)
+	{
+		write_stderr(_("%s: could not start server: exit code was %d\n"),
+					 progname, rc);
+		exit(1);
+	}
+
+	if (prev_pid != 0)
+	{
+		/* give the new server a moment to replace the PID file */
+		sleep(1);
+		if (get_pgpid() == prev_pid)
+		{
+			write_stderr(_("%s: could not start server\n"
+						   "Examine the log output.\n"),
+						 progname);
+			exit(1);
+		}
+	}
+
+	if (!do_wait)
+	{
+		print_msg(_("server starting\n"));
+		return;
+	}
+
+	print_msg(_("waiting for server to start..."));
+
+	if (!test_gtm_connection())
+	{
+		printf(_("could not start server\n"));
+		exit(1);
+	}
+
+	print_msg(_(" done\n"));
+	print_msg(_("server started\n"));
+}
+
+
+/*
+ * Stop the server by sending it the signal selected in sig.
+ *
+ * With do_wait set, poll once per second (at most wait_seconds times)
+ * until the PID file is gone; a PID file that outlives the wait is
+ * reported as a failure.
+ */
+static void
+do_stop(void)
+{
+	int			elapsed;
+	pgpid_t		pid = get_pgpid();
+
+	if (pid == 0)				/* no pid file */
+	{
+		write_stderr(_("%s: PID file \"%s\" does not exist\n"), progname, pid_file);
+		write_stderr(_("Is server running?\n"));
+		exit(1);
+	}
+
+	if (pid < 0)				/* standalone backend, not gtm */
+	{
+		pid = -pid;
+		write_stderr(_("%s: cannot stop server; "
+					   "single-user server is running (PID: %ld)\n"),
+					 progname, pid);
+		exit(1);
+	}
+
+	if (kill((pid_t) pid, sig) != 0)
+	{
+		write_stderr(_("%s: could not send stop signal (PID: %ld): %s\n"), progname, pid,
+					 strerror(errno));
+		exit(1);
+	}
+
+	if (!do_wait)
+	{
+		print_msg(_("server shutting down\n"));
+		return;
+	}
+
+	print_msg(_("waiting for server to shut down..."));
+
+	/* the server is considered gone once its PID file has disappeared */
+	for (elapsed = 0; elapsed < wait_seconds; elapsed++)
+	{
+		pid = get_pgpid();
+		if (pid == 0)
+			break;
+
+		print_msg(".");
+		sleep(1);				/* 1 sec */
+	}
+
+	if (pid != 0)				/* pid file still exists */
+	{
+		print_msg(_(" failed\n"));
+
+		write_stderr(_("%s: server does not shut down\n"), progname);
+		exit(1);
+	}
+
+	print_msg(_(" done\n"));
+
+	printf(_("server stopped\n"));
+}
+
+
+/*
+ *	restart/reload routines
+ */
+
+/*
+ * Restart the server: stop a running instance, always waiting for its
+ * PID file to disappear, then hand over to do_start().
+ *
+ * A missing PID file or a process that is already gone does not stop
+ * the start from being attempted.
+ */
+static void
+do_restart(void)
+{
+	int			elapsed;
+	pgpid_t		pid = get_pgpid();
+
+	if (pid == 0)				/* no pid file */
+	{
+		write_stderr(_("%s: PID file \"%s\" does not exist\n"),
+					 progname, pid_file);
+		write_stderr(_("Is server running?\n"));
+		write_stderr(_("starting server anyway\n"));
+		do_start();
+		return;
+	}
+
+	if (pid < 0)				/* standalone backend, not gtm */
+	{
+		pid = -pid;
+		if (gtm_is_alive((pid_t) pid))
+		{
+			write_stderr(_("%s: cannot restart server; "
+						   "single-user server is running (PID: %ld)\n"),
+						 progname, pid);
+			write_stderr(_("Please terminate the single-user server and try again.\n"));
+			exit(1);
+		}
+	}
+
+	if (!gtm_is_alive((pid_t) pid))
+	{
+		/* nothing left to stop, go straight to the start */
+		write_stderr(_("%s: old server process (PID: %ld) seems to be gone\n"),
+					 progname, pid);
+		write_stderr(_("starting server anyway\n"));
+		do_start();
+		return;
+	}
+
+	if (kill((pid_t) pid, sig) != 0)
+	{
+		write_stderr(_("%s: could not send stop signal (PID: %ld): %s\n"), progname, pid,
+					 strerror(errno));
+		exit(1);
+	}
+
+	print_msg(_("waiting for server to shut down..."));
+
+	/* always wait for restart */
+	for (elapsed = 0; elapsed < wait_seconds; elapsed++)
+	{
+		pid = get_pgpid();
+		if (pid == 0)
+			break;
+
+		print_msg(".");
+		sleep(1);				/* 1 sec */
+	}
+
+	if (pid != 0)				/* pid file still exists */
+	{
+		print_msg(_(" failed\n"));
+
+		write_stderr(_("%s: server does not shut down\n"), progname);
+		exit(1);
+	}
+
+	print_msg(_(" done\n"));
+	printf(_("server stopped\n"));
+
+	do_start();
+}
+
+
+/*
+ *	utility routines
+ */
+
+/*
+ * Report whether a process with the given PID exists and can be
+ * signalled by us.
+ */
+static bool
+gtm_is_alive(pid_t pid)
+{
+	/*
+	 * Our own PID or the PID of our parent shell is never the gtm.
+	 * (Windows hasn't got getppid(), though.)
+	 */
+	if (pid == getpid())
+		return false;
+#ifndef WIN32
+	if (pid == getppid())
+		return false;
+#endif
+
+	/*
+	 * Probe the process with signal 0.  Any failure counts as "not there",
+	 * EPERM included: EPERM must mean that the given PID belongs to some
+	 * other userid, and considering the permissions on $GTMDATA, that
+	 * means it's not the gtm we are after.
+	 */
+	return kill(pid, 0) == 0;
+}
+
+/*
+ * Point the user at --help; written to stderr.
+ */
+static void
+do_advice(void)
+{
+	write_stderr(_("Try \"%s --help\" for more information.\n"),
+				 progname);
+}
+
+
+/*
+ * Print the usage summary to stdout.
+ */
+static void
+do_help(void)
+{
+	printf(_("%s is a utility to start, stop or restart,\n"
+			 "a GTM server or GTM proxy.\n\n"), progname);
+	printf(_("Usage:\n"));
+	printf(_("  %s start   -S STARTUP_MODE [-w] [-t SECS] [-D DATADIR] [-s] [-l FILENAME] [-o \"OPTIONS\"]\n"), progname);
+	printf(_("  %s stop    -S STARTUP_MODE [-W] [-t SECS] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n"), progname);
+	printf(_("  %s restart -S STARTUP_MODE [-w] [-t SECS] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n"
+		 "                 [-o \"OPTIONS\"]\n"), progname);
+
+	printf(_("\nCommon options:\n"));
+	printf(_("  -D DATADIR             location of the database storage area\n"));
+	printf(_("  -S                     set gtm or gtm_proxy to launch one of them\n"));
+	/* this line used to carry a stray comma and tabs that broke the columns */
+	printf(_("  -s                     only print errors, no informational messages\n"));
+	printf(_("  -t SECS                seconds to wait when using -w option\n"));
+	printf(_("  -w                     wait until operation completes\n"));
+	printf(_("  -W                     do not wait until operation completes\n"));
+	printf(_("  --help                 show this help, then exit\n"));
+	printf(_("(The default is to wait for shutdown, but not for start or restart.)\n\n"));
+
+	printf(_("\nOptions for start or restart:\n"));
+	printf(_("  -S STARTUP-MODE        can be \"gtm\" or \"gtm_proxy\"\n"));
+	printf(_("  -l FILENAME            write (or append) server log to FILENAME\n"));
+	printf(_("  -o OPTIONS             command line options to pass to gtm\n"
+			 "                         (GTM server executable)\n"));
+	printf(_("  -p PATH-TO-GTM/PROXY   path to gtm/gtm_proxy executables\n"));
+	printf(_("\nOptions for stop or restart:\n"));
+	printf(_("  -m SHUTDOWN-MODE   can be \"smart\", \"fast\", or \"immediate\"\n"));
+
+	printf(_("\nShutdown modes are:\n"));
+	printf(_("  smart       quit after all clients have disconnected\n"));
+	printf(_("  fast        quit directly, with proper shutdown\n"));
+	printf(_("  immediate   quit without complete shutdown; will lead to recovery on restart\n"));
+}
+
+
+/*
+ * Translate the argument of -m into shutdown_mode and the signal to send.
+ *
+ * Accepts "smart", "fast" and "immediate" as well as their first letters;
+ * anything else is reported and terminates the program.
+ */
+static void
+set_mode(char *modeopt)
+{
+	static const struct
+	{
+		const char *abbrev;
+		const char *name;
+		ShutdownMode mode;
+		int			signo;
+	}			known_modes[] =
+	{
+		{"s", "smart", SMART_MODE, SIGTERM},
+		{"f", "fast", FAST_MODE, SIGINT},
+		{"i", "immediate", IMMEDIATE_MODE, SIGQUIT}
+	};
+	size_t		i;
+
+	for (i = 0; i < sizeof(known_modes) / sizeof(known_modes[0]); i++)
+	{
+		if (strcmp(modeopt, known_modes[i].abbrev) == 0 ||
+			strcmp(modeopt, known_modes[i].name) == 0)
+		{
+			shutdown_mode = known_modes[i].mode;
+			sig = known_modes[i].signo;
+			return;
+		}
+	}
+
+	write_stderr(_("%s: unrecognized shutdown mode \"%s\"\n"), progname, modeopt);
+	do_advice();
+	exit(1);
+}
+
+/*
+ * Entry point: parse the command line, work out the PID and option file
+ * names for the selected executable, and run the requested action.
+ *
+ * Exits with status 0 on success and 1 on any error.
+ */
+int
+main(int argc, char **argv)
+{
+	int			c;
+
+	progname = "gtm_ctl";
+
+	/*
+	 * save argv[0] so do_start() can look for the gtm if necessary. we
+	 * don't look for gtm here because in many cases we won't need it.
+	 */
+	argv0 = argv[0];
+
+	umask(077);
+
+	/* support --help even if invoked as root */
+	if (argc > 1)
+	{
+		if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0 ||
+			strcmp(argv[1], "-?") == 0)
+		{
+			do_help();
+			exit(0);
+		}
+	}
+
+	/*
+	 * Disallow running as root, to forestall any possible security holes.
+	 */
+	if (geteuid() == 0)
+	{
+		write_stderr(_("%s: cannot be run as root\n"
+					   "Please log in (using, e.g., \"su\") as the "
+					   "(unprivileged) user that will\n"
+					   "own the server process.\n"),
+					 progname);
+		exit(1);
+	}
+
+	/*
+	 * 'Action' can be before or after args so loop over both. Some
+	 * getopt() implementations will reorder argv[] to place all flags
+	 * first (GNU?), but we don't rely on it.
+	 */
+	optind = 1;
+
+	/* process command-line options */
+	while (optind < argc)
+	{
+		/* "s" is in the list so that the documented -s switch is accepted */
+		while ((c = getopt(argc, argv, "D:l:m:o:p:sS:t:wW")) != -1)
+		{
+			switch (c)
+			{
+				case 'D':
+					{
+						char	   *gtmdata_D;
+						char	   *env_var = pg_malloc(strlen(optarg) + 9);
+
+						gtmdata_D = xstrdup(optarg);
+						canonicalize_path(gtmdata_D);
+						snprintf(env_var, strlen(optarg) + 9, "GTMDATA=%s",
+								 gtmdata_D);
+						putenv(env_var);
+
+						/*
+						 * We could pass GTMDATA just in an environment
+						 * variable but we do -D too for clearer gtm
+						 * 'ps' display
+						 */
+						gtmdata_opt = pg_malloc(strlen(gtmdata_D) + 8);
+						snprintf(gtmdata_opt, strlen(gtmdata_D) + 8,
+								 "-D \"%s\" ",
+								 gtmdata_D);
+						break;
+					}
+				case 'l':
+					log_file = xstrdup(optarg);
+					break;
+				case 'm':
+					set_mode(optarg);
+					break;
+				case 'o':
+					gtm_opts = xstrdup(optarg);
+					break;
+				case 'p':
+					gtm_path = xstrdup(optarg);
+					canonicalize_path(gtm_path);
+					break;
+				case 's':
+					silent_mode = true;
+					break;
+				case 'S':
+					gtm_app = xstrdup(optarg);
+					if (strcmp(gtm_app,"gtm_proxy") != 0
+						&& strcmp(gtm_app,"gtm") != 0)
+					{
+						write_stderr(_("%s: %s launch name set not correct\n"), progname, gtm_app);
+						do_advice();
+						exit(1);
+					}
+					break;
+				case 't':
+					wait_seconds = atoi(optarg);
+					break;
+				case 'w':
+					do_wait = true;
+					wait_set = true;
+					break;
+				case 'W':
+					do_wait = false;
+					wait_set = true;
+					break;
+				default:
+					/* getopt already issued a suitable error message */
+					do_advice();
+					exit(1);
+			}
+		}
+
+		/* Process an action */
+		if (optind < argc)
+		{
+			if (ctl_command != NO_COMMAND)
+			{
+				write_stderr(_("%s: too many command-line arguments (first is \"%s\")\n"), progname, argv[optind]);
+				do_advice();
+				exit(1);
+			}
+
+			if (strcmp(argv[optind], "start") == 0)
+				ctl_command = START_COMMAND;
+			else if (strcmp(argv[optind], "stop") == 0)
+				ctl_command = STOP_COMMAND;
+			else if (strcmp(argv[optind], "restart") == 0)
+				ctl_command = RESTART_COMMAND;
+			else
+			{
+				write_stderr(_("%s: unrecognized operation mode \"%s\"\n"), progname, argv[optind]);
+				do_advice();
+				exit(1);
+			}
+			optind++;
+		}
+	}
+
+	if (ctl_command == NO_COMMAND)
+	{
+		write_stderr(_("%s: no operation specified\n"), progname);
+		do_advice();
+		exit(1);
+	}
+
+	/* -D has exported GTMDATA above, so this covers both ways of setting it */
+	gtm_data = getenv("GTMDATA");
+
+	if (gtm_data)
+	{
+		gtm_data = xstrdup(gtm_data);
+		canonicalize_path(gtm_data);
+	}
+
+	if (!gtm_data)
+	{
+		write_stderr("%s: no database directory specified \n",
+					 progname);
+		do_advice();
+		exit(1);
+	}
+
+	/*
+	 * Without -D the directory is known from the environment only; make
+	 * sure the command-line fragment is an empty string rather than NULL
+	 * before it gets formatted into the server command.
+	 */
+	if (gtmdata_opt == NULL)
+		gtmdata_opt = "";
+
+	/*
+	 * pid files of gtm and gtm proxy are named differently
+	 * -S option has also to be set for STOP_COMMAND
+	 * or gtm_ctl will not be able to find the correct pid_file
+	 */
+	if (!gtm_app)
+	{
+		write_stderr("%s: launcher name non specified, see option -S\n",
+					 progname);
+		do_advice();
+		exit(1);
+	}
+
+	/* apply the per-command default unless -w or -W was given */
+	if (!wait_set)
+	{
+		switch (ctl_command)
+		{
+			case RESTART_COMMAND:
+			case START_COMMAND:
+				do_wait = false;
+				break;
+			case STOP_COMMAND:
+				do_wait = true;
+				break;
+			default:
+				break;
+		}
+	}
+
+	/* gtm_data is known to be set at this point */
+	if (strcmp(gtm_app,"gtm_proxy") == 0)
+	{
+		snprintf(pid_file, MAXPGPATH, "%s/gtm_proxy.pid", gtm_data);
+		snprintf(gtmopts_file, MAXPGPATH, "%s/gtm_proxy.opts", gtm_data);
+	}
+	else if (strcmp(gtm_app,"gtm") == 0)
+	{
+		snprintf(pid_file, MAXPGPATH, "%s/gtm.pid", gtm_data);
+		snprintf(gtmopts_file, MAXPGPATH, "%s/gtm.opts", gtm_data);
+	}
+
+	switch (ctl_command)
+	{
+		case START_COMMAND:
+			do_start();
+			break;
+		case STOP_COMMAND:
+			do_stop();
+			break;
+		case RESTART_COMMAND:
+			do_restart();
+			break;
+		default:
+			break;
+	}
+
+	exit(0);
+}
diff --git a/src/gtm/libpq/Makefile b/src/gtm/libpq/Makefile
new file mode 100644
index 0000000000..9036ba8547
--- /dev/null
+++ b/src/gtm/libpq/Makefile
@@ -0,0 +1,22 @@
+# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+#
+# Builds the pqcomm library; the library rules themselves come from
+# Makefile.shlib, driven by NAME, the SO_* version numbers and OBJS.
+
+top_build_dir=../..
+include $(top_build_dir)/gtm/Makefile.global
+
+NAME=pqcomm
+SO_MAJOR_VERSION= 1
+SO_MINOR_VERSION= 0
+
+OBJS=ip.o  pqcomm.o  pqformat.o strlcpy.o pqsignal.o
+
+# None of these names is a file produced by this Makefile.
+.PHONY: all clean distclean maintainer-clean
+
+all:all-lib
+
+include $(top_build_dir)/Makefile.shlib
+
+# Remove the objects and every flavour of the library, including the
+# static archive that other directories link against.
+clean:
+	rm -f $(OBJS)
+	rm -f libpqcomm.a
+	rm -f libpqcomm.so libpqcomm.so.1 libpqcomm.so.1.0
+
+distclean: clean
+
+maintainer-clean: distclean
diff --git a/src/gtm/libpq/ip.c b/src/gtm/libpq/ip.c
new file mode 100644
index 0000000000..561161410d
--- /dev/null
+++ b/src/gtm/libpq/ip.c
@@ -0,0 +1,324 @@
+/*-------------------------------------------------------------------------
+ *
+ * ip.c
+ *	  IPv6-aware network access.
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/src/backend/libpq/ip.c,v 1.43 2009/01/01 17:23:42 momjian Exp $
+ *
+ * This file and the IPV6 implementation were initially provided by
+ * Nigel Kukard <[email protected]>, Linux Based Systems Design
+ * http://www.lbsd.net.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/* This is intended to be used in both frontend and backend, so use c.h */
+#include "gtm/gtm_c.h"
+
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#ifdef HAVE_NETINET_TCP_H
+#include <netinet/tcp.h>
+#endif
+#include <arpa/inet.h>
+#include <sys/file.h>
+
+#include "gtm/ip.h"
+
+
+static int range_sockaddr_AF_INET(const struct sockaddr_in * addr,
+					   const struct sockaddr_in * netaddr,
+					   const struct sockaddr_in * netmask);
+
+#ifdef HAVE_IPV6
+static int range_sockaddr_AF_INET6(const struct sockaddr_in6 * addr,
+						const struct sockaddr_in6 * netaddr,
+						const struct sockaddr_in6 * netmask);
+#endif
+
+
+/*
+ *	pg_getaddrinfo_all - look up address info through getaddrinfo()
+ *
+ * *result is always cleared first; an absent or empty hostname is passed
+ * down as NULL.  Returns whatever getaddrinfo() returns.
+ */
+int
+pg_getaddrinfo_all(const char *hostname, const char *servname,
+				   const struct addrinfo * hintp, struct addrinfo ** result)
+{
+	const char *node = hostname;
+
+	/* some getaddrinfo() versions leave *result untouched on failure */
+	*result = NULL;
+
+	/* NULL has special meaning to getaddrinfo(); use it for "" as well */
+	if (node != NULL && node[0] == '\0')
+		node = NULL;
+
+	return getaddrinfo(node, servname, hintp, result);
+}
+
+
+/*
+ *	pg_freeaddrinfo_all - release a list obtained from pg_getaddrinfo_all
+ *
+ * hint_ai_family is accepted but not looked at here; the list is handed
+ * to freeaddrinfo().  A NULL list is ignored.
+ */
+void
+pg_freeaddrinfo_all(int hint_ai_family, struct addrinfo * ai)
+{
+	if (ai == NULL)
+		return;
+
+	/* struct was built by getaddrinfo() */
+	freeaddrinfo(ai);
+}
+
+
+/*
+ *	pg_getnameinfo_all - get name info through getnameinfo()
+ *
+ * Differs from plain getnameinfo() in two respects: the address is taken
+ * as a sockaddr_storage instead of a struct sockaddr, and when the lookup
+ * fails the node and service buffers (where supplied) are filled with
+ * "???" so that they always hold something.
+ *
+ * Returns the result of getnameinfo().
+ */
+int
+pg_getnameinfo_all(const struct sockaddr_storage * addr, int salen,
+				   char *node, int nodelen,
+				   char *service, int servicelen,
+				   int flags)
+{
+	int			status = getnameinfo((const struct sockaddr *) addr, salen,
+									 node, nodelen,
+									 service, servicelen,
+									 flags);
+
+	if (status == 0)
+		return 0;
+
+	/* lookup failed: leave placeholders in the output buffers */
+	if (node)
+		strlcpy(node, "???", nodelen);
+	if (service)
+		strlcpy(service, "???", servicelen);
+
+	return status;
+}
+
+/*
+ * pg_range_sockaddr - is addr within the subnet specified by netaddr/netmask ?
+ *
+ * Dispatches on the family of addr.  Returns 1 if addr is inside the
+ * subnet, 0 if it is not or if the family is not handled.
+ *
+ * Note: caller must already have verified that all three addresses are
+ * in the same address family; and AF_UNIX addresses are not supported.
+ */
+int
+pg_range_sockaddr(const struct sockaddr_storage * addr,
+				  const struct sockaddr_storage * netaddr,
+				  const struct sockaddr_storage * netmask)
+{
+	switch (addr->ss_family)
+	{
+		case AF_INET:
+			return range_sockaddr_AF_INET((const struct sockaddr_in *) addr,
+										  (const struct sockaddr_in *) netaddr,
+										  (const struct sockaddr_in *) netmask);
+#ifdef HAVE_IPV6
+		case AF_INET6:
+			return range_sockaddr_AF_INET6((const struct sockaddr_in6 *) addr,
+										   (const struct sockaddr_in6 *) netaddr,
+										   (const struct sockaddr_in6 *) netmask);
+#endif
+		default:
+			return 0;
+	}
+}
+
+/*
+ * IPv4 subnet test: returns 1 when addr and netaddr agree on every bit
+ * selected by netmask, 0 otherwise.
+ */
+static int
+range_sockaddr_AF_INET(const struct sockaddr_in * addr,
+					   const struct sockaddr_in * netaddr,
+					   const struct sockaddr_in * netmask)
+{
+	/* bits in which the two addresses differ */
+	in_addr_t	differing = addr->sin_addr.s_addr ^ netaddr->sin_addr.s_addr;
+
+	return ((differing & netmask->sin_addr.s_addr) == 0) ? 1 : 0;
+}
+
+
+#ifdef HAVE_IPV6
+
+/*
+ * IPv6 subnet test: returns 1 when addr and netaddr agree on every bit
+ * selected by netmask, 0 otherwise.
+ */
+static int
+range_sockaddr_AF_INET6(const struct sockaddr_in6 * addr,
+						const struct sockaddr_in6 * netaddr,
+						const struct sockaddr_in6 * netmask)
+{
+	const unsigned char *a = addr->sin6_addr.s6_addr;
+	const unsigned char *n = netaddr->sin6_addr.s6_addr;
+	const unsigned char *m = netmask->sin6_addr.s6_addr;
+	int			byteno = 0;
+
+	/* compare the 16 address bytes one at a time under the mask */
+	while (byteno < 16)
+	{
+		if (((a[byteno] ^ n[byteno]) & m[byteno]) != 0)
+			return 0;
+		byteno++;
+	}
+
+	return 1;
+}
+#endif   /* HAVE_IPV6 */
+
+/*
+ *	pg_sockaddr_cidr_mask - make a network mask of the appropriate family
+ *	  and required number of significant bits
+ *
+ * mask:	receives the resulting mask; it had better be big enough.
+ * numbits: decimal prefix length; must be a complete number within
+ *			0..32 for AF_INET or 0..128 for AF_INET6.
+ * family:	AF_INET, or AF_INET6 when built with HAVE_IPV6.
+ *
+ * Return value is 0 if okay, -1 if not.
+ */
+int
+pg_sockaddr_cidr_mask(struct sockaddr_storage * mask, char *numbits, int family)
+{
+	long		bits;
+	char	   *endptr;
+
+	bits = strtol(numbits, &endptr, 10);
+
+	/* reject an empty string and trailing garbage */
+	if (*numbits == '\0' || *endptr != '\0')
+		return -1;
+
+	switch (family)
+	{
+		case AF_INET:
+			{
+				struct sockaddr_in mask4;
+				unsigned long maskl;
+
+				if (bits < 0 || bits > 32)
+					return -1;
+
+				/*
+				 * Start from an all-zero struct so that no uninitialized
+				 * stack bytes are copied into the caller's storage.
+				 */
+				memset(&mask4, 0, sizeof(mask4));
+
+				/* avoid "x << 32", which is not portable */
+				if (bits > 0)
+					maskl = (0xffffffffUL << (32 - (int) bits))
+						& 0xffffffffUL;
+				else
+					maskl = 0;
+				mask4.sin_addr.s_addr = htonl(maskl);
+				memcpy(mask, &mask4, sizeof(mask4));
+				break;
+			}
+
+#ifdef HAVE_IPV6
+		case AF_INET6:
+			{
+				struct sockaddr_in6 mask6;
+				int			i;
+
+				if (bits < 0 || bits > 128)
+					return -1;
+
+				/* as above: no uninitialized bytes in the result */
+				memset(&mask6, 0, sizeof(mask6));
+
+				/* fill the address one byte at a time, high-order first */
+				for (i = 0; i < 16; i++)
+				{
+					if (bits <= 0)
+						mask6.sin6_addr.s6_addr[i] = 0;
+					else if (bits >= 8)
+						mask6.sin6_addr.s6_addr[i] = 0xff;
+					else
+					{
+						mask6.sin6_addr.s6_addr[i] =
+							(0xff << (8 - (int) bits)) & 0xff;
+					}
+					bits -= 8;
+				}
+				memcpy(mask, &mask6, sizeof(mask6));
+				break;
+			}
+#endif
+		default:
+			return -1;
+	}
+
+	mask->ss_family = family;
+	return 0;
+}
+
+
+#ifdef HAVE_IPV6
+
+/*
+ * pg_promote_v4_to_v6_addr --- convert an AF_INET addr to AF_INET6, using
+ *		the standard convention for IPv4 addresses mapped into IPv6 world
+ *
+ * The result has bytes 10 and 11 of the IPv6 address set to 0xff, the
+ * four IPv4 address bytes in positions 12..15 and everything else zero.
+ *
+ * The passed addr is modified in place; be sure it is large enough to
+ * hold the result!  Note that we only worry about setting the fields
+ * that pg_range_sockaddr will look at.
+ */
+void
+pg_promote_v4_to_v6_addr(struct sockaddr_storage * addr)
+{
+	struct sockaddr_in v4;
+	struct sockaddr_in6 v6;
+
+	memcpy(&v4, addr, sizeof(v4));
+
+	memset(&v6, 0, sizeof(v6));
+	v6.sin6_family = AF_INET6;
+
+	/* the two 0xff bytes that mark a mapped address */
+	memset(&v6.sin6_addr.s6_addr[10], 0xff, 2);
+
+	/* s_addr is kept in network byte order, so its bytes copy over as-is */
+	memcpy(&v6.sin6_addr.s6_addr[12], &v4.sin_addr.s_addr,
+		   sizeof(v4.sin_addr.s_addr));
+
+	memcpy(addr, &v6, sizeof(v6));
+}
+
+/*
+ * pg_promote_v4_to_v6_mask --- convert an AF_INET netmask to AF_INET6, using
+ *		the standard convention for IPv4 addresses mapped into IPv6 world
+ *
+ * This must be different from pg_promote_v4_to_v6_addr because we want to
+ * set the high-order bits to 1's not 0's: the first twelve bytes of the
+ * result are 0xff, followed by the four IPv4 mask bytes.
+ *
+ * The passed addr is modified in place; be sure it is large enough to
+ * hold the result!  Note that we only worry about setting the fields
+ * that pg_range_sockaddr will look at.
+ */
+void
+pg_promote_v4_to_v6_mask(struct sockaddr_storage * addr)
+{
+	struct sockaddr_in v4;
+	struct sockaddr_in6 v6;
+
+	memcpy(&v4, addr, sizeof(v4));
+
+	memset(&v6, 0, sizeof(v6));
+	v6.sin6_family = AF_INET6;
+
+	/* all high-order bits are significant */
+	memset(&v6.sin6_addr.s6_addr[0], 0xff, 12);
+
+	/* s_addr is kept in network byte order, so its bytes copy over as-is */
+	memcpy(&v6.sin6_addr.s6_addr[12], &v4.sin_addr.s_addr,
+		   sizeof(v4.sin_addr.s_addr));
+
+	memcpy(addr, &v6, sizeof(v6));
+}
+
+#endif   /* HAVE_IPV6 */
diff --git a/src/gtm/libpq/pqcomm.c b/src/gtm/libpq/pqcomm.c
new file mode 100644
index 0000000000..e697a7f4b1
--- /dev/null
+++ b/src/gtm/libpq/pqcomm.c
@@ -0,0 +1,1130 @@
+/*-------------------------------------------------------------------------
+ *
+ * pqcomm.c
+ *	  Communication functions between the Frontend and the Backend
+ *
+ * These routines handle the low-level details of communication between
+ * frontend and backend.  They just shove data across the communication
+ * channel, and are ignorant of the semantics of the data --- or would be,
+ * except for major brain damage in the design of the old COPY OUT protocol.
+ * Unfortunately, COPY OUT was designed to commandeer the communication
+ * channel (it just transfers data without wrapping it into messages).
+ * No other messages can be sent while COPY OUT is in progress; and if the
+ * copy is aborted by an ereport(ERROR), we need to close out the copy so that
+ * the frontend gets back into sync.  Therefore, these routines have to be
+ * aware of COPY OUT state.  (New COPY-OUT is message-based and does *not*
+ * set the DoingCopyOut flag.)
+ *
+ * NOTE: generally, it's a bad idea to emit outgoing messages directly with
+ * pq_putbytes(), especially if the message would require multiple calls
+ * to send.  Instead, use the routines in pqformat.c to construct the message
+ * in a buffer and then emit it in one call to pq_putmessage.  This ensures
+ * that the channel will not be clogged by an incomplete message if execution
+ * is aborted by ereport(ERROR) partway through the message.  The only
+ * non-libpq code that should call pq_putbytes directly is old-style COPY OUT.
+ *
+ * At one time, libpq was shared between frontend and backend, but now
+ * the backend's "backend/libpq" is quite separate from "interfaces/libpq".
+ * All that remains is similarities of names to trap the unwary...
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *	$PostgreSQL: pgsql/src/backend/libpq/pqcomm.c,v 1.198 2008/01/01 19:45:49 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*------------------------
+ * INTERFACE ROUTINES
+ *
+ * setup/teardown:
+ *		StreamServerPort	- Open postmaster's server port
+ *		StreamConnection	- Create new connection with client
+ *		StreamClose			- Close a client/backend connection
+ *		TouchSocketFile		- Protect socket file against /tmp cleaners
+ *		pq_init			- initialize libpq at backend startup (not in this file)
+ *		pq_comm_reset	- reset libpq during error recovery (not in this file)
+ *		pq_close		- shutdown libpq at backend exit (not in this file)
+ *
+ * low-level I/O:
+ *		pq_getbytes		- get a known number of bytes from connection
+ *		pq_getstring	- get a null terminated string from connection
+ *		pq_getmessage	- get a message with length word from connection
+ *		pq_getbyte		- get next byte from connection
+ *		pq_peekbyte		- peek at next byte from connection
+ *		pq_putbytes		- send bytes to connection (not flushed until pq_flush)
+ *		pq_flush		- flush pending output
+ *
+ * message-level I/O (and old-style-COPY-OUT cruft):
+ *		pq_putmessage	- send a normal message (never suppressed in this file)
+ *		pq_startcopyout - inform libpq that a COPY OUT transfer is beginning (not in this file)
+ *		pq_endcopyout	- end a COPY OUT transfer (not in this file)
+ *
+ *------------------------
+ */
+
+#include <signal.h>
+#include <fcntl.h>
+#include <grp.h>
+#include <unistd.h>
+#include <sys/file.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#ifdef HAVE_NETINET_TCP_H
+#include <netinet/tcp.h>
+#endif
+#include <arpa/inet.h>
+#ifdef HAVE_UTIME_H
+#include <utime.h>
+#endif
+
+#include "gtm/gtm_c.h"
+#include "gtm/ip.h"
+#include "gtm/libpq.h"
+#include "gtm/libpq-be.h"
+#include "gtm/elog.h"
+
+#define MAXGTMPATH	256
+
+/* Unix socket file path; nothing in this file assigns it, so it stays "" */
+static char sock_path[MAXGTMPATH];
+
+static int         tcp_keepalives_idle;		/* only read, never set here */
+static int         tcp_keepalives_interval;	/* only read, never set here */
+static int         tcp_keepalives_count;		/* only read, never set here */
+
+
+/*
+ * Receive/send buffers are per-connection fields of Port, not statics here
+ */
+
+/* Internal functions */
+static int	internal_putbytes(Port *myport, const char *s, size_t len);
+static int	internal_flush(Port *myport);
+
+/*
+ * Streams -- wrapper around Unix socket system calls
+ *
+ *
+ *		Stream functions are used for vanilla TCP connection protocol.
+ */
+
+
+/*
+ * StreamServerPort -- open "listening" sockets to accept connections.
+ *
+ * Resolves hostName/portNumber for the given address family and, for each
+ * resulting address, creates, binds and listens on a socket.  Each socket
+ * is stored in the first ListenSocket[] slot (of MaxListen) that holds -1.
+ *
+ * RETURNS: STATUS_OK if at least one socket was opened, else STATUS_ERROR
+ */
+int
+StreamServerPort(int family, char *hostName, unsigned short portNumber,
+				 int ListenSocket[], int MaxListen)
+{
+	int			fd,
+				err;
+	int			maxconn;
+	int			ret;
+	char		portNumberStr[32];
+	const char *familyDesc;
+	char		familyDescBuf[64];
+	char	   *service;
+	struct addrinfo *addrs = NULL,
+			   *addr;
+	struct addrinfo hint;
+	int			listen_index = 0;
+	int			added = 0;
+
+#if !defined(WIN32) || defined(IPV6_V6ONLY)
+	int			one = 1;
+#endif
+
+	/* Initialize hint structure */
+	MemSet(&hint, 0, sizeof(hint));
+	hint.ai_family = family;
+	hint.ai_flags = AI_PASSIVE;
+	hint.ai_socktype = SOCK_STREAM;
+
+	{
+		snprintf(portNumberStr, sizeof(portNumberStr), "%d", portNumber);
+		service = portNumberStr;
+	}
+
+	ret = pg_getaddrinfo_all(hostName, service, &hint, &addrs);
+	if (ret || !addrs)
+	{
+		if (hostName)
+			ereport(LOG,
+					(errmsg("could not translate host name \"%s\", service \"%s\" to address: %s",
+							hostName, service, gai_strerror(ret))));
+		else
+			ereport(LOG,
+				 (errmsg("could not translate service \"%s\" to address: %s",
+						 service, gai_strerror(ret))));
+		if (addrs)
+			pg_freeaddrinfo_all(hint.ai_family, addrs);
+		return STATUS_ERROR;
+	}
+
+	for (addr = addrs; addr; addr = addr->ai_next)
+	{
+		if (!IS_AF_UNIX(family) && IS_AF_UNIX(addr->ai_family))
+		{
+			/*
+			 * Only set up a unix domain socket when they really asked for it.
+			 * The service/port is different in that case.
+			 */
+			continue;
+		}
+
+		/* See if there is still room to add 1 more socket. */
+		for (; listen_index < MaxListen; listen_index++)
+		{
+			if (ListenSocket[listen_index] == -1)
+				break;
+		}
+		if (listen_index >= MaxListen)
+		{
+			ereport(LOG,
+					(errmsg("could not bind to all requested addresses: MAXLISTEN (%d) exceeded",
+							MaxListen)));
+			break;
+		}
+
+		/* family name for error messages; AF_UNIX has no case, hits default */
+		switch (addr->ai_family)
+		{
+			case AF_INET:
+				familyDesc = "IPv4";
+				break;
+#ifdef HAVE_IPV6
+			case AF_INET6:
+				familyDesc = "IPv6";
+				break;
+#endif
+			default:
+				snprintf(familyDescBuf, sizeof(familyDescBuf),
+						 "unrecognized address family %d",
+						 addr->ai_family);
+				familyDesc = familyDescBuf;
+				break;
+		}
+
+		if ((fd = socket(addr->ai_family, SOCK_STREAM, 0)) < 0)
+		{
+			ereport(LOG,
+					(EACCES,
+			/* translator: %s is IPv4, IPv6, or Unix */
+					 errmsg("could not create %s socket: %m",
+							familyDesc)));
+			continue;
+		}
+
+#ifndef WIN32
+
+		/*
+		 * Without the SO_REUSEADDR flag, a new postmaster can't be started
+		 * right away after a stop or crash, giving "address already in use"
+		 * error on TCP ports.
+		 *
+		 * On win32, however, this behavior only happens if the
+		 * SO_EXCLUSIVEADDRUSE is set. With SO_REUSEADDR, win32 allows multiple
+		 * servers to listen on the same address, resulting in unpredictable
+		 * behavior. With no flags at all, win32 behaves as Unix with
+		 * SO_REUSEADDR.
+		 */
+		if (!IS_AF_UNIX(addr->ai_family))
+		{
+			if ((setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
+							(char *) &one, sizeof(one))) == -1)
+			{
+				ereport(LOG,
+						(EACCES,
+						 errmsg("setsockopt(SO_REUSEADDR) failed: %m")));
+				close(fd);
+				continue;
+			}
+		}
+#endif
+
+#ifdef IPV6_V6ONLY
+		if (addr->ai_family == AF_INET6)
+		{
+			if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY,
+						   (char *) &one, sizeof(one)) == -1)
+			{
+				ereport(LOG,
+						(EACCES,
+						 errmsg("setsockopt(IPV6_V6ONLY) failed: %m")));
+				close(fd);
+				continue;
+			}
+		}
+#endif
+
+		/*
+		 * Note: This might fail on some OS's, like Linux older than
+		 * 2.4.21-pre3, that don't have the IPV6_V6ONLY socket option, and map
+		 * ipv4 addresses to ipv6.	It will show ::ffff:ipv4 for all ipv4
+		 * connections.
+		 */
+		err = bind(fd, addr->ai_addr, addr->ai_addrlen);
+		if (err < 0)
+		{
+			ereport(LOG,
+					(EACCES,
+			/* translator: %s is IPv4, IPv6, or Unix */
+					 errmsg("could not bind %s socket: %m",
+							familyDesc),
+					 (IS_AF_UNIX(addr->ai_family)) ?
+				  errhint("Is another postmaster already running on port %d?"
+						  " If not, remove socket file \"%s\" and retry.",
+						  (int) portNumber, sock_path) :
+				  errhint("Is another postmaster already running on port %d?"
+						  " If not, wait a few seconds and retry.",
+						  (int) portNumber)));
+			close(fd);
+			continue;
+		}
+
+#define GTM_MAX_CONNECTIONS		1024
+
+		/*
+		 * Select the accept-queue length limit.  No PG_SOMAXCONN clamp is
+		 * applied in this copy: the backlog requested from listen() is
+		 * simply twice GTM_MAX_CONNECTIONS.
+		 */
+		maxconn = GTM_MAX_CONNECTIONS * 2;
+
+		err = listen(fd, maxconn);
+		if (err < 0)
+		{
+			ereport(LOG,
+					(EACCES,
+			/* translator: %s is IPv4, IPv6, or Unix */
+					 errmsg("could not listen on %s socket: %m",
+							familyDesc)));
+			close(fd);
+			continue;
+		}
+		ListenSocket[listen_index] = fd;
+		added++;
+	}
+
+	pg_freeaddrinfo_all(hint.ai_family, addrs);
+
+	if (!added)
+		return STATUS_ERROR;
+
+	return STATUS_OK;
+}
+
+
+/*
+ * StreamConnection -- accept a new client connection on server_fd.
+ *		Sets port->sock to the new FD and fills port->raddr / port->laddr.
+ *
+ * ASSUME: that this doesn't need to be non-blocking because
+ *		the Postmaster uses select() to tell when the server master
+ *		socket is ready for accept().
+ *
+ * RETURNS: STATUS_OK or STATUS_ERROR (port->sock is not closed on error)
+ */
+int
+StreamConnection(int server_fd, Port *port)
+{
+	/* accept connection and fill in the client (remote) address */
+	port->raddr.salen = sizeof(port->raddr.addr);
+	if ((port->sock = accept(server_fd,
+							 (struct sockaddr *) & port->raddr.addr,
+							 &port->raddr.salen)) < 0)
+	{
+		ereport(LOG,
+				(EACCES,
+				 errmsg("could not accept new connection: %m")));
+
+		/*
+		 * If accept() fails the caller will still see the server socket as
+		 * read-ready and will immediately try again.  The pg_usleep() delay
+		 * meant to avoid spinning is commented out below, so none occurs.
+		 * (The most likely reason for failure is being out of kernel file
+		 * table slots; we can do little except hope some will get freed up.)
+		 */
+	/*	pg_usleep(100000L);	 */	/* wait 0.1 sec */
+		return STATUS_ERROR;
+	}
+
+#ifdef SCO_ACCEPT_BUG
+
+	/*
+	 * UnixWare 7+ and OpenServer 5.0.4 are known to have this bug, but it
+	 * shouldn't hurt to catch it for all versions of those platforms.
+	 */
+	if (port->raddr.addr.ss_family == 0)
+		port->raddr.addr.ss_family = AF_UNIX;
+#endif
+
+	/* fill in the server (local) address */
+	port->laddr.salen = sizeof(port->laddr.addr);
+	if (getsockname(port->sock,
+					(struct sockaddr *) & port->laddr.addr,
+					&port->laddr.salen) < 0)
+	{
+		elog(LOG, "getsockname() failed: %m");
+		return STATUS_ERROR;
+	}
+
+	/* select NODELAY and KEEPALIVE options if it's a TCP connection */
+	if (!IS_AF_UNIX(port->laddr.addr.ss_family))
+	{
+		int			on;
+
+#ifdef	TCP_NODELAY
+		on = 1;
+		if (setsockopt(port->sock, IPPROTO_TCP, TCP_NODELAY,
+					   (char *) &on, sizeof(on)) < 0)
+		{
+			elog(LOG, "setsockopt(TCP_NODELAY) failed: %m");
+			return STATUS_ERROR;
+		}
+#endif
+		on = 1;
+		if (setsockopt(port->sock, SOL_SOCKET, SO_KEEPALIVE,
+					   (char *) &on, sizeof(on)) < 0)
+		{
+			elog(LOG, "setsockopt(SO_KEEPALIVE) failed: %m");
+			return STATUS_ERROR;
+		}
+
+		/*
+		 * Also apply the current keepalive parameters.  If we fail to set a
+		 * parameter, don't error out, because these aren't universally
+		 * supported.  (Note: the tcp_keepalives_* statics passed here are
+		 * never assigned in this file, so they are always 0, which the
+		 * setters treat as "use the socket's default".)
+		 */
+		(void) pq_setkeepalivesidle(tcp_keepalives_idle, port);
+		(void) pq_setkeepalivesinterval(tcp_keepalives_interval, port);
+		(void) pq_setkeepalivescount(tcp_keepalives_count, port);
+	}
+
+	return STATUS_OK;
+}
+
+/*
+ * StreamClose -- release this process's descriptor for a connection
+ *
+ * Only close() is called on the descriptor: nothing is written to the
+ * peer and no shutdown() is issued.  As the comment inherited from the
+ * backend puts it, this is meant for dropping a socket that some other
+ * owner still holds open, not for terminating a session.
+ *
+ * The result of close() is intentionally ignored.
+ */
+void
+StreamClose(int sock)
+{
+	(void) close(sock);
+}
+
+/*
+ * TouchSocketFile -- mark socket file as recently accessed
+ *
+ * This routine should be called every so often to ensure that the socket
+ * file has a recent mod date (ordinary operations on sockets usually won't
+ * change the mod date).  That saves it from being removed by
+ * overenthusiastic /tmp-directory-cleaner daemons.  (Another reason we should
+ * never have put the socket file in /tmp...)
+ */
+void
+TouchSocketFile(void)
+{
+	/* No socket file path recorded: nothing to touch */
+	if (sock_path[0] == '\0')
+		return;
+
+	/*
+	 * Refresh the file's timestamps by passing NULL ("now") as the times.
+	 * POSIX utime() is preferred; utimes() is used where only that exists.
+	 * If the platform offers neither, this function does nothing at all.
+	 *
+	 * The return value is deliberately discarded in either case, since a
+	 * failure to update the timestamps is not worth reporting; the worst
+	 * outcome is that the file keeps its old modification time.
+	 */
+#if defined(HAVE_UTIME)
+	utime(sock_path, NULL);
+#elif defined(HAVE_UTIMES)
+	utimes(sock_path, NULL);
+#endif
+}
+
+
+/* --------------------------------
+ * Low-level I/O routines begin here.
+ *
+ * These routines communicate with a frontend client across a connection
+ * already established by the preceding routines.
+ * --------------------------------
+ */
+
+
+/* --------------------------------
+ *		pq_recvbuf - load some bytes into the input buffer
+ *
+ *		returns 0 if OK, EOF if trouble
+ * --------------------------------
+ */
+static int
+pq_recvbuf(Port *myport)
+{
+	int			nread;
+
+	/*
+	 * Reclaim the front of the buffer.  Nothing needs doing when the read
+	 * position is already at the start.
+	 */
+	if (myport->PqRecvPointer > 0 &&
+		myport->PqRecvLength <= myport->PqRecvPointer)
+	{
+		/* everything buffered has been consumed: simply rewind */
+		myport->PqRecvLength = myport->PqRecvPointer = 0;
+	}
+	else if (myport->PqRecvPointer > 0)
+	{
+		/* slide the unread tail down to the start of the buffer */
+		memmove(myport->PqRecvBuffer,
+				myport->PqRecvBuffer + myport->PqRecvPointer,
+				myport->PqRecvLength - myport->PqRecvPointer);
+		myport->PqRecvLength -= myport->PqRecvPointer;
+		myport->PqRecvPointer = 0;
+	}
+
+	/* Append to the buffered data; a recv() cut short by a signal is retried */
+	do
+	{
+		nread = recv(myport->sock,
+					 myport->PqRecvBuffer + myport->PqRecvLength,
+					 PQ_BUFFER_SIZE - myport->PqRecvLength, 0);
+	} while (nread < 0 && errno == EINTR);
+
+	if (nread < 0)
+	{
+		/*
+		 * Report through COMMERROR only.  Per the note carried over from
+		 * the backend, an ereport() that tried to write to the client
+		 * would recurse back into this code.
+		 */
+		ereport(COMMERROR,
+				(EACCES,
+				 errmsg("could not receive data from client: %m")));
+		return EOF;
+	}
+
+	/* Zero bytes means the peer closed; leave any logging to the caller */
+	if (nread == 0)
+		return EOF;
+	/* nread bytes were appended after the existing data */
+	myport->PqRecvLength += nread;
+	return 0;
+}
+
+/* --------------------------------
+ *		pq_getbyte	- get a single byte from connection, or return EOF
+ * --------------------------------
+ */
+int
+pq_getbyte(Port *myport)
+{
+	/* Refill until an unread byte is buffered; report EOF if that fails */
+	while (myport->PqRecvPointer >= myport->PqRecvLength)
+		if (pq_recvbuf(myport) != 0)
+			return EOF;
+
+	return (unsigned char) myport->PqRecvBuffer[myport->PqRecvPointer++];
+}
+
+/* --------------------------------
+ *		pq_peekbyte		- peek at next byte from connection
+ *
+ *	 Same as pq_getbyte() except we don't advance the pointer.
+ * --------------------------------
+ */
+int
+pq_peekbyte(Port *myport)
+{
+	/* Refill until an unread byte is buffered; report EOF if that fails */
+	while (myport->PqRecvPointer >= myport->PqRecvLength)
+		if (pq_recvbuf(myport) != 0)
+			return EOF;
+
+	return (unsigned char) myport->PqRecvBuffer[myport->PqRecvPointer];
+}
+
+/* --------------------------------
+ *		pq_getbytes		- get a known number of bytes from connection
+ *
+ *		returns 0 if OK, EOF if trouble
+ * --------------------------------
+ */
+int
+pq_getbytes(Port *myport, char *s, size_t len)
+{
+	while (len > 0)
+	{
+		size_t		chunk;
+
+		/* Make sure there is something unread in the receive buffer */
+		while (myport->PqRecvPointer >= myport->PqRecvLength)
+			if (pq_recvbuf(myport) != 0)
+				return EOF;
+		/* Hand over what is buffered, but no more than is still wanted */
+		chunk = myport->PqRecvLength - myport->PqRecvPointer;
+		if (chunk > len)
+			chunk = len;
+		memcpy(s, myport->PqRecvBuffer + myport->PqRecvPointer, chunk);
+		myport->PqRecvPointer += chunk;
+		s += chunk;
+		len -= chunk;
+	}
+	return 0;
+}
+
+/* --------------------------------
+ *		pq_discardbytes		- throw away a known number of bytes
+ *
+ *		same as pq_getbytes except we do not copy the data to anyplace.
+ *		this is used for resynchronizing after read errors.
+ *
+ *		returns 0 if OK, EOF if trouble
+ * --------------------------------
+ */
+static int
+pq_discardbytes(Port *myport, size_t len)
+{
+	while (len > 0)
+	{
+		size_t		skip;
+
+		/* Make sure there is something unread in the receive buffer */
+		while (myport->PqRecvPointer >= myport->PqRecvLength)
+			if (pq_recvbuf(myport) != 0)
+				return EOF;
+		/* Step over what is buffered, but no more than is still unwanted */
+		skip = myport->PqRecvLength - myport->PqRecvPointer;
+		if (skip > len)
+			skip = len;
+		myport->PqRecvPointer += skip;
+		len -= skip;
+	}
+	return 0;
+}
+
+/* --------------------------------
+ *		pq_getstring	- get a null terminated string from connection
+ *
+ *		The return value is placed in an expansible StringInfo, which has
+ *		already been initialized by the caller.
+ *
+ *		This is used only for dealing with old-protocol clients.  The idea
+ *		is to produce a StringInfo that looks the same as we would get from
+ *		pq_getmessage() with a newer client; we will then process it with
+ *		pq_getmsgstring.  Therefore, no character set conversion is done here,
+ *		even though this is presumably useful only for text.
+ *
+ *		returns 0 if OK, EOF if trouble
+ * --------------------------------
+ */
+int
+pq_getstring(Port *myport, StringInfo s)
+{
+	int			pos;
+
+	resetStringInfo(s);
+
+	/* Keep appending buffered data until the terminating '\0' turns up */
+	for (;;)
+	{
+		/* Refill whenever the receive buffer has been fully consumed */
+		while (myport->PqRecvPointer >= myport->PqRecvLength)
+			if (pq_recvbuf(myport) != 0)
+				return EOF;
+
+		/* Look for the terminator among the unread bytes */
+		pos = myport->PqRecvPointer;
+		while (pos < myport->PqRecvLength && myport->PqRecvBuffer[pos] != '\0')
+			pos++;
+		if (pos < myport->PqRecvLength)
+		{
+			/* Found it: copy up to and including the '\0', then step past it */
+			appendBinaryStringInfo(s, myport->PqRecvBuffer + myport->PqRecvPointer,
+								   pos - myport->PqRecvPointer + 1);
+			myport->PqRecvPointer = pos + 1;
+			return 0;
+		}
+
+		/* Not there yet: take everything buffered and go around again */
+		appendBinaryStringInfo(s, myport->PqRecvBuffer + myport->PqRecvPointer,
+							   myport->PqRecvLength - myport->PqRecvPointer);
+		myport->PqRecvPointer = myport->PqRecvLength;
+	}
+}
+
+
+/* --------------------------------
+ *		pq_getmessage	- get a message with length word from connection
+ *
+ *		The return value is placed in an expansible StringInfo, which has
+ *		already been initialized by the caller.
+ *		Only the message body is placed in the StringInfo; the length word
+ *		is removed.  Also, s->cursor is initialized to zero for convenience
+ *		in scanning the message contents.
+ *
+ *		If maxlen is not zero, it is an upper limit on the length of the
+ *		message we are willing to accept.  We abort the connection (by
+ *		returning EOF) if client tries to send more than that.
+ *
+ *		returns 0 if OK, EOF if trouble
+ * --------------------------------
+ */
+int
+pq_getmessage(Port *myport, StringInfo s, int maxlen)
+{
+	int32		msglen;
+	int32		bodylen;
+
+	resetStringInfo(s);
+
+	/* The four-byte length word arrives first, in network byte order */
+	if (pq_getbytes(myport, (char *) &msglen, 4) == EOF)
+	{
+		ereport(COMMERROR,
+				(EPROTO,
+				 errmsg("unexpected EOF within message length word")));
+		return EOF;
+	}
+	msglen = ntohl(msglen);
+
+	/*
+	 * The length counts its own four bytes, so anything smaller is bogus.
+	 * A nonzero maxlen additionally caps what we are prepared to accept.
+	 */
+	if (msglen < 4 || (maxlen > 0 && msglen > maxlen))
+	{
+		ereport(COMMERROR,
+				(EPROTO,
+				 errmsg("invalid message length")));
+		return EOF;
+	}
+
+	/* What remains after the length word is the body; it may be empty */
+	bodylen = msglen - 4;
+	if (bodylen <= 0)
+		return 0;
+
+	/*
+	 * Make room for the body, then read it straight into the StringInfo.
+	 */
+	enlargeStringInfo(s, bodylen);
+	if (pq_getbytes(myport, s->data, bodylen) == EOF)
+	{
+		ereport(COMMERROR,
+				(EPROTO,
+				 errmsg("incomplete message from client")));
+		return EOF;
+	}
+
+	/* Record the length and null-terminate, per StringInfo convention */
+	s->len = bodylen;
+	s->data[bodylen] = '\0';
+
+	return 0;
+}
+
+
+/* --------------------------------
+ *		pq_putbytes		- send bytes to connection (not flushed until pq_flush)
+ *
+ *		returns 0 if OK, EOF if trouble
+ * --------------------------------
+ */
+int
+pq_putbytes(Port *myport, const char *s, size_t len)
+{
+	/*
+	 * Public entry point: simply defers to the file-local buffering routine.
+	 */
+	return internal_putbytes(myport, s, len);
+}
+
+static int
+internal_putbytes(Port *myport, const char *s, size_t len)
+{
+	while (len > 0)
+	{
+		size_t		room;
+		/* A full send buffer has to be written out before adding more */
+		if (myport->PqSendPointer >= PQ_BUFFER_SIZE &&
+			internal_flush(myport) != 0)
+			return EOF;
+		/* Copy whatever fits, capped at the amount still to be queued */
+		room = PQ_BUFFER_SIZE - myport->PqSendPointer;
+		if (room > len)
+			room = len;
+		memcpy(myport->PqSendBuffer + myport->PqSendPointer, s, room);
+		myport->PqSendPointer += room;
+		s += room;
+		len -= room;
+	}
+	return 0;
+}
+
+/* --------------------------------
+ *		pq_flush		- flush pending output
+ *
+ *		returns 0 if OK, EOF if trouble
+ * --------------------------------
+ */
+int
+pq_flush(Port *myport)
+{
+	/*
+	 * Public entry point: defers directly to internal_flush.  There is no
+	 * reentrancy guard in this copy, so every call attempts a flush.
+	 */
+	return internal_flush(myport);
+}
+/* internal_flush - write out the port's send buffer; 0 if OK, EOF on error */
+static int
+internal_flush(Port *myport)
+{
+	static int	last_reported_send_errno = 0;	/* one copy for all Ports */
+
+	char	   *bufptr = myport->PqSendBuffer;
+	char	   *bufend = myport->PqSendBuffer + myport->PqSendPointer;
+
+	while (bufptr < bufend)
+	{
+		int			r;
+
+		r = send(myport->sock, bufptr, bufend - bufptr, 0);
+
+		if (r <= 0)
+		{
+			if (errno == EINTR)
+				continue;		/* Ok if we were interrupted */
+
+			/*
+			 * Careful: an ereport() that tries to write to the client would
+			 * cause recursion to here, leading to stack overflow and core
+			 * dump!  This message must go *only* to the postmaster log.
+			 *
+			 * If a client disconnects while we're in the midst of output, we
+			 * might write quite a bit of data before we get to a safe query
+			 * abort point.  So, suppress duplicate log messages.
+			 */
+			if (errno != last_reported_send_errno)
+			{
+				last_reported_send_errno = errno;
+				ereport(COMMERROR,
+						(EACCES,
+						 errmsg("could not send data to client: %m")));
+			}
+
+			/*
+			 * We drop the buffered data anyway so that processing can
+			 * continue, even though we'll probably quit soon.
+			 */
+			myport->PqSendPointer = 0;
+			return EOF;
+		}
+
+		last_reported_send_errno = 0;	/* reset after any successful send */
+		bufptr += r;			/* partial send: resume after the bytes taken */
+	}
+
+	myport->PqSendPointer = 0;	/* everything was sent; buffer is empty again */
+	return 0;
+}
+
+
+/* --------------------------------
+ * Message-level I/O routines begin here.
+ *
+ * These routines understand about the old-style COPY OUT protocol.
+ * --------------------------------
+ */
+
+
+/* --------------------------------
+ *		pq_putmessage	- queue one complete message for a connection
+ *
+ *		If msgtype is not '\0', it is a message type code to place before
+ *		the message body.  If msgtype is '\0', no type byte is emitted.
+ *
+ *		len is the length of the message body data at *s.  A four-byte
+ *		message length word (equal to len+4 because it counts itself too)
+ *		is always inserted ahead of the body, in network byte order.
+ *
+ *		The pieces are handed to internal_putbytes in turn, so they are
+ *		only buffered: nothing here calls pq_flush, and data reaches the
+ *		socket when the send buffer fills or the caller flushes.  If any
+ *		piece fails, EOF is returned at once and the remaining pieces are
+ *		not attempted.
+ *
+ *		The comment inherited from the backend described messages being
+ *		suppressed during old-style COPY OUT and while pqcomm.c is busy.
+ *		This copy keeps no such state, so it never suppresses anything;
+ *		every call tries to queue the whole message.
+ *
+ *		returns 0 if OK, EOF if trouble
+ * --------------------------------
+ */
+int
+pq_putmessage(Port *myport, char msgtype, const char *s, size_t len)
+{
+	uint32		netlen;
+
+	/* Optional one-byte type code goes first */
+	if (msgtype != '\0' && internal_putbytes(myport, &msgtype, 1) != 0)
+		return EOF;
+
+	/* Length word counts itself, hence the +4; sent in network byte order */
+	netlen = htonl((uint32) (len + 4));
+	if (internal_putbytes(myport, (char *) &netlen, 4) != 0)
+		return EOF;
+
+	/* Finally the body */
+	if (internal_putbytes(myport, s, len) != 0)
+		return EOF;
+
+	return 0;
+}
+
+
+/*
+ * Support for TCP Keepalive parameters.  pq_getkeepalivesidle reports the
+ * TCP_KEEPIDLE value for port: 0 if inapplicable, -1 if it can't be read.
+ */
+int
+pq_getkeepalivesidle(Port *port)
+{
+#ifdef TCP_KEEPIDLE
+	if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family))
+		return 0;
+
+	if (port->keepalives_idle != 0)
+		return port->keepalives_idle;	/* value set via pq_setkeepalivesidle */
+
+	if (port->default_keepalives_idle == 0)		/* not queried yet */
+	{
+		ACCEPT_TYPE_ARG3 size = sizeof(port->default_keepalives_idle);
+
+		if (getsockopt(port->sock, IPPROTO_TCP, TCP_KEEPIDLE,
+					   (char *) &port->default_keepalives_idle,
+					   &size) < 0)
+		{
+			elog(LOG, "getsockopt(TCP_KEEPIDLE) failed: %m");
+			port->default_keepalives_idle = -1; /* don't know */
+		}
+	}
+
+	return port->default_keepalives_idle;
+#else
+	return 0;
+#endif
+}
+/* pq_setkeepalivesidle - set TCP_KEEPIDLE on port; idle == 0 means default */
+int
+pq_setkeepalivesidle(int idle, Port *port)
+{
+	if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family))
+		return STATUS_OK;		/* nothing to configure */
+
+#ifdef TCP_KEEPIDLE
+	if (idle == port->keepalives_idle)
+		return STATUS_OK;		/* already in effect */
+
+	if (port->default_keepalives_idle <= 0)
+	{
+		if (pq_getkeepalivesidle(port) < 0)
+		{
+			if (idle == 0)
+				return STATUS_OK;		/* default is set but unknown */
+			else
+				return STATUS_ERROR;
+		}
+	}
+
+	if (idle == 0)
+		idle = port->default_keepalives_idle;	/* 0 requests the default */
+
+	if (setsockopt(port->sock, IPPROTO_TCP, TCP_KEEPIDLE,
+				   (char *) &idle, sizeof(idle)) < 0)
+	{
+		elog(LOG, "setsockopt(TCP_KEEPIDLE) failed: %m");
+		return STATUS_ERROR;
+	}
+
+	port->keepalives_idle = idle;	/* remember what is now in effect */
+#else
+	if (idle != 0)
+	{
+		elog(LOG, "setsockopt(TCP_KEEPIDLE) not supported");
+		return STATUS_ERROR;
+	}
+#endif
+
+	return STATUS_OK;
+}
+/* pq_getkeepalivesinterval - port's TCP_KEEPINTVL; 0 if n/a, -1 if unknown */
+int
+pq_getkeepalivesinterval(Port *port)
+{
+#ifdef TCP_KEEPINTVL
+	if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family))
+		return 0;
+
+	if (port->keepalives_interval != 0)
+		return port->keepalives_interval;	/* value set via the setter */
+
+	if (port->default_keepalives_interval == 0)		/* not queried yet */
+	{
+		ACCEPT_TYPE_ARG3 size = sizeof(port->default_keepalives_interval);
+
+		if (getsockopt(port->sock, IPPROTO_TCP, TCP_KEEPINTVL,
+					   (char *) &port->default_keepalives_interval,
+					   &size) < 0)
+		{
+			elog(LOG, "getsockopt(TCP_KEEPINTVL) failed: %m");
+			port->default_keepalives_interval = -1;		/* don't know */
+		}
+	}
+
+	return port->default_keepalives_interval;
+#else
+	return 0;
+#endif
+}
+/* pq_setkeepalivesinterval - set TCP_KEEPINTVL on port; 0 means default */
+int
+pq_setkeepalivesinterval(int interval, Port *port)
+{
+	if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family))
+		return STATUS_OK;		/* nothing to configure */
+
+#ifdef TCP_KEEPINTVL
+	if (interval == port->keepalives_interval)
+		return STATUS_OK;		/* already in effect */
+
+	if (port->default_keepalives_interval <= 0)
+	{
+		if (pq_getkeepalivesinterval(port) < 0)
+		{
+			if (interval == 0)
+				return STATUS_OK;		/* default is set but unknown */
+			else
+				return STATUS_ERROR;
+		}
+	}
+
+	if (interval == 0)
+		interval = port->default_keepalives_interval;	/* 0 requests default */
+
+	if (setsockopt(port->sock, IPPROTO_TCP, TCP_KEEPINTVL,
+				   (char *) &interval, sizeof(interval)) < 0)
+	{
+		elog(LOG, "setsockopt(TCP_KEEPINTVL) failed: %m");
+		return STATUS_ERROR;
+	}
+
+	port->keepalives_interval = interval;	/* remember what is in effect */
+#else
+	if (interval != 0)
+	{
+		elog(LOG, "setsockopt(TCP_KEEPINTVL) not supported");
+		return STATUS_ERROR;
+	}
+#endif
+
+	return STATUS_OK;
+}
+/* pq_getkeepalivescount - port's TCP_KEEPCNT; 0 if n/a, -1 if unknown */
+int
+pq_getkeepalivescount(Port *port)
+{
+#ifdef TCP_KEEPCNT
+	if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family))
+		return 0;
+
+	if (port->keepalives_count != 0)
+		return port->keepalives_count;	/* value set via the setter */
+
+	if (port->default_keepalives_count == 0)	/* not queried yet */
+	{
+		ACCEPT_TYPE_ARG3 size = sizeof(port->default_keepalives_count);
+
+		if (getsockopt(port->sock, IPPROTO_TCP, TCP_KEEPCNT,
+					   (char *) &port->default_keepalives_count,
+					   &size) < 0)
+		{
+			elog(LOG, "getsockopt(TCP_KEEPCNT) failed: %m");
+			port->default_keepalives_count = -1;		/* don't know */
+		}
+	}
+
+	return port->default_keepalives_count;
+#else
+	return 0;
+#endif
+}
+/* pq_setkeepalivescount - set TCP_KEEPCNT on port; count == 0 means default */
+int
+pq_setkeepalivescount(int count, Port *port)
+{
+	if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family))
+		return STATUS_OK;		/* nothing to configure */
+
+#ifdef TCP_KEEPCNT
+	if (count == port->keepalives_count)
+		return STATUS_OK;		/* already in effect */
+
+	if (port->default_keepalives_count <= 0)
+	{
+		if (pq_getkeepalivescount(port) < 0)
+		{
+			if (count == 0)
+				return STATUS_OK;		/* default is set but unknown */
+			else
+				return STATUS_ERROR;
+		}
+	}
+
+	if (count == 0)
+		count = port->default_keepalives_count;	/* 0 requests the default */
+
+	if (setsockopt(port->sock, IPPROTO_TCP, TCP_KEEPCNT,
+				   (char *) &count, sizeof(count)) < 0)
+	{
+		elog(LOG, "setsockopt(TCP_KEEPCNT) failed: %m");
+		return STATUS_ERROR;
+	}
+
+	port->keepalives_count = count;	/* remember what is now in effect */
+#else
+	if (count != 0)
+	{
+		elog(LOG, "setsockopt(TCP_KEEPCNT) not supported");
+		return STATUS_ERROR;
+	}
+#endif
+
+	return STATUS_OK;
+}
diff --git a/src/gtm/libpq/pqformat.c b/src/gtm/libpq/pqformat.c
new file mode 100644
index 0000000000..339f50a995
--- /dev/null
+++ b/src/gtm/libpq/pqformat.c
@@ -0,0 +1,658 @@
+/*-------------------------------------------------------------------------
+ *
+ * pqformat.c
+ *		Routines for formatting and parsing frontend/backend messages
+ *
+ * Outgoing messages are built up in a StringInfo buffer (which is expansible)
+ * and then sent in a single call to pq_putmessage.  This module provides data
+ * formatting/conversion routines that are needed to produce valid messages.
+ * Note in particular the distinction between "raw data" and "text"; raw data
+ * is message protocol characters and binary values that are not subject to
+ * character set conversion, while text is converted by character encoding
+ * rules.
+ *
+ * Incoming messages are similarly read into a StringInfo buffer, via
+ * pq_getmessage, and then parsed and converted from that using the routines
+ * in this module.
+ *
+ * These same routines support reading and writing of external binary formats
+ * (typsend/typreceive routines).  The conversion routines for individual
+ * data types are exactly the same, only initialization and completion
+ * are different.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *	$PostgreSQL: pgsql/src/backend/libpq/pqformat.c,v 1.48 2009/01/01 17:23:42 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ * Message assembly and output:
+ *		pq_beginmessage - initialize StringInfo buffer
+ *		pq_sendbyte		- append a raw byte to a StringInfo buffer
+ *		pq_sendint		- append a binary integer to a StringInfo buffer
+ *		pq_sendint64	- append a binary 8-byte int to a StringInfo buffer
+ *		pq_sendfloat4	- append a float4 to a StringInfo buffer
+ *		pq_sendfloat8	- append a float8 to a StringInfo buffer
+ *		pq_sendbytes	- append raw data to a StringInfo buffer
+ *		pq_sendcountedtext - append a counted text string (with character set conversion)
+ *		pq_sendtext		- append a text string (with conversion)
+ *		pq_sendstring	- append a null-terminated text string (with conversion)
+ *		pq_send_ascii_string - append a null-terminated text string (without conversion)
+ *		pq_endmessage	- send the completed message to the frontend
+ * Note: it is also possible to append data to the StringInfo buffer using
+ * the regular StringInfo routines, but this is discouraged since required
+ * character set conversion may not occur.
+ *
+ * typsend support (construct a bytea value containing external binary data):
+ *		pq_begintypsend - initialize StringInfo buffer
+ *		pq_endtypsend	- return the completed string as a "bytea*"
+ *
+ * Special-case message output:
+ *		pq_puttextmessage - generate a character set-converted message in one step
+ *		pq_putemptymessage - convenience routine for message with empty body
+ *
+ * Message parsing after input:
+ *		pq_getmsgbyte	- get a raw byte from a message buffer
+ *		pq_getmsgint	- get a binary integer from a message buffer
+ *		pq_getmsgint64	- get a binary 8-byte int from a message buffer
+ *		pq_getmsgfloat4 - get a float4 from a message buffer
+ *		pq_getmsgfloat8 - get a float8 from a message buffer
+ *		pq_getmsgbytes	- get raw data from a message buffer
+ *		pq_copymsgbytes - copy raw data from a message buffer
+ *		pq_getmsgtext	- get a counted text string (with conversion)
+ *		pq_getmsgstring - get a null-terminated text string (with conversion)
+ *		pq_getmsgend	- verify message fully consumed
+ *		pq_getmsgunreadlen - get length of the unread data in the message buffer
+ */
+
+#include <sys/param.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#include "gtm/gtm_c.h"
+#include "gtm/libpq.h"
+#include "gtm/pqformat.h"
+#include "gtm/gtm.h"
+#include "gtm/memutils.h"
+#include "gtm/elog.h"
+
+
+/* --------------------------------
+ *		pq_beginmessage		- initialize for sending a message
+ *
+ * buf is set up with initStringInfo() and msgtype is remembered in the
+ * buffer so that pq_endmessage() can pass it on to pq_putmessage().
+ * --------------------------------
+ */
+void
+pq_beginmessage(StringInfo buf, char msgtype)
+{
+	initStringInfo(buf);
+
+	/*
+	 * We stash the message type into the buffer's cursor field, expecting
+	 * that the pq_sendXXX routines won't touch it.  We could alternatively
+	 * make it the first byte of the buffer contents, but this seems easier.
+	 */
+	buf->cursor = msgtype;
+}
+
+/* --------------------------------
+ *		pq_sendbyte		- append a raw byte to a StringInfo buffer
+ *
+ * byt is declared as int but is handed unchanged to
+ * appendStringInfoCharMacro(); presumably only its low-order byte ends up in
+ * the buffer -- confirm against the macro definition.
+ * --------------------------------
+ */
+void
+pq_sendbyte(StringInfo buf, int byt)
+{
+	appendStringInfoCharMacro(buf, byt);
+}
+
+/* --------------------------------
+ *		pq_sendbytes	- append raw data to a StringInfo buffer
+ *
+ * data and datalen are passed straight through to appendBinaryStringInfo();
+ * nothing is interpreted or converted here.
+ * --------------------------------
+ */
+void
+pq_sendbytes(StringInfo buf, const char *data, int datalen)
+{
+	appendBinaryStringInfo(buf, data, datalen);
+}
+
+/* --------------------------------
+ *		pq_sendcountedtext - append a counted text string
+ *
+ * The data appended by this routine is a 4-byte count field followed by the
+ * string.	The count includes itself or not, as per the
+ * countincludesself flag (pre-3.0 protocol requires it to include itself).
+ * The passed text string need not be null-terminated, and the data appended
+ * isn't either.
+ *
+ * No character set conversion is performed in this module: exactly slen
+ * bytes of str are appended verbatim.  An earlier version compared a
+ * never-assigned "converted string" pointer against str, which read an
+ * uninitialized variable and could end up calling strlen()/pfree() on a
+ * garbage address.
+ *
+ *	buf					- message buffer to append to
+ *	str					- text bytes to send
+ *	slen				- number of bytes in str
+ *	countincludesself	- if true, the count field also counts its own 4 bytes
+ * --------------------------------
+ */
+void
+pq_sendcountedtext(StringInfo buf, const char *str, int slen,
+				   bool countincludesself)
+{
+	int			extra = countincludesself ? 4 : 0;
+
+	pq_sendint(buf, slen + extra, 4);
+	appendBinaryStringInfo(buf, str, slen);
+}
+
+/* --------------------------------
+ *		pq_sendtext		- append a text string
+ *
+ * The passed text string need not be null-terminated, and the data appended
+ * isn't either.  Note that this is not actually useful for direct
+ * transmissions, since there'd be no way for the receiver to determine the
+ * string length.  But it is useful for binary format conversions.
+ *
+ * No character set conversion is performed in this module: exactly slen
+ * bytes of str are appended verbatim.  An earlier version compared a
+ * never-assigned "converted string" pointer against str, which read an
+ * uninitialized variable and could end up calling strlen()/pfree() on a
+ * garbage address.
+ *
+ *	buf		- message buffer to append to
+ *	str		- text bytes to send
+ *	slen	- number of bytes in str
+ * --------------------------------
+ */
+void
+pq_sendtext(StringInfo buf, const char *str, int slen)
+{
+	appendBinaryStringInfo(buf, str, slen);
+}
+
+/* --------------------------------
+ *		pq_sendstring	- append a null-terminated text string
+ *
+ * str must be null-terminated.  The string is appended together with its
+ * terminating null byte; the bytes are copied as they are, no conversion
+ * is applied in this routine.
+ * --------------------------------
+ */
+void
+pq_sendstring(StringInfo buf, const char *str)
+{
+	/* one extra byte so that the terminator is transmitted as well */
+	int			nbytes = (int) strlen(str) + 1;
+
+	appendBinaryStringInfo(buf, str, nbytes);
+}
+
+/* --------------------------------
+ *		pq_send_ascii_string	- append a null-terminated string as 7-bit ASCII
+ *
+ * Every byte of str that has its high bit set is replaced by a question
+ * mark; all other bytes are appended unchanged, followed by a terminating
+ * null byte.  No encoding conversion is attempted.
+ *
+ * This is meant for situations where we are having trouble sending an error
+ * message with normal localization and encoding conversion.  The caller
+ * should already have taken measures to ensure the string is just ASCII;
+ * the substitution here merely guarantees that no badly encoded byte is
+ * sent.
+ *
+ * NB: str must be null-terminated.
+ * --------------------------------
+ */
+void
+pq_send_ascii_string(StringInfo buf, const char *str)
+{
+	const char *cp;
+
+	for (cp = str; *cp != '\0'; cp++)
+	{
+		char		ch = *cp;
+
+		if (IS_HIGHBIT_SET(ch))
+			ch = '?';
+		appendStringInfoCharMacro(buf, ch);
+	}
+
+	appendStringInfoChar(buf, '\0');
+}
+
+/* --------------------------------
+ *		pq_sendint		- append a binary integer to a StringInfo buffer
+ *
+ * b is the width in bytes and must be 1, 2 or 4; any other width is
+ * reported with elog(ERROR).  i is truncated to that width, and 2- and
+ * 4-byte values are put into network byte order before being appended.
+ * --------------------------------
+ */
+void
+pq_sendint(StringInfo buf, int i, int b)
+{
+	if (b == 1)
+	{
+		unsigned char octet = (unsigned char) i;
+
+		appendBinaryStringInfo(buf, (char *) &octet, 1);
+	}
+	else if (b == 2)
+	{
+		uint16		net16 = htons((uint16) i);
+
+		appendBinaryStringInfo(buf, (char *) &net16, 2);
+	}
+	else if (b == 4)
+	{
+		uint32		net32 = htonl((uint32) i);
+
+		appendBinaryStringInfo(buf, (char *) &net32, 4);
+	}
+	else
+		elog(ERROR, "unsupported integer size %d", b);
+}
+
+/* --------------------------------
+ *		pq_sendint64	- append a binary 8-byte int to a StringInfo buffer
+ *
+ * The value is sent as two 32-bit halves, each in network byte order, with
+ * the high-order half first.
+ *
+ * This is kept separate from pq_sendint so that the narrower widths don't
+ * have to pay for an int64 argument on machines where int64 is slow.
+ * --------------------------------
+ */
+void
+pq_sendint64(StringInfo buf, int64 i)
+{
+	uint32		hi;
+	uint32		lo;
+
+#ifdef INT64_IS_BUSTED
+	/* don't try a right shift of 32 on a 32-bit word; just sign-extend */
+	hi = (i < 0) ? -1 : 0;
+#else
+	hi = (uint32) (i >> 32);
+#endif
+	lo = (uint32) i;
+
+	/* MSB-first on the wire: high half, then low half */
+	hi = htonl(hi);
+	appendBinaryStringInfo(buf, (char *) &hi, 4);
+
+	lo = htonl(lo);
+	appendBinaryStringInfo(buf, (char *) &lo, 4);
+}
+
+/* --------------------------------
+ *		pq_sendfloat4	- append a float4 to a StringInfo buffer
+ *
+ * Knowledge of the external binary representation of float4 is confined to
+ * this routine and pq_getmsgfloat4.
+ *
+ * The four bytes of the value are byte-swapped exactly as a 4-byte integer
+ * would be.  That rule is not perfect, but it is portable across most
+ * IEEE-float-using architectures.
+ * --------------------------------
+ */
+void
+pq_sendfloat4(StringInfo buf, float4 f)
+{
+	union
+	{
+		float4		f;
+		uint32		i;
+	}			pun;
+	uint32		net32;
+
+	/* reinterpret the float's bits as an integer, then swap like an int4 */
+	pun.f = f;
+	net32 = htonl(pun.i);
+
+	appendBinaryStringInfo(buf, (char *) &net32, 4);
+}
+
+/* --------------------------------
+ *		pq_sendfloat8	- append a float8 to a StringInfo buffer
+ *
+ * Knowledge of the external binary representation of float8 is confined to
+ * this routine and pq_getmsgfloat8.
+ *
+ * The eight bytes of the value are byte-swapped exactly as an 8-byte integer
+ * would be.  That rule is not perfect, but it is portable across most
+ * IEEE-float-using architectures.
+ * --------------------------------
+ */
+void
+pq_sendfloat8(StringInfo buf, float8 f)
+{
+#ifndef INT64_IS_BUSTED
+	/* INT64 works: reinterpret the bits and let pq_sendint64 do the rest */
+	union
+	{
+		float8		f;
+		int64		i;
+	}			pun;
+
+	pun.f = f;
+	pq_sendint64(buf, pun.i);
+#else
+	/* no usable int64: treat the value as two 32-bit halves */
+	union
+	{
+		float8		f;
+		uint32		h[2];
+	}			pun;
+
+	/* index of the half that goes out first */
+#ifdef WORDS_BIGENDIAN
+	const int	first = 0;		/* machine seems to be big-endian */
+#else
+	const int	first = 1;		/* machine seems to be little-endian */
+#endif
+
+	pun.f = f;
+	pun.h[0] = htonl(pun.h[0]);
+	pun.h[1] = htonl(pun.h[1]);
+
+	appendBinaryStringInfo(buf, (char *) &pun.h[first], 4);
+	appendBinaryStringInfo(buf, (char *) &pun.h[1 - first], 4);
+#endif
+}
+
+/* --------------------------------
+ *		pq_endmessage	- send the completed message to the frontend
+ *
+ * myport is passed through to pq_putmessage() as the connection to send on.
+ * buf must have been started with pq_beginmessage(), which left the message
+ * type in buf->cursor.  The result of pq_putmessage() is deliberately
+ * ignored.
+ *
+ * The data buffer is pfree()d, but if the StringInfo was allocated with
+ * makeStringInfo then the caller must still pfree it.
+ * --------------------------------
+ */
+void
+pq_endmessage(Port *myport, StringInfo buf)
+{
+	/* msgtype was saved in cursor field */
+	(void) pq_putmessage(myport, buf->cursor, buf->data, buf->len);
+	/* no need to complain about any failure, since pqcomm.c already did */
+	pfree(buf->data);
+	/* clear the pointer so that the freed storage is not referenced again */
+	buf->data = NULL;
+}
+
+
+/* --------------------------------
+ *		pq_puttextmessage - send a null-terminated string as a message body
+ *
+ *		Hands str, including its terminating null byte, to pq_putmessage
+ *		as the body of a message of type msgtype.  The bytes are passed
+ *		on as they are; no conversion is applied in this routine.  The
+ *		result of pq_putmessage is ignored.
+ * --------------------------------
+ */
+void
+pq_puttextmessage(Port *myport, char msgtype, const char *str)
+{
+	/* body length counts the trailing null byte too */
+	int			nbytes = (int) strlen(str) + 1;
+
+	(void) pq_putmessage(myport, msgtype, str, nbytes);
+}
+
+
+/* --------------------------------
+ *		pq_putemptymessage - convenience routine for message with empty body
+ *
+ *		Calls pq_putmessage with a NULL body pointer and a length of zero;
+ *		its result is ignored.
+ * --------------------------------
+ */
+void
+pq_putemptymessage(Port *myport, char msgtype)
+{
+	(void) pq_putmessage(myport, msgtype, NULL, 0);
+}
+
+
+/* --------------------------------
+ *		pq_getmsgbyte	- get a raw byte from a message buffer
+ *
+ *		Returns the byte at the cursor as a value in 0..255 and advances
+ *		the cursor by one.  Reports an error if the cursor is already at
+ *		or past the end of the message.
+ * --------------------------------
+ */
+int
+pq_getmsgbyte(StringInfo msg)
+{
+	int			pos = msg->cursor;
+
+	if (pos >= msg->len)
+		ereport(ERROR,
+				(EPROTO,
+				 errmsg("no data left in message")));
+
+	msg->cursor = pos + 1;
+	return (unsigned char) msg->data[pos];
+}
+
+/* --------------------------------
+ *		pq_getmsgint	- get a binary integer from a message buffer
+ *
+ *		b is the width in bytes and must be 1, 2 or 4; any other width is
+ *		reported with elog(ERROR).  2- and 4-byte values are converted
+ *		from network byte order.  Values are treated as unsigned.
+ * --------------------------------
+ */
+unsigned int
+pq_getmsgint(StringInfo msg, int b)
+{
+	if (b == 1)
+	{
+		unsigned char octet;
+
+		pq_copymsgbytes(msg, (char *) &octet, 1);
+		return octet;
+	}
+
+	if (b == 2)
+	{
+		uint16		net16;
+
+		pq_copymsgbytes(msg, (char *) &net16, 2);
+		return ntohs(net16);
+	}
+
+	if (b == 4)
+	{
+		uint32		net32;
+
+		pq_copymsgbytes(msg, (char *) &net32, 4);
+		return ntohl(net32);
+	}
+
+	elog(ERROR, "unsupported integer size %d", b);
+	return 0;					/* keep compiler quiet */
+}
+
+/* --------------------------------
+ *		pq_getmsgint64	- get a binary 8-byte int from a message buffer
+ *
+ * Reads two 4-byte halves in network byte order, high-order half first, and
+ * combines them.  When INT64_IS_BUSTED is defined, a value that does not fit
+ * in 32 bits is rejected with an error instead.
+ *
+ * This is kept separate from pq_getmsgint so that the narrower widths don't
+ * have to pay for an int64 result on machines where int64 is slow.
+ * --------------------------------
+ */
+int64
+pq_getmsgint64(StringInfo msg)
+{
+	uint32		hi;
+	uint32		lo;
+	int64		value;
+
+	/* each call consumes four bytes and converts them to host order */
+	hi = pq_getmsgint(msg, 4);
+	lo = pq_getmsgint(msg, 4);
+
+#ifdef INT64_IS_BUSTED
+	/* error out if incoming value is wider than 32 bits */
+	value = lo;
+	if ((value < 0) ? (hi != -1) : (hi != 0))
+		ereport(ERROR,
+				(ERANGE,
+				 errmsg("binary value is out of range for type bigint")));
+#else
+	value = hi;
+	value <<= 32;
+	value |= lo;
+#endif
+
+	return value;
+}
+
+/* --------------------------------
+ *		pq_getmsgfloat4 - get a float4 from a message buffer
+ *
+ * Reads a 4-byte integer with pq_getmsgint and reinterprets its bits as a
+ * float4; this is the inverse of pq_sendfloat4.
+ * --------------------------------
+ */
+float4
+pq_getmsgfloat4(StringInfo msg)
+{
+	union
+	{
+		uint32		i;
+		float4		f;
+	}			pun;
+
+	pun.i = pq_getmsgint(msg, 4);
+
+	return pun.f;
+}
+
+/* --------------------------------
+ *		pq_getmsgfloat8 - get a float8 from a message buffer
+ *
+ * Reads eight bytes and reinterprets them as a float8; this is the inverse
+ * of pq_sendfloat8.
+ * --------------------------------
+ */
+float8
+pq_getmsgfloat8(StringInfo msg)
+{
+#ifndef INT64_IS_BUSTED
+	/* INT64 works: fetch the bits as an int64 and reinterpret them */
+	union
+	{
+		float8		f;
+		int64		i;
+	}			pun;
+
+	pun.i = pq_getmsgint64(msg);
+	return pun.f;
+#else
+	/* no usable int64: assemble the value from two 32-bit halves */
+	union
+	{
+		float8		f;
+		uint32		h[2];
+	}			pun;
+
+	/* index of the half that arrives first */
+#ifdef WORDS_BIGENDIAN
+	const int	first = 0;		/* machine seems to be big-endian */
+#else
+	const int	first = 1;		/* machine seems to be little-endian */
+#endif
+
+	pun.h[first] = pq_getmsgint(msg, 4);
+	pun.h[1 - first] = pq_getmsgint(msg, 4);
+	return pun.f;
+#endif
+}
+
+/* --------------------------------
+ *		pq_getmsgbytes	- get raw data from a message buffer
+ *
+ *		Returns a pointer to the next datalen bytes inside the message
+ *		buffer itself and advances the cursor past them.  The pointer may
+ *		not have any particular alignment.  Reports an error if datalen is
+ *		negative or exceeds the number of unread bytes.
+ * --------------------------------
+ */
+const char *
+pq_getmsgbytes(StringInfo msg, int datalen)
+{
+	int			unread = msg->len - msg->cursor;
+	const char *start;
+
+	if (datalen < 0 || datalen > unread)
+		ereport(ERROR,
+				(EPROTO,
+				 errmsg("insufficient data left in message")));
+
+	start = &msg->data[msg->cursor];
+	msg->cursor += datalen;
+
+	return start;
+}
+
+/* --------------------------------
+ *		pq_copymsgbytes - copy raw data from a message buffer
+ *
+ *		Copies the next datalen bytes of the message into buf, which the
+ *		caller must have sized accordingly, and advances the cursor past
+ *		them.  Reports an error if datalen is negative or exceeds the
+ *		number of unread bytes.
+ * --------------------------------
+ */
+void
+pq_copymsgbytes(StringInfo msg, char *buf, int datalen)
+{
+	int			unread = msg->len - msg->cursor;
+
+	if (datalen < 0 || datalen > unread)
+		ereport(ERROR,
+				(EPROTO,
+				 errmsg("insufficient data left in message")));
+
+	memcpy(buf, msg->data + msg->cursor, datalen);
+	msg->cursor += datalen;
+}
+
+/* --------------------------------
+ *		pq_getmsgtext	- get a counted text string
+ *
+ *		Consumes rawbytes bytes from the message and returns them in a
+ *		freshly palloc'd buffer with a null byte appended.  *nbytes is set
+ *		to rawbytes.  The bytes are copied as they are; no conversion is
+ *		applied in this routine.  Reports an error if rawbytes is negative
+ *		or exceeds the number of unread bytes.
+ * --------------------------------
+ */
+char *
+pq_getmsgtext(StringInfo msg, int rawbytes, int *nbytes)
+{
+	int			unread = msg->len - msg->cursor;
+	const char *src;
+	char	   *copy;
+
+	if (rawbytes < 0 || rawbytes > unread)
+		ereport(ERROR,
+				(EPROTO,
+				 errmsg("insufficient data left in message")));
+
+	src = msg->data + msg->cursor;
+	msg->cursor += rawbytes;
+
+	/* one spare byte for the terminator */
+	copy = (char *) palloc(rawbytes + 1);
+	memcpy(copy, src, rawbytes);
+	copy[rawbytes] = '\0';
+
+	*nbytes = rawbytes;
+	return copy;
+}
+
+/* --------------------------------
+ *		pq_getmsgstring - get a null-terminated text string
+ *
+ *		Returns a pointer directly into the message buffer and advances
+ *		the cursor past the string and its terminating null byte.  No
+ *		conversion is applied in this routine.  Reports an error if no
+ *		null byte is found before the end of the message.
+ * --------------------------------
+ */
+const char *
+pq_getmsgstring(StringInfo msg)
+{
+	char	   *start = msg->data + msg->cursor;
+	int			slen;
+
+	/*
+	 * strlen() cannot run off the end because a StringInfo is guaranteed to
+	 * have a trailing null byte.  That trailing byte is not part of the
+	 * message, though, so the terminator we found must lie inside msg->len.
+	 */
+	slen = strlen(start);
+	if (msg->cursor + slen >= msg->len)
+		ereport(ERROR,
+				(EPROTO,
+				 errmsg("invalid string in message")));
+
+	/* step over the string and its terminator */
+	msg->cursor += slen + 1;
+
+	return start;
+}
+
+/* --------------------------------
+ *		pq_getmsgend	- verify message fully consumed
+ *
+ *		Reports an error unless the cursor sits exactly at the end of the
+ *		message.
+ * --------------------------------
+ */
+void
+pq_getmsgend(StringInfo msg)
+{
+	if (msg->cursor == msg->len)
+		return;
+
+	ereport(ERROR,
+			(EPROTO,
+			 errmsg("invalid message format")));
+}
+
+/* --------------------------------
+ * 		pq_getmsgunreadlen - get length of the unread data in the message
+ * 		buffer
+ *
+ * 		The result is the distance between the cursor and the end of the
+ * 		message (msg->len - msg->cursor); the buffer is not modified.
+ * --------------------------------
+ */
+int
+pq_getmsgunreadlen(StringInfo msg)
+{
+	return msg->len - msg->cursor;
+}
diff --git a/src/gtm/libpq/pqsignal.c b/src/gtm/libpq/pqsignal.c
new file mode 100644
index 0000000000..6bff3d4e14
--- /dev/null
+++ b/src/gtm/libpq/pqsignal.c
@@ -0,0 +1,181 @@
+/*-------------------------------------------------------------------------
+ *
+ * pqsignal.c
+ *	  reliable BSD-style signal(2) routine stolen from RWW who stole it
+ *	  from Stevens...
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/src/backend/libpq/pqsignal.c,v 1.44 2008/01/01 19:45:49 momjian Exp $
+ *
+ * NOTES
+ *		This shouldn't be in libpq, but the monitor and some other
+ *		things need it...
+ *
+ *	A NOTE ABOUT SIGNAL HANDLING ACROSS THE VARIOUS PLATFORMS.
+ *
+ *	pg_config.h defines the macro HAVE_POSIX_SIGNALS for some platforms and
+ *	not for others.  This file and pqsignal.h use that macro to decide
+ *	how to handle signalling.
+ *
+ *	signal(2) handling - this is here because it affects some of
+ *	the frontend commands as well as the backend server.
+ *
+ *	Ultrix and SunOS provide BSD signal(2) semantics by default.
+ *
+ *	SVID2 and POSIX signal(2) semantics differ from BSD signal(2)
+ *	semantics.	We can use the POSIX sigaction(2) on systems that
+ *	allow us to request restartable signals (SA_RESTART).
+ *
+ *	Some systems don't allow restartable signals at all unless we
+ *	link to a special BSD library.
+ *
+ *	We devoutly hope that there aren't any systems that provide
+ *	neither POSIX signals nor BSD signals.	The alternative
+ *	is to do signal-handler reinstallation, which doesn't work well
+ *	at all.
+ * ------------------------------------------------------------------------*/
+
+#include "gtm/gtm.h"
+
+#include <signal.h>
+
+#include "gtm/pqsignal.h"
+
+
+/*
+ * Signal masks filled in by pqinitmask().  They are sigset_t objects when
+ * HAVE_SIGPROCMASK is defined, and plain int bitmasks built with sigmask()
+ * otherwise.
+ */
+#ifdef HAVE_SIGPROCMASK
+sigset_t	UnBlockSig,
+			BlockSig,
+			AuthBlockSig;
+#else
+int			UnBlockSig,
+			BlockSig,
+			AuthBlockSig;
+#endif
+
+
+/*
+ * pqinitmask
+ *
+ * Fill in the three global signal masks.
+ *
+ * BlockSig is the mask used while signals are to be held off.  It covers
+ * the signals we normally expect to get, but never the ones that must not
+ * be turned off.
+ *
+ * AuthBlockSig is the mask used during authentication; it is BlockSig
+ * without SIGTERM, SIGQUIT and SIGALRM.
+ *
+ * UnBlockSig is the mask used when signals are not to be held off; it is
+ * always left empty here.
+ */
+void
+pqinitmask(void)
+{
+#ifdef HAVE_SIGPROCMASK
+
+/* Take one signal out of both BlockSig and AuthBlockSig */
+#define PQ_NEVER_BLOCK(signo) \
+	do { \
+		sigdelset(&BlockSig, (signo)); \
+		sigdelset(&AuthBlockSig, (signo)); \
+	} while (0)
+
+	sigemptyset(&UnBlockSig);
+
+	/* Start from "everything", then carve out the exceptions. */
+	sigfillset(&BlockSig);
+	sigfillset(&AuthBlockSig);
+
+	/*
+	 * These signals are never blocked.  Not every platform has all of these
+	 * names; most do, but each one is guarded for consistency.
+	 */
+#ifdef SIGTRAP
+	PQ_NEVER_BLOCK(SIGTRAP);
+#endif
+#ifdef SIGABRT
+	PQ_NEVER_BLOCK(SIGABRT);
+#endif
+#ifdef SIGILL
+	PQ_NEVER_BLOCK(SIGILL);
+#endif
+#ifdef SIGFPE
+	PQ_NEVER_BLOCK(SIGFPE);
+#endif
+#ifdef SIGSEGV
+	PQ_NEVER_BLOCK(SIGSEGV);
+#endif
+#ifdef SIGBUS
+	PQ_NEVER_BLOCK(SIGBUS);
+#endif
+#ifdef SIGSYS
+	PQ_NEVER_BLOCK(SIGSYS);
+#endif
+#ifdef SIGCONT
+	PQ_NEVER_BLOCK(SIGCONT);
+#endif
+
+#undef PQ_NEVER_BLOCK
+
+	/* Additionally left deliverable during authentication only */
+#ifdef SIGQUIT
+	sigdelset(&AuthBlockSig, SIGQUIT);
+#endif
+#ifdef SIGTERM
+	sigdelset(&AuthBlockSig, SIGTERM);
+#endif
+#ifdef SIGALRM
+	sigdelset(&AuthBlockSig, SIGALRM);
+#endif
+
+#else							/* !HAVE_SIGPROCMASK */
+
+	UnBlockSig = 0;
+
+	/* signals blocked in both situations */
+	AuthBlockSig = sigmask(SIGHUP) |
+		sigmask(SIGINT) | sigmask(SIGUSR1) |
+		sigmask(SIGUSR2) | sigmask(SIGCHLD) |
+		sigmask(SIGWINCH) | sigmask(SIGFPE);
+
+	/* outside of authentication these three are blocked as well */
+	BlockSig = AuthBlockSig |
+		sigmask(SIGQUIT) | sigmask(SIGTERM) | sigmask(SIGALRM);
+
+#endif   /* HAVE_SIGPROCMASK */
+}
+
+
+/* Win32 signal handling is in backend/port/win32/signal.c */
+#ifndef WIN32
+
+/*
+ * pqsignal
+ *
+ * Install func as the handler for signal signo and return the previously
+ * installed handler, or SIG_ERR if sigaction() fails.
+ *
+ * With HAVE_POSIX_SIGNALS the handler is installed through sigaction() with
+ * an empty handler mask.  SA_RESTART is requested for every signal except
+ * SIGALRM, and SA_NOCLDSTOP is added for SIGCHLD where that flag exists.
+ * Without HAVE_POSIX_SIGNALS this is plain signal().
+ */
+pqsigfunc
+pqsignal(int signo, pqsigfunc func)
+{
+#ifdef HAVE_POSIX_SIGNALS
+	struct sigaction newact;
+	struct sigaction oldact;
+	int			flags = 0;
+
+	if (signo != SIGALRM)
+		flags |= SA_RESTART;
+#ifdef SA_NOCLDSTOP
+	if (signo == SIGCHLD)
+		flags |= SA_NOCLDSTOP;
+#endif
+
+	newact.sa_handler = func;
+	sigemptyset(&newact.sa_mask);
+	newact.sa_flags = flags;
+
+	if (sigaction(signo, &newact, &oldact) < 0)
+		return SIG_ERR;
+	return oldact.sa_handler;
+#else
+	return signal(signo, func);
+#endif   /* HAVE_POSIX_SIGNALS */
+}
+
+#endif   /* WIN32 */
diff --git a/src/gtm/libpq/strlcpy.c b/src/gtm/libpq/strlcpy.c
new file mode 100644
index 0000000000..ae031e244c
--- /dev/null
+++ b/src/gtm/libpq/strlcpy.c
@@ -0,0 +1,72 @@
+/*-------------------------------------------------------------------------
+ *
+ * strlcpy.c
+ *	  strncpy done right
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/src/port/strlcpy.c,v 1.5 2008/01/01 19:46:00 momjian Exp $
+ *
+ * This file was taken from OpenBSD and is used on platforms that don't
+ * provide strlcpy().  The OpenBSD copyright terms follow.
+ *-------------------------------------------------------------------------
+ */
+
+/*	$OpenBSD: strlcpy.c,v 1.11 2006/05/05 15:27:38 millert Exp $	*/
+
+/*
+ * Copyright (c) 1998 Todd C. Miller <Todd.Miller@courtesan.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "gtm/gtm_c.h"
+
+
+/*
+ * strlcpy
+ *
+ * Copy the string src into the buffer dst, whose total size is siz bytes.
+ * No more than siz-1 characters are copied, and dst is always
+ * NUL-terminated unless siz is 0, in which case dst is not written at all.
+ *
+ * The return value is strlen(src); a result >= siz tells the caller that
+ * the copy was truncated.
+ * Function creation history:  http://www.gratisoft.us/todd/papers/strlcpy.html
+ */
+size_t
+strlcpy(char *dst, const char *src, size_t siz)
+{
+	const char *s = src;
+	size_t		room;
+
+	/* Copy while there is space for one more character plus the NUL */
+	for (room = siz; room > 1; room--)
+	{
+		if ((*dst++ = *s++) == '\0')
+			return (s - src - 1);	/* everything fit; NUL not counted */
+	}
+
+	/* Out of room: terminate dst (if it has any space at all) ... */
+	if (siz != 0)
+		*dst = '\0';
+
+	/* ... and walk the rest of src just to measure it */
+	while (*s++)
+		;
+
+	return (s - src - 1);		/* count does not include NUL */
+}
diff --git a/src/gtm/main/Makefile b/src/gtm/main/Makefile
new file mode 100644
index 0000000000..7fcdf82a83
--- /dev/null
+++ b/src/gtm/main/Makefile
@@ -0,0 +1,22 @@
+# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+
+top_build_dir=../..
+include $(top_build_dir)/gtm/Makefile.global
+
+# Objects compiled in this directory, followed by the static archives that
+# are produced in the sibling common/, libpq/ and path/ directories.
+OBJS=main.o gtm_thread.o gtm_txn.o gtm_seq.o gtm_snap.o ../common/libgtm.a ../libpq/libpqcomm.a ../path/libgtmpath.a
+LDFLAGS=-L$(top_build_dir)/common -L$(top_build_dir)/libpq
+
+LIBS=-lpthread
+
+# These names are commands, not files; declare them so that a stray file
+# called "all" or "clean" cannot make the target look up to date.
+.PHONY: all clean distclean maintainer-clean
+
+# Libraries go after the objects and archives that reference them: linkers
+# that resolve symbols left to right (or use --as-needed) otherwise drop
+# -lpthread before anything has asked for it.
+gtm:$(OBJS)
+	$(CC) $(CFLAGS) $(LDFLAGS) $^ $(LIBS) -o $@
+
+all:gtm
+
+# NOTE(review): OBJS also names the archives in the sibling directories, so
+# this removes them as well -- confirm that is intended.
+clean:
+	rm -f $(OBJS)
+	rm -f gtm
+
+distclean: clean
+
+maintainer-clean: distclean
diff --git a/src/gtm/main/gtm_seq.c b/src/gtm/main/gtm_seq.c
new file mode 100644
index 0000000000..73af34efd6
--- /dev/null
+++ b/src/gtm/main/gtm_seq.c
@@ -0,0 +1,867 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_seq.c
+ *	Sequence handling on GTM
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "gtm/gtm_c.h"
+#include "gtm/gtm.h"
+#include "gtm/gtm_seq.h"
+#include "gtm/assert.h"
+#include "gtm/gtm_list.h"
+#include "gtm/libpq.h"
+#include "gtm/pqformat.h"
+#include "gtm/gtm_msg.h"
+#include <unistd.h>
+
+typedef struct GTM_SeqInfoHashBucket
+{
+	List		*shb_list;	/* GTM_SeqInfo entries hashing to this bucket */
+	GTM_RWLock	shb_lock;	/* held while shb_list is searched or modified */
+} GTM_SeqInfoHashBucket;
+
+static int SeqStartMagic =	0xfafafafa;	/* written before each saved sequence record */
+static int SeqEndMagic =	0xfefefefe;	/* written after each saved sequence record */
+
+#define SEQ_HASH_TABLE_SIZE		1024	/* number of buckets in GTMSequences */
+static GTM_SeqInfoHashBucket GTMSequences[SEQ_HASH_TABLE_SIZE];	/* global sequence hash table */
+
+static uint32 seq_gethash(GTM_SequenceKey key);
+static bool seq_keys_equal(GTM_SequenceKey key1, GTM_SequenceKey key2);
+static GTM_SeqInfo *seq_find_seqinfo(GTM_SequenceKey seqkey);
+static int seq_release_seqinfo(GTM_SeqInfo *seqinfo);
+static int seq_add_seqinfo(GTM_SeqInfo *seqinfo);
+static int seq_remove_seqinfo(GTM_SeqInfo *seqinfo);
+static GTM_SequenceKey seq_copy_key(GTM_SequenceKey key);
+
+/*
+ * Hash a sequence key to a bucket index: sum of the key bytes modulo
+ * SEQ_HASH_TABLE_SIZE.
+ * XXX This should probably be replaced by a better hash function.
+ */
+static uint32
+seq_gethash(GTM_SequenceKey key)
+{
+	uint32 sum = 0;
+	int pos = 0;
+
+	while (pos < key->gsk_keylen)
+		sum += key->gsk_key[pos++];
+	return (sum % SEQ_HASH_TABLE_SIZE);
+}
+
+/*
+ * Compare two sequence keys: equal means same length and identical bytes.
+ */
+static bool
+seq_keys_equal(GTM_SequenceKey key1, GTM_SequenceKey key2)
+{
+	Assert(key1);
+	Assert(key2);
+
+	/* Lengths match inside the branch, so either length bounds the memcmp */
+	if (key1->gsk_keylen == key2->gsk_keylen)
+		return (memcmp(key1->gsk_key, key2->gsk_key, key1->gsk_keylen) == 0);
+	return false;
+}
+
+/*
+ * Find the seqinfo for the given key and take a reference on it; the caller
+ * must drop it with seq_release_seqinfo(). Returns NULL when no sequence has
+ * the key or the matching sequence is not in SEQ_STATE_ACTIVE state.
+ */
+static GTM_SeqInfo *
+seq_find_seqinfo(GTM_SequenceKey seqkey)
+{
+	uint32 hash = seq_gethash(seqkey);
+	GTM_SeqInfoHashBucket *bucket = &GTMSequences[hash];
+	ListCell *elem;
+	GTM_SeqInfo *curr_seqinfo = NULL;
+
+	GTM_RWLockAcquire(&bucket->shb_lock, GTM_LOCKMODE_READ);
+	foreach(elem, bucket->shb_list)
+	{
+		curr_seqinfo = (GTM_SeqInfo *) lfirst(elem);
+		if (seq_keys_equal(curr_seqinfo->gs_key, seqkey))
+			break;
+		curr_seqinfo = NULL;
+	}
+
+	if (curr_seqinfo != NULL)
+	{
+		GTM_RWLockAcquire(&curr_seqinfo->gs_lock, GTM_LOCKMODE_WRITE);
+		if (curr_seqinfo->gs_state == SEQ_STATE_ACTIVE)
+		{
+			Assert(curr_seqinfo->gs_ref_count != SEQ_MAX_REFCOUNT);
+			curr_seqinfo->gs_ref_count++;
+			GTM_RWLockRelease(&curr_seqinfo->gs_lock);
+		}
+		else
+		{
+			elog(LOG, "Sequence not active");
+			GTM_RWLockRelease(&curr_seqinfo->gs_lock);
+			curr_seqinfo = NULL;	/* fall through: bucket lock must be released */
+		}
+	}
+	GTM_RWLockRelease(&bucket->shb_lock);
+
+	return curr_seqinfo;
+}
+
+/*
+ * Drop a reference obtained from seq_find_seqinfo(). When the last reference
+ * to a sequence marked SEQ_STATE_DELETED goes away, the structure is
+ * unlinked from the global hash table. Always returns 0.
+ */
+static int
+seq_release_seqinfo(GTM_SeqInfo *seqinfo)
+{
+	bool last_ref_of_deleted;
+
+	GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE);
+	Assert(seqinfo->gs_ref_count > 0);
+	seqinfo->gs_ref_count--;
+
+	last_ref_of_deleted = (seqinfo->gs_state == SEQ_STATE_DELETED) &&
+						  (seqinfo->gs_ref_count == 0);
+	GTM_RWLockRelease(&seqinfo->gs_lock);
+
+	/* Unlink only after gs_lock is dropped; removal takes the bucket lock */
+	if (last_ref_of_deleted)
+		seq_remove_seqinfo(seqinfo);
+
+	return 0;
+}
+
+/*
+ * Insert a seqinfo structure into the global hash table. Returns 0 on
+ * success, or EEXIST (after logging) if the key is already present.
+ */
+static int
+seq_add_seqinfo(GTM_SeqInfo *seqinfo)
+{
+	GTM_SeqInfoHashBucket *bucket = &GTMSequences[seq_gethash(seqinfo->gs_key)];
+	ListCell *cell;
+	bool duplicate = false;
+
+	GTM_RWLockAcquire(&bucket->shb_lock, GTM_LOCKMODE_WRITE);
+	/* Scan the bucket for an entry that already uses this key */
+	foreach(cell, bucket->shb_list)
+	{
+		GTM_SeqInfo *existing = (GTM_SeqInfo *) lfirst(cell);
+		if (seq_keys_equal(existing->gs_key, seqinfo->gs_key))
+		{
+			duplicate = true;
+			break;
+		}
+	}
+
+	/* Safe to add the structure to the list when no duplicate was found */
+	if (!duplicate)
+		bucket->shb_list = lappend(bucket->shb_list, seqinfo);
+	GTM_RWLockRelease(&bucket->shb_lock);
+
+	if (duplicate)
+	{
+		ereport(LOG,
+				(EEXIST,
+				 errmsg("Sequence with the given key already exists")));
+		return EEXIST;
+	}
+	return 0;
+}
+
+/*
+ * Remove the seqinfo structure from the global hash table. If other
+ * references exist (gs_ref_count > 1), the structure is only marked
+ * SEQ_STATE_DELETED and EBUSY is returned; it is removed later, when the
+ * final reference is released. Returns 0 once the structure is unlinked.
+ */
+static int
+seq_remove_seqinfo(GTM_SeqInfo *seqinfo)
+{
+	GTM_SeqInfoHashBucket *bucket = &GTMSequences[seq_gethash(seqinfo->gs_key)];
+	int result = 0;
+
+	/* Lock order: bucket first, then the sequence itself */
+	GTM_RWLockAcquire(&bucket->shb_lock, GTM_LOCKMODE_WRITE);
+	GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE);
+
+	if (seqinfo->gs_ref_count > 1)
+	{
+		/* Still referenced elsewhere: defer the removal */
+		seqinfo->gs_state = SEQ_STATE_DELETED;
+		result = EBUSY;
+	}
+	else
+		bucket->shb_list = list_delete(bucket->shb_list, seqinfo);
+
+	GTM_RWLockRelease(&seqinfo->gs_lock);
+	GTM_RWLockRelease(&bucket->shb_lock);
+
+	return result;
+}
+
+/* Duplicate a sequence key: header and key bytes share one allocation. */
+static GTM_SequenceKey
+seq_copy_key(GTM_SequenceKey key)
+{
+	GTM_SequenceKey copy;
+
+	/*
+	 * Allocate from TopMostMemoryContext: sequence information is not bound
+	 * to a thread and can outlive any of the thread specific contexts.
+	 */
+	copy = (GTM_SequenceKey) MemoryContextAlloc(TopMostMemoryContext,
+												sizeof(GTM_SequenceKeyData) +
+												key->gsk_keylen);
+	if (copy == NULL)
+		ereport(ERROR, (ENOMEM, errmsg("Out of memory")));
+
+	/* The key bytes live immediately after the header */
+	copy->gsk_key = (char *) copy + sizeof (GTM_SequenceKeyData);
+	copy->gsk_keylen = key->gsk_keylen;
+	memcpy(copy->gsk_key, key->gsk_key, key->gsk_keylen);
+
+	return copy;
+}
+
+/*
+ * Initialize a new sequence and add it to the global hash table. Arguments
+ * for which SEQVAL_IS_VALID() is false are replaced by defaults.
+ *
+ * Returns 0 on success, or the error code of seq_add_seqinfo() (EEXIST for
+ * a duplicate key). A max/start value outside the allowed range is reported
+ * with ereport(ERROR) after the half-built sequence has been freed.
+ */
+int
+GTM_SeqOpen(GTM_SequenceKey seqkey,
+			GTM_Sequence increment_by,
+			GTM_Sequence minval,
+			GTM_Sequence maxval,
+			GTM_Sequence startval,
+			bool cycle)
+{
+	GTM_SeqInfo *seqinfo = NULL;
+	const char *range_error = NULL;	/* first failed range check, if any */
+	int errcode = 0;
+
+	seqinfo = (GTM_SeqInfo *) palloc(sizeof (GTM_SeqInfo));
+
+	if (seqinfo == NULL)
+		ereport(ERROR, (ENOMEM, errmsg("Out of memory")));
+
+	GTM_RWLockInit(&seqinfo->gs_lock);
+
+	seqinfo->gs_ref_count = 0;
+	seqinfo->gs_key = seq_copy_key(seqkey);
+	seqinfo->gs_state = SEQ_STATE_ACTIVE;
+	seqinfo->gs_called = false;
+
+	/* Set the increment. Default is 1 */
+	if (SEQVAL_IS_VALID(increment_by))
+		seqinfo->gs_increment_by = increment_by;
+	else
+		seqinfo->gs_increment_by = 1;
+
+	/*
+	 * If minval is specified, set the minvalue to the given minval,
+	 * otherwise set to the defaults
+	 */
+	if (SEQVAL_IS_VALID(minval))
+		seqinfo->gs_min_value = minval;
+	else if (SEQ_IS_ASCENDING(seqinfo))
+		seqinfo->gs_min_value = SEQ_DEF_MIN_SEQVAL_ASCEND;
+	else
+		seqinfo->gs_min_value = SEQ_DEF_MIN_SEQVAL_DESCEND;
+
+	/*
+	 * If maxval is specified, set the maxvalue to the given maxval, otherwise
+	 * set to the defaults depending on whether the sequence is ascending or
+	 * descending. Range violations are only recorded here and raised below.
+	 */
+	if (SEQVAL_IS_VALID(maxval))
+	{
+		if (maxval < seqinfo->gs_min_value)
+			range_error = "Max value must be greater than min value";
+		seqinfo->gs_max_value = maxval;
+	}
+	else if (SEQ_IS_ASCENDING(seqinfo))
+		seqinfo->gs_max_value = SEQ_DEF_MAX_SEQVAL_ASCEND;
+	else
+		seqinfo->gs_max_value = SEQ_DEF_MAX_SEQVAL_DESCEND;
+
+	/*
+	 * Set the startval if specified. It must lie between min and max values
+	 */
+	if (SEQVAL_IS_VALID(startval))
+	{
+		if (range_error == NULL && startval < seqinfo->gs_min_value)
+			range_error = "Start value must be greater than or equal to the min value";
+		if (range_error == NULL && startval > seqinfo->gs_max_value)
+			range_error = "Start value must be less than or equal to the max value";
+		seqinfo->gs_init_value = seqinfo->gs_value = startval;
+	}
+	else if (SEQ_IS_ASCENDING(seqinfo))
+		seqinfo->gs_init_value = seqinfo->gs_value = SEQ_DEF_MIN_SEQVAL_ASCEND;
+	else
+		seqinfo->gs_init_value = seqinfo->gs_value = SEQ_DEF_MIN_SEQVAL_DESCEND;
+
+	/* Should we wrap around ? */
+	seqinfo->gs_cycle = cycle;
+
+	/*
+	 * Register the sequence unless a range check failed. On any failure free
+	 * the lock, key and structure first so that raising the error leaks nothing.
+	 */
+	errcode = (range_error != NULL) ? ERANGE : seq_add_seqinfo(seqinfo);
+	if (errcode != 0)
+	{
+		GTM_RWLockDestroy(&seqinfo->gs_lock);
+		pfree(seqinfo->gs_key);
+		pfree(seqinfo);
+		if (range_error != NULL)
+			ereport(ERROR, (ERANGE, errmsg("%s", range_error)));
+	}
+	return errcode;
+}
+
+/*
+ * Recreate a sequence from previously saved state and add it to the global
+ * hash table. Values are taken as given, without defaults or range checks.
+ * Returns 0 on success or the error code of seq_add_seqinfo().
+ */
+static int
+GTM_SeqRestore(GTM_SequenceKey seqkey,
+			GTM_Sequence increment_by,
+			GTM_Sequence minval,
+			GTM_Sequence maxval,
+			GTM_Sequence startval,
+			GTM_Sequence curval,
+			int32 state,
+			bool cycle,
+			bool called)
+{
+	GTM_SeqInfo *restored = (GTM_SeqInfo *) palloc(sizeof (GTM_SeqInfo));
+	int status;
+
+	if (restored == NULL)
+		ereport(ERROR, (ENOMEM, errmsg("Out of memory")));
+
+	GTM_RWLockInit(&restored->gs_lock);
+
+	/* Bookkeeping: nobody references the sequence yet */
+	restored->gs_ref_count = 0;
+	restored->gs_key = seq_copy_key(seqkey);
+	restored->gs_state = state;
+	restored->gs_called = called;
+
+	/* Definition of the sequence, including whether it wraps around */
+	restored->gs_increment_by = increment_by;
+	restored->gs_min_value = minval;
+	restored->gs_max_value = maxval;
+	restored->gs_cycle = cycle;
+
+	restored->gs_init_value = startval;
+	restored->gs_value = curval;
+
+	status = seq_add_seqinfo(restored);
+	if (status != 0)
+	{
+		GTM_RWLockDestroy(&restored->gs_lock);
+		pfree(restored->gs_key);
+		pfree(restored);
+	}
+	return status;
+}
+/*
+ * Destroy the given sequence. Returns 0 on success, or EINVAL if no active
+ * sequence has the given key.
+ */
+int
+GTM_SeqClose(GTM_SequenceKey seqkey)
+{
+	GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey);
+	if (seqinfo == NULL)
+		return EINVAL;
+	if (seq_remove_seqinfo(seqinfo) != 0)	/* still referenced elsewhere: */
+		return seq_release_seqinfo(seqinfo);	/* only marked deleted, don't free */
+	GTM_RWLockDestroy(&seqinfo->gs_lock);
+	pfree(seqinfo->gs_key);
+	pfree(seqinfo);
+	return 0;
+}
+
+/*
+ * Get current value for the sequence without incrementing it.
+ *
+ * Returns InvalidSequenceValue if no active sequence has the given key.
+ */
+GTM_Sequence
+GTM_SeqGetCurrent(GTM_SequenceKey seqkey)
+{
+	GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey);
+	GTM_Sequence value;
+
+	if (seqinfo == NULL)
+	{
+		ereport(LOG,
+				(EINVAL,
+				 errmsg("The sequence with the given key does not exist")));
+		/* An errno such as EINVAL would be taken for a sequence value */
+		return InvalidSequenceValue;
+	}
+
+	GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE);
+
+	/* First call: start from the init value and mark the sequence 'called' */
+	if (!SEQ_IS_CALLED(seqinfo))
+	{
+		seqinfo->gs_value = seqinfo->gs_init_value;
+		seqinfo->gs_called = true;
+	}
+	value = seqinfo->gs_value;
+	GTM_RWLockRelease(&seqinfo->gs_lock);
+	seq_release_seqinfo(seqinfo);
+	return value;
+}
+
+/*
+ * Get next value for the sequence, or InvalidSequenceValue on failure
+ */
+GTM_Sequence
+GTM_SeqGetNext(GTM_SequenceKey seqkey)
+{
+	GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey);
+	GTM_Sequence value;
+
+	if (seqinfo == NULL)
+	{
+		ereport(LOG,
+				(EINVAL,
+				 errmsg("The sequence with the given key does not exist")));
+		return InvalidSequenceValue;	/* EINVAL would look like a value */
+	}
+
+	GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE);
+
+	/*
+	 * If the sequence is called for the first time, initialize the value and
+	 * return the start value
+	 */
+	if (!SEQ_IS_CALLED(seqinfo))
+	{
+		value = seqinfo->gs_value = seqinfo->gs_init_value;
+		seqinfo->gs_called = true;
+		GTM_RWLockRelease(&seqinfo->gs_lock);
+		seq_release_seqinfo(seqinfo);
+		return value;
+	}
+
+	if (SEQ_IS_ASCENDING(seqinfo))
+	{
+		/*
+		 * Check if the sequence is about to wrap-around. If the sequence does
+		 * not support wrap-around, log an error and return
+		 * InvalidSequenceValue
+		 */
+		if (seqinfo->gs_max_value - seqinfo->gs_increment_by >= seqinfo->gs_value)
+			value = seqinfo->gs_value = seqinfo->gs_value + seqinfo->gs_increment_by;
+		else if (SEQ_IS_CYCLE(seqinfo))
+			value = seqinfo->gs_value = seqinfo->gs_min_value;
+		else
+		{
+			GTM_RWLockRelease(&seqinfo->gs_lock);
+			seq_release_seqinfo(seqinfo);
+			ereport(LOG,
+					(ERANGE,
+					 errmsg("Sequence reached maximum value")));
+			return InvalidSequenceValue;
+		}
+	}
+	else
+	{
+		/*
+		 * Check if the sequence is about to wrap-around. If the sequence does
+		 * not support wrap-around, log an error and return
+		 * InvalidSequenceValue, otherwise wrap around the sequence and reset
+		 * it to the max value.
+		 *
+		 * Note: The gs_increment_by is a signed integer and is negative for
+		 * descending sequences. So we don't need special handling below
+		 */
+		if (seqinfo->gs_min_value - seqinfo->gs_increment_by <= seqinfo->gs_value)
+			value = seqinfo->gs_value = seqinfo->gs_value + seqinfo->gs_increment_by;
+		else if (SEQ_IS_CYCLE(seqinfo))
+			value = seqinfo->gs_value = seqinfo->gs_max_value;
+		else
+		{
+			GTM_RWLockRelease(&seqinfo->gs_lock);
+			seq_release_seqinfo(seqinfo);
+			ereport(LOG,
+					(ERANGE,
+					 errmsg("Sequence reached minimum value")));
+			return InvalidSequenceValue;
+		}
+
+	}
+	GTM_RWLockRelease(&seqinfo->gs_lock);
+	seq_release_seqinfo(seqinfo);
+	return value;
+}
+
+/*
+ * Reset the sequence: its current value goes back to the initial value.
+ * Returns 0 on success, EINVAL if no active sequence has the given key.
+ */
+int
+GTM_SeqReset(GTM_SequenceKey seqkey)
+{
+	GTM_SeqInfo *target = seq_find_seqinfo(seqkey);
+
+	if (target != NULL)
+	{
+		GTM_RWLockAcquire(&target->gs_lock, GTM_LOCKMODE_WRITE);
+		target->gs_value = target->gs_init_value;
+		GTM_RWLockRelease(&target->gs_lock);
+		seq_release_seqinfo(target);
+		return 0;
+	}
+
+	ereport(LOG,
+			(EINVAL,
+			 errmsg("The sequence with the given key does not exist")));
+	return EINVAL;
+}
+
+void
+GTM_InitSeqManager(void)
+{
+	GTM_SeqInfoHashBucket *bucket;
+	/* Start every hash bucket with an empty list and a fresh lock */
+	for (bucket = GTMSequences; bucket < GTMSequences + SEQ_HASH_TABLE_SIZE; bucket++)
+	{
+		bucket->shb_list = NIL;
+		GTM_RWLockInit(&bucket->shb_lock);
+	}
+}
+
+/*
+ * Process MSG_SEQUENCE_INIT message: read the key and parameters of a new
+ * sequence, create it and reply with SEQUENCE_INIT_RESULT and the key.
+ */
+void
+ProcessSequenceInitCommand(Port *myport, StringInfo message)
+{
+	GTM_SequenceKeyData seqkey;
+	GTM_Sequence increment, minval, maxval, startval;
+	bool cycle;
+	StringInfoData buf;
+	int errcode;
+	MemoryContext oldContext;
+
+	/*
+	 * Get the sequence key
+	 */
+	seqkey.gsk_keylen = pq_getmsgint(message, sizeof (seqkey.gsk_keylen));
+	seqkey.gsk_key = (char *)pq_getmsgbytes(message, seqkey.gsk_keylen);
+
+	/*
+	 * Read various sequence parameters
+	 */
+	memcpy(&increment, pq_getmsgbytes(message, sizeof (GTM_Sequence)),
+		   sizeof (GTM_Sequence));
+	memcpy(&minval, pq_getmsgbytes(message, sizeof (GTM_Sequence)),
+		   sizeof (GTM_Sequence));
+	memcpy(&maxval, pq_getmsgbytes(message, sizeof (GTM_Sequence)),
+		   sizeof (GTM_Sequence));
+	memcpy(&startval, pq_getmsgbytes(message, sizeof (GTM_Sequence)),
+		   sizeof (GTM_Sequence));
+
+	cycle = pq_getmsgbyte(message);
+
+	/*
+	 * We must use the TopMostMemoryContext because the sequence information is
+	 * not bound to a thread and can outlive any of the thread specific
+	 * contexts. The previous context is restored before any error is raised,
+	 * and the result of GTM_SeqOpen is kept so the real code is reported.
+	 */
+	oldContext = MemoryContextSwitchTo(TopMostMemoryContext);
+	errcode = GTM_SeqOpen(&seqkey, increment, minval, maxval, startval, cycle);
+	MemoryContextSwitchTo(oldContext);
+
+	if (errcode != 0)
+		ereport(ERROR,
+				(errcode,
+				 errmsg("Failed to open a new sequence")));
+
+	pq_getmsgend(message);
+
+	/*
+	 * Send a SUCCESS message back to the client
+	 */
+	pq_beginmessage(&buf, 'S');
+	pq_sendint(&buf, SEQUENCE_INIT_RESULT, 4);
+	if (myport->is_proxy)
+	{
+		GTM_ProxyMsgHeader proxyhdr;
+		proxyhdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+	}
+	pq_sendint(&buf, seqkey.gsk_keylen, 4);
+	pq_sendbytes(&buf, seqkey.gsk_key, seqkey.gsk_keylen);
+	pq_endmessage(myport, &buf);
+
+	if (!myport->is_proxy)
+		pq_flush(myport);
+}
+
+/*
+ * Process MSG_SEQUENCE_GET_CURRENT message: reply with key and current value
+ */
+void
+ProcessSequenceGetCurrentCommand(Port *myport, StringInfo message)
+{
+	GTM_SequenceKeyData key;
+	GTM_Sequence current;
+	StringInfoData reply;
+
+	key.gsk_keylen = pq_getmsgint(message, sizeof (key.gsk_keylen));
+	key.gsk_key = (char *)pq_getmsgbytes(message, key.gsk_keylen);
+
+	current = GTM_SeqGetCurrent(&key);
+	if (!SEQVAL_IS_VALID(current))
+		ereport(ERROR,
+				(ERANGE, errmsg("Can not get current value of the sequence")));
+
+	/* Build the reply; a proxy connection gets a header with its connection id */
+	pq_beginmessage(&reply, 'S');
+	pq_sendint(&reply, SEQUENCE_GET_CURRENT_RESULT, 4);
+	if (myport->is_proxy)
+	{
+		GTM_ProxyMsgHeader hdr;
+		hdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&reply, (char *)&hdr, sizeof (GTM_ProxyMsgHeader));
+	}
+	pq_sendint(&reply, key.gsk_keylen, 4);
+	pq_sendbytes(&reply, key.gsk_key, key.gsk_keylen);
+	pq_sendbytes(&reply, (char *)&current, sizeof (GTM_Sequence));
+	pq_endmessage(myport, &reply);
+
+	if (!myport->is_proxy)
+		pq_flush(myport);
+}
+
+/*
+ * Process MSG_SEQUENCE_GET_NEXT message: reply with key and next value
+ */
+void
+ProcessSequenceGetNextCommand(Port *myport, StringInfo message)
+{
+	GTM_SequenceKeyData seqkey;
+	StringInfoData buf;
+	GTM_Sequence seqval;
+
+	seqkey.gsk_keylen = pq_getmsgint(message, sizeof (seqkey.gsk_keylen));
+	seqkey.gsk_key = (char *)pq_getmsgbytes(message, seqkey.gsk_keylen);
+
+	seqval = GTM_SeqGetNext(&seqkey);
+	if (!SEQVAL_IS_VALID(seqval))
+		ereport(ERROR,
+				(ERANGE,
+				 errmsg("Can not get next value of the sequence")));
+
+	pq_beginmessage(&buf, 'S');
+	pq_sendint(&buf, SEQUENCE_GET_NEXT_RESULT, 4);
+	if (myport->is_proxy)
+	{
+		GTM_ProxyMsgHeader proxyhdr;
+		proxyhdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+	}
+	pq_sendint(&buf, seqkey.gsk_keylen, 4);
+	pq_sendbytes(&buf, seqkey.gsk_key, seqkey.gsk_keylen);
+	pq_sendbytes(&buf, (char *)&seqval, sizeof (GTM_Sequence));
+	pq_endmessage(myport, &buf);
+
+	if (!myport->is_proxy)
+		pq_flush(myport);
+}
+
+/*
+ * Process MSG_SEQUENCE_RESET message: reset the named sequence and reply
+ * with SEQUENCE_RESET_RESULT followed by the key.
+ */
+void
+ProcessSequenceResetCommand(Port *myport, StringInfo message)
+{
+	GTM_SequenceKeyData key;
+	StringInfoData reply;
+	int errcode;
+
+	key.gsk_keylen = pq_getmsgint(message, sizeof (key.gsk_keylen));
+	key.gsk_key = (char *)pq_getmsgbytes(message, key.gsk_keylen);
+
+	errcode = GTM_SeqReset(&key);
+	if (errcode != 0)
+		ereport(ERROR, (errcode, errmsg("Can not reset the sequence")));
+
+	pq_beginmessage(&reply, 'S');
+	pq_sendint(&reply, SEQUENCE_RESET_RESULT, 4);
+	if (myport->is_proxy)
+	{
+		GTM_ProxyMsgHeader hdr;
+		hdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&reply, (char *)&hdr, sizeof (GTM_ProxyMsgHeader));
+	}
+	pq_sendint(&reply, key.gsk_keylen, 4);
+	pq_sendbytes(&reply, key.gsk_key, key.gsk_keylen);
+	pq_endmessage(myport, &reply);
+
+	if (!myport->is_proxy)
+		pq_flush(myport);
+}
+
+/*
+ * Process MSG_SEQUENCE_CLOSE message: close the named sequence and reply
+ * with SEQUENCE_CLOSE_RESULT followed by the key.
+ */
+void
+ProcessSequenceCloseCommand(Port *myport, StringInfo message)
+{
+	GTM_SequenceKeyData key;
+	StringInfoData reply;
+	int errcode;
+
+	key.gsk_keylen = pq_getmsgint(message, sizeof (key.gsk_keylen));
+	key.gsk_key = (char *)pq_getmsgbytes(message, key.gsk_keylen);
+
+	errcode = GTM_SeqClose(&key);
+	if (errcode != 0)
+		ereport(ERROR, (errcode, errmsg("Can not close the sequence")));
+
+	pq_beginmessage(&reply, 'S');
+	pq_sendint(&reply, SEQUENCE_CLOSE_RESULT, 4);
+	if (myport->is_proxy)
+	{
+		GTM_ProxyMsgHeader hdr;
+		hdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&reply, (char *)&hdr, sizeof (GTM_ProxyMsgHeader));
+	}
+	pq_sendint(&reply, key.gsk_keylen, 4);
+	pq_sendbytes(&reply, key.gsk_key, key.gsk_keylen);
+	pq_endmessage(myport, &reply);
+
+	if (!myport->is_proxy)
+		pq_flush(myport);
+}
+
+void
+GTM_SaveSeqInfo(int ctlfd)
+{
+	bool failed = false;	/* set when any write() fails or is short */
+	int hash;
+	/* Save every sequence not marked deleted to ctlfd; log write failures */
+#define SEQ_SAVE(p, n)	(failed |= (write(ctlfd, (p), (n)) != (ssize_t) (n)))
+	for (hash = 0; hash < SEQ_HASH_TABLE_SIZE; hash++)
+	{
+		GTM_SeqInfoHashBucket *bucket = &GTMSequences[hash];
+		ListCell *elem;
+
+		GTM_RWLockAcquire(&bucket->shb_lock, GTM_LOCKMODE_READ);
+		foreach(elem, bucket->shb_list)
+		{
+			GTM_SeqInfo *seqinfo = (GTM_SeqInfo *) lfirst(elem);
+			if (seqinfo == NULL)
+				break;
+
+			/* Read gs_state under gs_lock; deleted sequences are not saved */
+			GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_READ);
+			if (seqinfo->gs_state != SEQ_STATE_DELETED)
+			{
+				SEQ_SAVE(&SeqStartMagic, sizeof (SeqStartMagic));
+				SEQ_SAVE(&seqinfo->gs_key->gsk_keylen, sizeof (uint32));
+				SEQ_SAVE(seqinfo->gs_key->gsk_key, seqinfo->gs_key->gsk_keylen);
+				SEQ_SAVE(&seqinfo->gs_value, sizeof (GTM_Sequence));
+				SEQ_SAVE(&seqinfo->gs_init_value, sizeof (GTM_Sequence));
+				SEQ_SAVE(&seqinfo->gs_increment_by, sizeof (GTM_Sequence));
+				SEQ_SAVE(&seqinfo->gs_min_value, sizeof (GTM_Sequence));
+				SEQ_SAVE(&seqinfo->gs_max_value, sizeof (GTM_Sequence));
+				SEQ_SAVE(&seqinfo->gs_cycle, sizeof (bool));
+				SEQ_SAVE(&seqinfo->gs_called, sizeof (bool));
+				SEQ_SAVE(&seqinfo->gs_state, sizeof (int32));
+				SEQ_SAVE(&SeqEndMagic, sizeof(SeqEndMagic));
+			}
+			GTM_RWLockRelease(&seqinfo->gs_lock);
+		}
+		GTM_RWLockRelease(&bucket->shb_lock);
+	}
+#undef SEQ_SAVE
+
+	if (failed)
+		elog(LOG, "Failed to save sequence information to the control file");
+}
+
+/* Recreate the sequences saved by GTM_SaveSeqInfo() from the control file. */
+void
+GTM_RestoreSeqInfo(int ctlfd)
+{
+	int magic;
+
+	if (ctlfd == -1)
+		return;
+
+	while (read(ctlfd, &magic, sizeof (SeqStartMagic)) == sizeof (SeqStartMagic))
+	{
+		GTM_SequenceKeyData seqkey;
+		GTM_Sequence increment_by, minval, maxval, startval, curval;
+		int32 state;
+		bool cycle, called;
+
+		if (magic != SeqStartMagic)
+		{
+			elog(LOG, "Start magic mismatch %x - %x", magic, SeqStartMagic);
+			break;
+		}
+
+		if (read(ctlfd, &seqkey.gsk_keylen, sizeof (uint32)) != sizeof (uint32))
+		{
+			elog(LOG, "Failed to read keylen");
+			break;
+		}
+
+		seqkey.gsk_key = palloc(seqkey.gsk_keylen);
+		read(ctlfd, seqkey.gsk_key, seqkey.gsk_keylen);
+
+		read(ctlfd, &curval, sizeof (GTM_Sequence));
+		read(ctlfd, &startval, sizeof (GTM_Sequence));
+		read(ctlfd, &increment_by, sizeof (GTM_Sequence));
+		read(ctlfd, &minval, sizeof (GTM_Sequence));
+		read(ctlfd, &maxval, sizeof (GTM_Sequence));
+		read(ctlfd, &cycle, sizeof (bool));
+		read(ctlfd, &called, sizeof (bool));
+		read(ctlfd, &state, sizeof (int32));
+		read(ctlfd, &magic, sizeof(SeqEndMagic));
+
+		/* The end marker validates the record that was just read */
+		if (magic != SeqEndMagic)
+		{
+			elog(WARNING, "Corrupted control file");
+			pfree(seqkey.gsk_key);
+			return;
+		}
+
+		GTM_SeqRestore(&seqkey, increment_by, minval, maxval, startval, curval,
+				state, cycle, called);
+		/* GTM_SeqRestore() keeps its own copy of the key; free our buffer */
+		pfree(seqkey.gsk_key);
+	}
+}
diff --git a/src/gtm/main/gtm_snap.c b/src/gtm/main/gtm_snap.c
new file mode 100644
index 0000000000..5c9b4b2ae5
--- /dev/null
+++ b/src/gtm/main/gtm_snap.c
@@ -0,0 +1,466 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_snap.c
+ *	Snapshot handling on GTM
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "gtm/gtm_c.h"
+#include "gtm/elog.h"
+#include "gtm/palloc.h"
+#include "gtm/gtm.h"
+#include "gtm/gtm_txn.h"
+#include "gtm/assert.h"
+#include "gtm/stringinfo.h"
+#include "gtm/libpq.h"
+#include "gtm/pqformat.h"
+#include "gtm/gtm_msg.h"
+
+
+/*
+ * Get snapshot for the given transactions. If this is the first call in the
+ * transaction, a fresh snapshot is taken and returned back. For a serializable
+ * transaction, repeated calls to the function will return the same snapshot.
+ * For a read-committed transaction, fresh snapshot is taken every time and
+ * returned to the caller.
+ *
+ * The returned snapshot includes xmin (lowest still-running xact ID),
+ * xmax (highest completed xact ID + 1), and a list of running xact IDs
+ * in the range xmin <= xid < xmax.  It is used as follows:
+ *		All xact IDs < xmin are considered finished.
+ *		All xact IDs >= xmax are considered still running.
+ *		For an xact ID xmin <= xid < xmax, consult list to see whether
+ *		it is considered running or not.
+ * This ensures that the set of transactions seen as "running" by the
+ * current xact will not change after it takes the snapshot.
+ *
+ * All running top-level XIDs are included in the snapshot.
+ *
+ * We also update GTMTransactions.gt_recent_global_xmin, the global xmin
+ * (smallest xmin/gxid seen among the open transactions that were scanned).
+ *
+ * status[] must have txn_count entries; it is zeroed and then set to
+ * STATUS_ERROR for each handle without transaction info. Returns NULL when
+ * no handle is valid, otherwise the snapshot of the first valid transaction.
+ */
+GTM_Snapshot
+GTM_GetTransactionSnapshot(GTM_TransactionHandle handle[], int txn_count, int *status)
+{
+	GlobalTransactionId xmin;
+	GlobalTransactionId xmax;
+	GlobalTransactionId globalxmin;
+	int			count = 0;
+	ListCell *elem = NULL;
+	int ii;
+
+	/*
+	 * Instead of allocating memory for a snapshot, we use the snapshot of the
+	 * first transaction in the given array. The same snapshot will later be
+	 * copied to other transaction info structures.
+	 */
+	GTM_TransactionInfo *mygtm_txninfo = NULL;
+	GTM_Snapshot snapshot = NULL;
+
+	memset(status, 0, sizeof (int) * txn_count);
+
+	for (ii = 0; ii < txn_count; ii++)
+	{
+		mygtm_txninfo = GTM_HandleToTransactionInfo(handle[ii]);
+
+		/*
+		 * If the transaction does not exist, just mark the status field with
+		 * a STATUS_ERROR code
+		 */
+		if (mygtm_txninfo == NULL)
+			status[ii] = STATUS_ERROR;
+		else if (snapshot == NULL)
+			snapshot = &mygtm_txninfo->gti_current_snapshot;
+	}
+
+	/*
+	 * If no valid transaction exists in the array, send an error message back.
+	 * Otherwise, we should still get the snapshot and send it back. The
+	 * invalid transaction ids are marked separately in the status array.
+	 */
+	if (snapshot == NULL)
+		return NULL;
+
+	Assert(snapshot != NULL);
+
+	if (snapshot->sn_xip == NULL)
+	{
+		/*
+		 * First call for this snapshot
+		 */
+		snapshot->sn_xip = (GlobalTransactionId *)
+			palloc(GTM_MAX_GLOBAL_TRANSACTIONS * sizeof(GlobalTransactionId));
+		if (snapshot->sn_xip == NULL)
+			ereport(ERROR,
+					(ENOMEM,
+					 errmsg("out of memory")));
+	}
+
+	/*
+	 * A shared (read) lock on gt_TransArrayLock is taken for the scan, even
+	 * though gti_xmin of the given transactions is set further below.
+	 */
+	GTM_RWLockAcquire(&GTMTransactions.gt_TransArrayLock, GTM_LOCKMODE_READ);
+
+	/* xmax is always latestCompletedXid + 1 */
+	xmax = GTMTransactions.gt_latestCompletedXid;
+	Assert(GlobalTransactionIdIsNormal(xmax));
+	GlobalTransactionIdAdvance(xmax);
+
+	/* initialize xmin calculation with xmax */
+	globalxmin = xmin = xmax;
+
+	/*
+	 * Spin over transaction list checking xid, xmin, and subxids.  The goal is to
+	 * gather all active xids and find the lowest xmin
+	 */
+	foreach(elem, GTMTransactions.gt_open_transactions) 
+	{
+		volatile GTM_TransactionInfo *gtm_txninfo = (GTM_TransactionInfo *)lfirst(elem);
+		GlobalTransactionId xid;
+
+		/* Don't take into account LAZY VACUUMs */
+		if (gtm_txninfo->gti_vacuum)
+			continue;
+
+		/* Update globalxmin to be the smallest valid xmin */
+		xid = gtm_txninfo->gti_xmin;		/* fetch just once */
+		if (GlobalTransactionIdIsNormal(xid) &&
+			GlobalTransactionIdPrecedes(xid, globalxmin))
+			globalxmin = xid;
+
+		/* Fetch xid just once - see GetNewTransactionId */
+		xid = gtm_txninfo->gti_gxid;
+
+		/*
+		 * If the transaction has been assigned an xid < xmax we add it to the
+		 * snapshot, and update xmin if necessary.	There's no need to store
+		 * XIDs >= xmax, since we'll treat them as running anyway.  We don't
+		 * bother to examine their subxids either.
+		 *
+		 * (The Postgres rule of leaving our own XID out of the snapshot does
+		 * not apply here; see the comment inside the block below.)
+		 */
+		if (GlobalTransactionIdIsNormal(xid))
+		{
+			/*
+			 * Unlike Postgres, we include the GXID of the current transaction
+			 * as well in the snapshot. This is necessary because the same
+			 * snapshot is shared by multiple backends through GTM proxy and
+			 * the GXID will vary for each backend.
+			 *
+			 * XXX We should confirm that this does not have any adverse effect
+			 * on the MVCC visibility and check if any changes are related to
+			 * the MVCC checks because of the change
+			 */
+			if (GlobalTransactionIdFollowsOrEquals(xid, xmax))
+				continue;
+			if (GlobalTransactionIdPrecedes(xid, xmin))
+				xmin = xid;
+			snapshot->sn_xip[count++] = xid;	/* NOTE(review): no bound check against GTM_MAX_GLOBAL_TRANSACTIONS - confirm the list cannot exceed it */
+		}
+	}
+
+	/*
+	 * Update globalxmin to include actual process xids.  This is a slightly
+	 * different way of computing it than GetOldestXmin uses, but should give
+	 * the same result.
+	 */
+	if (GlobalTransactionIdPrecedes(xmin, globalxmin))
+		globalxmin = xmin;
+
+	GTMTransactions.gt_recent_global_xmin = globalxmin;
+
+	snapshot->sn_xmin = xmin;
+	snapshot->sn_xmax = xmax;
+	snapshot->sn_xcnt = count;
+	snapshot->sn_recent_global_xmin = globalxmin;
+
+	/*
+	 * Now, before gt_TransArrayLock is released, copy the snapshot into and
+	 * set the xmin in the txninfo structures of all the given transactions.
+	 */
+	for (ii = 0; ii < txn_count; ii++)
+	{
+		GTM_Snapshot mysnap = NULL;
+
+		/*
+		 * We have already gone through all the transaction handles above and
+		 * marked the invalid handles with STATUS_ERROR
+		 */
+		if (status[ii] == STATUS_ERROR)
+			continue;
+
+		mygtm_txninfo = GTM_HandleToTransactionInfo(handle[ii]);
+		mysnap = &mygtm_txninfo->gti_current_snapshot;
+		
+		if (GTM_IsTransSerializable(mygtm_txninfo))
+		{
+			if ((mygtm_txninfo->gti_snapshot_set) && (txn_count > 1))	/* NOTE(review): raised with gt_TransArrayLock held - confirm ERROR handling releases it */
+				elog(ERROR, "Grouped snapshot can only include first snapshot in Serializable transaction");
+
+			if (!mygtm_txninfo->gti_snapshot_set)
+			{
+				/*
+				 * For the first transaction in the array, the snapshot is
+				 * already set.
+				 */
+				if (snapshot != mysnap)
+				{
+					if (mysnap->sn_xip == NULL)
+					{
+						/*
+						 * First call for this snapshot
+						 */
+						mysnap->sn_xip = (GlobalTransactionId *)
+							palloc(GTM_MAX_GLOBAL_TRANSACTIONS * sizeof(GlobalTransactionId));
+						if (mysnap->sn_xip == NULL)
+							ereport(ERROR, (ENOMEM, errmsg("out of memory")));
+					}
+					mysnap->sn_xmin = snapshot->sn_xmin;
+					mysnap->sn_xmax = snapshot->sn_xmax;
+					mysnap->sn_xcnt = snapshot->sn_xcnt;
+					mysnap->sn_recent_global_xmin = snapshot->sn_recent_global_xmin;
+					memcpy(mysnap->sn_xip, snapshot->sn_xip,
+							sizeof (GlobalTransactionId) * snapshot->sn_xcnt);
+				}
+				mygtm_txninfo->gti_snapshot_set = true;
+			}
+		}
+		else if (snapshot != mysnap)
+		{
+			if (mysnap->sn_xip == NULL)
+			{
+				/*
+				 * First call for this snapshot
+				 */
+				mysnap->sn_xip = (GlobalTransactionId *)
+					palloc(GTM_MAX_GLOBAL_TRANSACTIONS * sizeof(GlobalTransactionId));
+				if (mysnap->sn_xip == NULL)
+					ereport(ERROR, (ENOMEM, errmsg("out of memory")));
+			}
+			mysnap->sn_xmin = snapshot->sn_xmin;
+			mysnap->sn_xmax = snapshot->sn_xmax;
+			mysnap->sn_xcnt = snapshot->sn_xcnt;
+			mysnap->sn_recent_global_xmin = snapshot->sn_recent_global_xmin;
+			memcpy(mysnap->sn_xip, snapshot->sn_xip,
+					sizeof (GlobalTransactionId) * snapshot->sn_xcnt);
+		}
+
+		if ((mygtm_txninfo != NULL) &&
+			(!GlobalTransactionIdIsValid(mygtm_txninfo->gti_xmin)))
+			mygtm_txninfo->gti_xmin = xmin;
+	}
+
+	GTM_RWLockRelease(&GTMTransactions.gt_TransArrayLock);
+
+	elog(DEBUG1, "GTM_GetTransactionSnapshot: (%u:%u:%u:%u)",
+			snapshot->sn_xmin, snapshot->sn_xmax,
+			snapshot->sn_xcnt, snapshot->sn_recent_global_xmin);
+	return snapshot;
+}
+
+/*
+ * Process MSG_SNAPSHOT_GET command: read a GXID or a transaction handle from
+ * the message (assigning a new GXID first when get_gxid is set), take a
+ * snapshot for that transaction and send the result back on myport.
+ */
+void
+ProcessGetSnapshotCommand(Port *myport, StringInfo message, bool get_gxid)
+{
+	StringInfoData buf;
+	GTM_TransactionHandle txn;
+	/* Always sent in the reply, so it must not be left uninitialized */
+	GlobalTransactionId gxid = InvalidGlobalTransactionId;
+	int isgxid = 0;
+	GTM_Snapshot snapshot;
+	MemoryContext oldContext;
+	bool canbe_grouped;
+	int status;
+	int txn_count = 1;
+
+	/*
+	 * Grouping hint used by the GTM proxy to batch snapshot requests; it is
+	 * mostly useless for the GTM server but still has to be consumed.
+	 */
+	canbe_grouped = pq_getmsgbyte(message);
+	isgxid = pq_getmsgbyte(message);
+
+	if (isgxid)
+	{
+		const char *data = NULL;
+		Assert(!get_gxid);
+		data = pq_getmsgbytes(message, sizeof (gxid));
+		if (data == NULL)
+			ereport(ERROR,
+					(EPROTO,
+					 errmsg("Message does not contain valid GXID")));
+		memcpy(&gxid, data, sizeof (gxid));
+		txn = GTM_GXIDToHandle(gxid);
+	}
+	else
+	{
+		const char *data = pq_getmsgbytes(message, sizeof (txn));
+		if (data == NULL)
+			ereport(ERROR,
+					(EPROTO,
+					 errmsg("Message does not contain valid Transaction Handle")));
+		memcpy(&txn, data, sizeof (txn));
+	}
+	pq_getmsgend(message);
+
+	if (get_gxid)
+	{
+		Assert(!isgxid);
+		gxid = GTM_GetGlobalTransactionId(txn);
+		if (gxid == InvalidGlobalTransactionId)
+			ereport(ERROR,
+					(EINVAL,
+					 errmsg("Failed to get a new transaction id")));
+	}
+
+	oldContext = MemoryContextSwitchTo(TopMostMemoryContext);
+
+	/*
+	 * Get a fresh snapshot
+	 */
+	if ((snapshot = GTM_GetTransactionSnapshot(&txn, 1, &status)) == NULL)
+		ereport(ERROR,
+				(EINVAL,
+				 errmsg("Failed to get a snapshot")));
+
+	MemoryContextSwitchTo(oldContext);
+
+	pq_beginmessage(&buf, 'S');
+	pq_sendint(&buf, get_gxid ? SNAPSHOT_GXID_GET_RESULT : SNAPSHOT_GET_RESULT, 4);
+	if (myport->is_proxy)
+	{
+		GTM_ProxyMsgHeader proxyhdr;
+		proxyhdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+	}
+	pq_sendbytes(&buf, (char *)&gxid, sizeof (GlobalTransactionId));
+	pq_sendbytes(&buf, (char *)&txn_count, sizeof(txn_count));
+	pq_sendbytes(&buf, (char *)&status, sizeof(int) * txn_count);
+	pq_sendbytes(&buf, (char *)&snapshot->sn_xmin, sizeof (GlobalTransactionId));
+	pq_sendbytes(&buf, (char *)&snapshot->sn_xmax, sizeof (GlobalTransactionId));
+	pq_sendbytes(&buf, (char *)&snapshot->sn_recent_global_xmin, sizeof (GlobalTransactionId));
+	pq_sendint(&buf, snapshot->sn_xcnt, sizeof (int));
+	pq_sendbytes(&buf, (char *)snapshot->sn_xip,
+				 sizeof(GlobalTransactionId) * snapshot->sn_xcnt);
+	pq_endmessage(myport, &buf);
+
+	if (!myport->is_proxy)
+		pq_flush(myport);
+
+	return;
+}
+
+/*
+ * Process MSG_SNAPSHOT_GET_MULTI: take one snapshot for a batch of transactions
+ */
+void
+ProcessGetSnapshotCommandMulti(Port *myport, StringInfo message)
+{
+	StringInfoData buf;
+	GTM_TransactionHandle txn[GTM_MAX_GLOBAL_TRANSACTIONS];
+	GlobalTransactionId gxid[GTM_MAX_GLOBAL_TRANSACTIONS];
+	int isgxid[GTM_MAX_GLOBAL_TRANSACTIONS];
+	GTM_Snapshot snapshot;
+	MemoryContext oldContext;
+	int txn_count;
+	int ii;
+	int status[GTM_MAX_GLOBAL_TRANSACTIONS];
+
+	txn_count = pq_getmsgint(message, sizeof (int));
+	/* The count indexes the fixed-size arrays above: reject bad values */
+	if (txn_count < 0 || txn_count > GTM_MAX_GLOBAL_TRANSACTIONS)
+		ereport(ERROR,
+				(EPROTO,
+				 errmsg("Message contains invalid transaction count")));
+
+	for (ii = 0; ii < txn_count; ii++)
+	{
+		isgxid[ii] = pq_getmsgbyte(message);
+		if (isgxid[ii])
+		{
+			const char *data = pq_getmsgbytes(message, sizeof (gxid[ii]));
+			if (data == NULL)
+				ereport(ERROR,
+						(EPROTO,
+						 errmsg("Message does not contain valid GXID")));
+			memcpy(&gxid[ii], data, sizeof (gxid[ii]));
+			txn[ii] = GTM_GXIDToHandle(gxid[ii]);
+		}
+		else
+		{
+			const char *data = pq_getmsgbytes(message, sizeof (txn[ii]));
+			if (data == NULL)
+				ereport(ERROR,
+						(EPROTO,
+						 errmsg("Message does not contain valid Transaction Handle")));
+			memcpy(&txn[ii], data, sizeof (txn[ii]));
+		}
+	}
+
+	pq_getmsgend(message);
+	oldContext = MemoryContextSwitchTo(TopMostMemoryContext);
+
+	/* Get a fresh snapshot covering all the requested transactions */
+	if ((snapshot = GTM_GetTransactionSnapshot(txn, txn_count, status)) == NULL)
+		ereport(ERROR,
+				(EINVAL,
+				 errmsg("Failed to get a snapshot")));
+
+	MemoryContextSwitchTo(oldContext);
+
+	pq_beginmessage(&buf, 'S');
+	pq_sendint(&buf, SNAPSHOT_GET_MULTI_RESULT, 4);
+	if (myport->is_proxy)
+	{
+		GTM_ProxyMsgHeader proxyhdr;
+		proxyhdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+	}
+	pq_sendbytes(&buf, (char *)&txn_count, sizeof(txn_count));
+	pq_sendbytes(&buf, (char *)status, sizeof(int) * txn_count);
+	pq_sendbytes(&buf, (char *)&snapshot->sn_xmin, sizeof (GlobalTransactionId));
+	pq_sendbytes(&buf, (char *)&snapshot->sn_xmax, sizeof (GlobalTransactionId));
+	pq_sendbytes(&buf, (char *)&snapshot->sn_recent_global_xmin, sizeof (GlobalTransactionId));
+	pq_sendint(&buf, snapshot->sn_xcnt, sizeof (int));
+	pq_sendbytes(&buf, (char *)snapshot->sn_xip,
+				 sizeof(GlobalTransactionId) * snapshot->sn_xcnt);
+	pq_endmessage(myport, &buf);
+
+	if (!myport->is_proxy)
+		pq_flush(myport);
+}
+
+/*
+ * Free the snapshot data. The snapshot itself is not freed though
+ */
+void
+GTM_FreeSnapshotData(GTM_Snapshot snapshot)
+{
+	if (snapshot == NULL)
+		return;
+
+	if (snapshot->sn_xip != NULL)
+	{
+		/* sn_xip is sized for the maximum up front, so sn_xcnt may be zero */
+		pfree(snapshot->sn_xip);
+		snapshot->sn_xip = NULL;
+	}
+}
diff --git a/src/gtm/main/gtm_stat.c b/src/gtm/main/gtm_stat.c
new file mode 100644
index 0000000000..fac6b64c24
--- /dev/null
+++ b/src/gtm/main/gtm_stat.c
@@ -0,0 +1,37 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_stat.c
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "gtm/gtm_c.h"
+#include "gtm/gtm.h"
+
+uint32	GTM_Message_Stats[MSG_MAX_MESSAGE_TYPE];	/* bumped by gtm_msgstat_increment() */
+uint32	GTM_Result_Stats[GTM_MAX_RESULT_TYPE];	/* bumped by gtm_resultstat_increment() */
+
+void
+gtm_msgstat_increment(int type)
+{
+	GTM_Message_Stats[type] += 1;	/* NOTE(review): type is not range-checked */
+}
+
+void
+gtm_resultstat_increment(int type)
+{
+	GTM_Result_Stats[type] += 1;	/* NOTE(review): type is not range-checked */
+}
+
+void
+gtm_print_stats(void)
+{
+	/* Empty body: no statistics are printed yet */
+}
diff --git a/src/gtm/main/gtm_stats.c b/src/gtm/main/gtm_stats.c
new file mode 100644
index 0000000000..aba1a219fb
--- /dev/null
+++ b/src/gtm/main/gtm_stats.c
@@ -0,0 +1,23 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_stats.c
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+typedef struct GTM_Stats
+{
+	int 	GTM_RecvMessages[GTM_MAX_MESSAGE_TYPE];	/* presumably per-type received-message counts -- TODO confirm */
+	int 	GTM_SentMessages[GTM_MAX_MESSAGE_TYPE];	/* presumably per-type sent-message counts -- TODO confirm */
+	float	GTM_RecvBytes;	/* presumably total bytes received -- TODO confirm */
+	float	GTM_SentBytes;	/* presumably total bytes sent -- TODO confirm */
+} GTM_Stats;	/* NOTE(review): this file includes no header that could define GTM_MAX_MESSAGE_TYPE */
+
+
diff --git a/src/gtm/main/gtm_thread.c b/src/gtm/main/gtm_thread.c
new file mode 100644
index 0000000000..61ea640ab5
--- /dev/null
+++ b/src/gtm/main/gtm_thread.c
@@ -0,0 +1,336 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_thread.c
+ *	Thread handling
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <pthread.h>
+#include "gtm/gtm.h"
+#include "gtm/memutils.h"
+#include "gtm/gtm_txn.h"
+#include "gtm/libpq.h"
+
+static void *GTM_ThreadMainWrapper(void *argp);
+static void GTM_ThreadCleanup(void *argp);
+/* Registry of the GTM threads; GTMThreads points at GTMThreadsData */
+GTM_Threads	GTMThreadsData;
+GTM_Threads *GTMThreads = &GTMThreadsData;
+/* Sizing of the gt_threads array, see GTM_ThreadAdd() */
+#define GTM_MIN_THREADS 32			/* Initial size of the thread array */
+#define GTM_MAX_THREADS 1024		/* Max threads allowed in the GTM */
+#define GTMThreadsFull	(GTMThreads->gt_thread_count == GTMThreads->gt_array_size)	/* no free slot left */
+
+/*
+ * Add the given thrinfo structure to the global array, expanding it if
+ * necessary. Returns the slot index, also stored in thrinfo->thr_localid.
+ */
+int
+GTM_ThreadAdd(GTM_ThreadInfo *thrinfo)
+{
+	int ii;
+
+	GTM_RWLockAcquire(&GTMThreads->gt_lock, GTM_LOCKMODE_WRITE);
+
+	if (GTMThreadsFull)
+	{
+		uint32 newsize;
+
+		/* TODO avoid holding the lock across the memory allocation below */
+		if (GTMThreads->gt_array_size == GTM_MAX_THREADS)
+		{
+			GTM_RWLockRelease(&GTMThreads->gt_lock);	/* not held across the error */
+			elog(ERROR, "Too many threads active");
+		}
+
+		if (GTMThreads->gt_array_size == 0)
+			newsize = GTM_MIN_THREADS;
+		else
+		{
+			/*
+			 * We ran out of the array size. Just double the size, bound by the
+			 * upper limit
+			 */
+			newsize = GTMThreads->gt_array_size * 2;
+		}
+
+		/* Can't have more than GTM_MAX_THREADS */
+		if (newsize > GTM_MAX_THREADS)
+			newsize = GTM_MAX_THREADS;
+
+		if (GTMThreads->gt_threads == NULL)
+			GTMThreads->gt_threads = (GTM_ThreadInfo **)palloc0(sizeof (GTM_ThreadInfo *) * newsize);
+		else
+		{
+			void *old_ptr = GTMThreads->gt_threads;
+			GTMThreads->gt_threads = (GTM_ThreadInfo **)palloc0(sizeof (GTM_ThreadInfo *) * newsize);
+			memcpy(GTMThreads->gt_threads, old_ptr,
+					GTMThreads->gt_array_size * sizeof (GTM_ThreadInfo *));
+			pfree(old_ptr);
+		}
+
+		GTMThreads->gt_array_size = newsize;
+	}
+
+	/*
+	 * Now that we have free entries in the array, find a free slot and add the
+	 * thrinfo pointer to it.
+	 *
+	 * TODO Optimize this later by tracking few free slots and reusing them.
+	 * The free slots can be updated when a thread exits and reused when a new
+	 * thread is added to the pool.
+	 */
+	for (ii = 0; ii < GTMThreads->gt_array_size; ii++)
+	{
+		if (GTMThreads->gt_threads[ii] == NULL)
+		{
+			GTMThreads->gt_threads[ii] = thrinfo;
+			GTMThreads->gt_thread_count++;
+			break;
+		}
+	}
+	GTM_RWLockRelease(&GTMThreads->gt_lock);
+
+	/*
+	 * Track the slot information in the thrinfo. This is useful to quickly
+	 * find the slot given the thrinfo structure.
+	 */
+	thrinfo->thr_localid = ii;
+	return ii;
+}
+
+/* Unregister thrinfo from the global thread array and free it; returns 0 */
+int
+GTM_ThreadRemove(GTM_ThreadInfo *thrinfo)
+{
+	int ii;
+
+	GTM_RWLockAcquire(&GTMThreads->gt_lock, GTM_LOCKMODE_WRITE);
+	for (ii = 0; ii < GTMThreads->gt_array_size; ii++)
+		if (GTMThreads->gt_threads[ii] == thrinfo)
+			break;
+	if (ii == GTMThreads->gt_array_size)
+	{
+		/* Don't leave the array lock held when raising the error */
+		GTM_RWLockRelease(&GTMThreads->gt_lock);
+		elog(ERROR, "Thread (%p) not found ", thrinfo);
+	}
+
+	GTMThreads->gt_threads[ii] = NULL;
+	GTMThreads->gt_thread_count--;
+	GTM_RWLockRelease(&GTMThreads->gt_lock);
+	pfree(thrinfo);
+	return 0;
+}
+
+/*
+ * Create a new thread and assign the given connection to it.
+ *
+ * This function is responsible for setting up the various memory contexts for
+ * the thread as well as registering this thread with the Thread Manager.
+ *
+ * Upon successful creation, the thread will start running the given
+ * "startroutine". The thread information is returned to the calling process.
+ */
+GTM_ThreadInfo *
+GTM_ThreadCreate(GTM_ConnectionInfo *conninfo,
+				  void *(* startroutine)(void *))
+{
+	GTM_ThreadInfo *thrinfo;
+	int err;
+
+	/*
+	 * We are still running in the context of the main thread. So the
+	 * allocation below would last as long as the main thread exists or the
+	 * memory is explicitly freed.
+	 */
+	thrinfo = (GTM_ThreadInfo *)palloc0(sizeof (GTM_ThreadInfo));
+
+	thrinfo->thr_conn = conninfo;
+	GTM_RWLockInit(&thrinfo->thr_lock);
+
+	/*
+	 * The thread status is set to GTM_THREAD_STARTING and will be changed by
+	 * the thread itself when it actually starts executing
+	 */
+	thrinfo->thr_status = GTM_THREAD_STARTING;
+
+	/*
+	 * Install the ThreadInfo structure in the global array before starting the
+	 * thread. NOTE(review): GTM_ThreadAdd() never returns -1 -- dead check?
+	 */
+	if (GTM_ThreadAdd(thrinfo) == -1)
+		elog(ERROR, "Error starting a new thread");
+
+	/*
+	 * Set up memory contexts before actually starting the threads
+	 *
+	 * The TopThreadContext is a child of TopMemoryContext and it will last as
+	 * long as the main process or this thread lives
+	 *
+	 * Thread context is not shared between other threads
+	 */
+	thrinfo->thr_thread_context = AllocSetContextCreate(TopMemoryContext,
+														"TopMemoryContext",
+														ALLOCSET_DEFAULT_MINSIZE,
+														ALLOCSET_DEFAULT_INITSIZE,
+														ALLOCSET_DEFAULT_MAXSIZE,
+														false);
+
+	/*
+	 * Since the thread is not yet started, TopMemoryContext still points to
+	 * the context of the calling thread
+	 */
+	thrinfo->thr_parent_context = TopMemoryContext;
+
+	/*
+	 * Each thread gets its own ErrorContext and it's a child of ErrorContext of
+	 * the main process
+	 *
+	 * This is a thread-specific context and is not shared between other
+	 * threads
+	 */
+	thrinfo->thr_error_context = AllocSetContextCreate(ErrorContext,
+													   "ErrorContext",
+													   8 * 1024,
+													   8 * 1024,
+													   8 * 1024,
+													   false);
+
+	thrinfo->thr_startroutine = startroutine;
+
+	/*
+	 * Now start the thread. The thread will start executing the given
+	 * "startroutine". The thrinfo structure is also passed to the thread. Any
+	 * additional parameters should be passed via the thrinfo structure.
+	 *
+	 * NOTE(review): if pthread_create() fails, thrinfo stays registered in the
+	 * global array and its contexts are not released -- confirm this is OK.
+	 */
+	if ((err = pthread_create(&thrinfo->thr_id, NULL, GTM_ThreadMainWrapper,
+							 thrinfo)))
+		ereport(ERROR,
+				(err,
+				 errmsg("Failed to create a new thread: error %d", err)));
+
+	return thrinfo;
+}
+
+/*
+ * Exit the current thread (not implemented yet: the body is empty)
+ */
+void
+GTM_ThreadExit(void)
+{
+	/* XXX To be implemented */
+}
+
+/*
+ * Wait for the thread described by thrinfo; returns pthread_join()'s result
+ */
+int
+GTM_ThreadJoin(GTM_ThreadInfo *thrinfo)
+{
+	void *retval;	/* exit value of the thread, not used */
+
+	return pthread_join(thrinfo->thr_id, &retval);
+}
+
+/*
+ * Get thread information for the given thread, identified by the
+ * thread_id
+ */
+GTM_ThreadInfo *
+GTM_GetThreadInfo(GTM_ThreadID thrid)
+{
+	/* Lookup is not implemented: thrid is ignored and NULL is returned */
+	return NULL;
+}
+
+/*
+ * Cleanup routine for the thread, pushed by GTM_ThreadMainWrapper()
+ */
+static void
+GTM_ThreadCleanup(void *argp)
+{
+	GTM_ThreadInfo *thrinfo = (GTM_ThreadInfo *)argp;
+
+	elog(LOG, "Cleaning up thread state");
+
+	/*
+	 * Close the connection's socket (StreamClose).
+	 */
+	StreamClose(thrinfo->thr_conn->con_port->sock);
+
+	/* Free the port */
+	ConnFree(thrinfo->thr_conn->con_port);
+	thrinfo->thr_conn->con_port = NULL;
+
+	/* Free the connection info structure */
+	pfree(thrinfo->thr_conn);
+	thrinfo->thr_conn = NULL;
+
+	/*
+	 * Switch to the memory context of the main process so that we can free up
+	 * our memory contexts easily.
+	 *
+	 * XXX We don't setup cleanup handlers for the main process. So this
+	 * routine would never be called for the main process/thread
+	 */
+	MemoryContextSwitchTo(thrinfo->thr_parent_context);
+	/* NOTE(review): not created in GTM_ThreadCreate(); presumably set by the thread */
+	MemoryContextDelete(thrinfo->thr_message_context);
+	thrinfo->thr_message_context = NULL;
+
+	MemoryContextDelete(thrinfo->thr_error_context);
+	thrinfo->thr_error_context = NULL;
+
+	MemoryContextDelete(thrinfo->thr_thread_context);
+	thrinfo->thr_thread_context = NULL;
+
+	/*
+	 * Remove thrinfo from the global array. GTM_ThreadRemove() also frees the
+	 * structure, so thrinfo must not be dereferenced after this call.
+	 */
+	GTM_ThreadRemove(thrinfo);
+
+	/*
+	 * Reset the thread-specific information. This should be done only after we
+	 * are sure that memory contexts are not required
+	 *
+	 * Note: elog calls need memory contexts, so no elog calls beyond this
+	 * point.
+	 */
+	SetMyThreadInfo(NULL);
+	
+	return;
+}
+
+/*
+ * Thread entry point: detaches the thread, installs thrinfo as its thread
+ * info, runs the start routine stored in thrinfo and finally runs
+ * GTM_ThreadCleanup(). Always returns NULL.
+ */
+void *
+GTM_ThreadMainWrapper(void *argp)
+{
+	GTM_ThreadInfo *thrinfo = (GTM_ThreadInfo *)argp;
+
+	/* thr_id is filled in by the creator and may not be set yet: use self */
+	pthread_detach(pthread_self());
+	SetMyThreadInfo(thrinfo);
+	MemoryContextSwitchTo(TopMemoryContext);
+
+	pthread_cleanup_push(GTM_ThreadCleanup, thrinfo);
+	thrinfo->thr_startroutine(thrinfo);
+	pthread_cleanup_pop(1);
+
+	return NULL;	/* thrinfo was freed by the cleanup handler */
+}
diff --git a/src/gtm/main/gtm_txn.c b/src/gtm/main/gtm_txn.c
new file mode 100644
index 0000000000..6090ae10fb
--- /dev/null
+++ b/src/gtm/main/gtm_txn.c
@@ -0,0 +1,1521 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_txn.c
+ *	Transaction handling
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "gtm/gtm_c.h"
+#include "gtm/elog.h"
+#include "gtm/palloc.h"
+#include "gtm/gtm.h"
+#include "gtm/gtm_txn.h"
+#include "gtm/assert.h"
+#include "gtm/stringinfo.h"
+#include "gtm/libpq.h"
+#include "gtm/pqformat.h"
+#include "gtm/gtm_msg.h"
+#include <unistd.h>
+
+/* Local functions */
+static XidStatus GlobalTransactionIdGetStatus(GlobalTransactionId transactionId);
+static bool GTM_SetDoVacuum(GTM_TransactionHandle handle);
+/* Global transaction manager state, reset by GTM_InitTxnManager() */
+GTM_Transactions GTMTransactions;
+
+/*
+ * Reset the global transaction manager state: mark every slot of the
+ * transaction array as free, initialize the XID counters and the locks.
+ */
+void
+GTM_InitTxnManager(void)
+{
+	int slot;
+
+	memset(&GTMTransactions, 0, sizeof (GTM_Transactions));
+
+	/* No transaction slot is in use yet; every slot gets its own lock */
+	for (slot = 0; slot < GTM_MAX_GLOBAL_TRANSACTIONS; slot++)
+	{
+		GTM_TransactionInfo *txninfo = GTMTransactions.gt_transactions_array + slot;
+
+		txninfo->gti_in_use = false;
+		GTM_RWLockInit(&txninfo->gti_lock);
+	}
+
+	/*
+	 * XXX When GTM is stopped and restarted, it must start assigning GXIDs
+	 * greater than the previously assigned values. After a clean shutdown the
+	 * GTM could store the last assigned value at a known location on
+	 * permanent storage and read it back on restart; GTM failures are
+	 * trickier.
+	 *
+	 * TODO This part is skipped for the prototype.
+	 */
+	GTMTransactions.gt_nextXid = FirstNormalGlobalTransactionId;
+
+	/* XXX The gt_oldestXid is the cluster level oldest Xid */
+	GTMTransactions.gt_oldestXid = FirstNormalGlobalTransactionId;
+
+	/* XXX Newest XID that is committed or aborted */
+	GTMTransactions.gt_latestCompletedXid = FirstNormalGlobalTransactionId;
+
+	/*
+	 * XXX The xid limits that guard against wrap-around related database
+	 * corruption are not computed in the prototype; leave them all invalid.
+	 */
+	GTMTransactions.gt_xidVacLimit = InvalidGlobalTransactionId;
+	GTMTransactions.gt_xidWarnLimit = InvalidGlobalTransactionId;
+	GTMTransactions.gt_xidStopLimit = InvalidGlobalTransactionId;
+	GTMTransactions.gt_xidWrapLimit = InvalidGlobalTransactionId;
+
+	/*
+	 * Initialize the locks to protect various XID fields as well as the linked
+	 * list of transactions
+	 */
+	GTM_RWLockInit(&GTMTransactions.gt_XidGenLock);
+	GTM_RWLockInit(&GTMTransactions.gt_TransArrayLock);
+
+	/*
+	 * Start with an empty list of open transactions
+	 */
+	GTMTransactions.gt_open_transactions = NIL;
+	GTMTransactions.gt_lastslot = -1;
+
+	GTMTransactions.gt_gtm_state = GTM_STARTING;
+}
+
+/*
+ * Get the status of current or past transaction. Only the permanent XIDs are
+ * resolved for now; normal XIDs are reported as in progress.
+ */
+static XidStatus
+GlobalTransactionIdGetStatus(GlobalTransactionId transactionId)
+{
+	XidStatus	xidstatus = TRANSACTION_STATUS_IN_PROGRESS;
+
+	/* Permanent (non-normal) transaction IDs have a fixed status */
+	if (!GlobalTransactionIdIsNormal(transactionId))
+	{
+		if (GlobalTransactionIdEquals(transactionId, BootstrapGlobalTransactionId))
+			return TRANSACTION_STATUS_COMMITTED;
+		if (GlobalTransactionIdEquals(transactionId, FrozenGlobalTransactionId))
+			return TRANSACTION_STATUS_COMMITTED;
+		return TRANSACTION_STATUS_ABORTED;
+	}
+
+	/*
+	 * TODO Status lookup for normal XIDs is not implemented yet; report them
+	 * as in progress instead of returning an uninitialized value.
+	 */
+	return xidstatus;
+}
+
+/*
+ * Given the GXID, find the corresponding transaction handle, or return
+ * InvalidTransactionHandle if no open transaction carries that GXID.
+ */
+GTM_TransactionHandle
+GTM_GXIDToHandle(GlobalTransactionId gxid)
+{
+	ListCell *cell = NULL;
+	GTM_TransactionInfo *match = NULL;
+
+	/* Scan the list of open transactions under the array read lock */
+	GTM_RWLockAcquire(&GTMTransactions.gt_TransArrayLock, GTM_LOCKMODE_READ);
+	foreach(cell, GTMTransactions.gt_open_transactions)
+	{
+		GTM_TransactionInfo *candidate = (GTM_TransactionInfo *)lfirst(cell);
+
+		if (GlobalTransactionIdEquals(candidate->gti_gxid, gxid))
+		{
+			match = candidate;
+			break;
+		}
+	}
+	GTM_RWLockRelease(&GTMTransactions.gt_TransArrayLock);
+
+	return (match == NULL) ? InvalidTransactionHandle : match->gti_handle;
+}
+
+/*
+ * Given the transaction handle, find the corresponding transaction info
+ * structure; NULL is returned (with a WARNING) for an out-of-range handle or
+ * an unused slot.
+ *
+ * Note: Since a transaction handle is just an index into the global array,
+ * this function should be very quick and could become an inline function.
+ */
+GTM_TransactionInfo *
+GTM_HandleToTransactionInfo(GTM_TransactionHandle handle)
+{
+	GTM_TransactionInfo *gtm_txninfo = NULL;
+	/* Valid handles are 0 .. GTM_MAX_GLOBAL_TRANSACTIONS - 1 */
+	if ((handle < 0) || (handle >= GTM_MAX_GLOBAL_TRANSACTIONS))
+	{
+		ereport(WARNING,
+				(ERANGE, errmsg("Invalid transaction handle: %d", handle)));
+		return NULL;
+	}
+
+	gtm_txninfo = &GTMTransactions.gt_transactions_array[handle];
+
+	if (!gtm_txninfo->gti_in_use)
+	{
+		ereport(WARNING,
+				(ERANGE, errmsg("Invalid transaction handle, txn_info not in use")));
+		return NULL;
+	}
+
+	return gtm_txninfo;
+}
+
+/*
+ * Remove the given transaction info structures from the list of open
+ * transactions and mark their slots in the global array as not in use, so
+ * that the slots can be handed out again. NULL entries in gtm_txninfo[] are
+ * skipped.
+ *
+ * NOTE(review): an earlier version of this comment described per-thread
+ * caching of the freed structures; no such caching is visible in this code.
+ * Also compute the latestCompletedXid.
+ */
+static void
+GTM_RemoveTransInfoMulti(GTM_TransactionInfo *gtm_txninfo[], int txn_count)
+{
+	int ii;
+
+	/*
+	 * Remove the transaction structure from the global list of open
+	 * transactions
+	 */
+	GTM_RWLockAcquire(&GTMTransactions.gt_TransArrayLock, GTM_LOCKMODE_WRITE);
+
+	for (ii = 0; ii < txn_count; ii++)
+	{
+		if (gtm_txninfo[ii] == NULL)
+			continue;
+
+		GTMTransactions.gt_open_transactions = list_delete(GTMTransactions.gt_open_transactions, gtm_txninfo[ii]);
+
+		if (GlobalTransactionIdIsNormal(gtm_txninfo[ii]->gti_gxid) &&
+			GlobalTransactionIdFollowsOrEquals(gtm_txninfo[ii]->gti_gxid,
+											   GTMTransactions.gt_latestCompletedXid))
+			GTMTransactions.gt_latestCompletedXid = gtm_txninfo[ii]->gti_gxid;
+
+		/* Log the GXID and owning thread of the transaction being removed */
+		elog(DEBUG1, "GTM_RemoveTransInfoMulti: removing transaction id %u, %lu",
+				gtm_txninfo[ii]->gti_gxid, gtm_txninfo[ii]->gti_thread_id);
+		/*
+		 * Now mark the transaction as aborted and mark the structure as not-in-use
+		 */
+		gtm_txninfo[ii]->gti_state = GTM_TXN_ABORTED;
+		gtm_txninfo[ii]->gti_nodecount = 0;
+		gtm_txninfo[ii]->gti_in_use = false;
+		gtm_txninfo[ii]->gti_snapshot_set = false;
+	}
+
+	GTM_RWLockRelease(&GTMTransactions.gt_TransArrayLock);
+	return;
+}
+
+/*
+ * Remove all transaction infos associated with the caller thread and the given
+ * backend (a backend_id of -1 matches every backend of the thread)
+ *
+ * Also compute the latestCompletedXid.
+ */
+void
+GTM_RemoveAllTransInfos(int backend_id)
+{
+	ListCell *cell, *prev;
+	GTM_ThreadID thread_id;
+
+	thread_id = pthread_self();
+	
+	/*
+	 * Scan the global list of open transactions
+	 */
+	GTM_RWLockAcquire(&GTMTransactions.gt_TransArrayLock, GTM_LOCKMODE_WRITE);
+	prev = NULL;
+	cell = list_head(GTMTransactions.gt_open_transactions);
+	while (cell != NULL)
+	{
+		GTM_TransactionInfo *gtm_txninfo = lfirst(cell);
+		/* check if current entry is associated with the thread */
+		if ((gtm_txninfo->gti_in_use) &&
+			(gtm_txninfo->gti_thread_id == thread_id) &&
+			((gtm_txninfo->gti_backend_id == backend_id) || (backend_id == -1)))
+		{
+			/* remove the entry */
+			GTMTransactions.gt_open_transactions = list_delete_cell(GTMTransactions.gt_open_transactions, cell, prev);
+
+			/* update the latestCompletedXid */
+			if (GlobalTransactionIdIsNormal(gtm_txninfo->gti_gxid) &&
+				GlobalTransactionIdFollowsOrEquals(gtm_txninfo->gti_gxid,
+												   GTMTransactions.gt_latestCompletedXid))
+				GTMTransactions.gt_latestCompletedXid = gtm_txninfo->gti_gxid;
+
+			elog(DEBUG1, "GTM_RemoveAllTransInfos: removing transaction id %u, %lu:%lu",
+					gtm_txninfo->gti_gxid, gtm_txninfo->gti_thread_id, thread_id);
+			/*
+			 * Now mark the transaction as aborted and mark the structure as not-in-use
+			 */
+			gtm_txninfo->gti_state = GTM_TXN_ABORTED;
+			gtm_txninfo->gti_nodecount = 0;
+			gtm_txninfo->gti_in_use = false;
+			gtm_txninfo->gti_snapshot_set = false;
+			
+			/* the cell was deleted: continue from prev, or from the new head */
+			if (prev)
+				cell = lnext(prev);
+			else
+				cell = list_head(GTMTransactions.gt_open_transactions);
+		}
+		else
+		{
+			prev = cell;
+			cell = lnext(cell);
+		}
+	}
+
+	GTM_RWLockRelease(&GTMTransactions.gt_TransArrayLock);
+	return;
+}
+/*
+ * GlobalTransactionIdDidCommit
+ *		Tells whether the transaction with the given identifier committed,
+ *		as reported by GlobalTransactionIdGetStatus().
+ *
+ * Note:
+ *		Assumes transaction identifier is valid.
+ *
+ */
+bool							/* true if given transaction committed */
+GlobalTransactionIdDidCommit(GlobalTransactionId transactionId)
+{
+	XidStatus	status;
+	bool		committed;
+
+	status = GlobalTransactionIdGetStatus(transactionId);
+
+	/*
+	 * Only a status of TRANSACTION_STATUS_COMMITTED counts as committed; any
+	 * other status yields false.
+	 */
+	committed = (status == TRANSACTION_STATUS_COMMITTED);
+
+	return committed;
+}
+
+/*
+ * GlobalTransactionIdDidAbort
+ *		Tells whether the transaction with the given identifier aborted,
+ *		as reported by GlobalTransactionIdGetStatus().
+ *
+ * Note:
+ *		Assumes transaction identifier is valid.
+ *
+ */
+bool							/* true if given transaction aborted */
+GlobalTransactionIdDidAbort(GlobalTransactionId transactionId)
+{
+	XidStatus	status;
+	bool		aborted;
+
+	status = GlobalTransactionIdGetStatus(transactionId);
+
+	/*
+	 * Only a status of TRANSACTION_STATUS_ABORTED counts as aborted; any
+	 * other status yields false.
+	 */
+	aborted = (status == TRANSACTION_STATUS_ABORTED);
+
+	return aborted;
+}
+
+/*
+ * GlobalTransactionIdPrecedes --- is id1 logically < id2?
+ *
+ * If either ID is a permanent XID the raw values are compared directly; two
+ * normal XIDs are compared modulo 2^31 via their signed 32-bit difference.
+ */
+bool
+GlobalTransactionIdPrecedes(GlobalTransactionId id1, GlobalTransactionId id2)
+{
+	bool		both_normal;
+
+	both_normal = GlobalTransactionIdIsNormal(id1) &&
+		GlobalTransactionIdIsNormal(id2);
+	if (both_normal)
+		return ((int32) (id1 - id2)) < 0;
+
+	return id1 < id2;
+}
+
+/*
+ * GlobalTransactionIdPrecedesOrEquals --- is id1 logically <= id2?
+ * Non-normal XIDs compare directly; normal ones by signed 32-bit difference.
+ */
+bool
+GlobalTransactionIdPrecedesOrEquals(GlobalTransactionId id1, GlobalTransactionId id2)
+{
+	if (GlobalTransactionIdIsNormal(id1) && GlobalTransactionIdIsNormal(id2))
+	{
+		int32		delta = (int32) (id1 - id2);
+		return delta <= 0;
+	}
+	return id1 <= id2;
+}
+
+/*
+ * GlobalTransactionIdFollows --- is id1 logically > id2?
+ * Non-normal XIDs compare directly; normal ones by signed 32-bit difference.
+ */
+bool
+GlobalTransactionIdFollows(GlobalTransactionId id1, GlobalTransactionId id2)
+{
+	if (GlobalTransactionIdIsNormal(id1) && GlobalTransactionIdIsNormal(id2))
+	{
+		int32		delta = (int32) (id1 - id2);
+		return delta > 0;
+	}
+	return id1 > id2;
+}
+
+/*
+ * GlobalTransactionIdFollowsOrEquals --- is id1 logically >= id2?
+ * Non-normal XIDs compare directly; normal ones by signed 32-bit difference.
+ */
+bool
+GlobalTransactionIdFollowsOrEquals(GlobalTransactionId id1, GlobalTransactionId id2)
+{
+	if (GlobalTransactionIdIsNormal(id1) && GlobalTransactionIdIsNormal(id2))
+	{
+		int32		delta = (int32) (id1 - id2);
+		return delta >= 0;
+	}
+	return id1 >= id2;
+}
+
+
+/*
+ * Flag the transaction behind 'handle' as running vacuum. Raises an error for
+ * a handle that does not map to a transaction; otherwise returns true.
+ */
+static bool
+GTM_SetDoVacuum(GTM_TransactionHandle handle)
+{
+	GTM_TransactionInfo *txninfo;
+
+	txninfo = GTM_HandleToTransactionInfo(handle);
+	if (!txninfo)
+		ereport(ERROR, (EINVAL, errmsg("Invalid transaction handle")));
+	txninfo->gti_vacuum = true;
+	return true;
+}
+
+/*
+ * Allocate the next XID for each of the txn_count transactions in handle[].
+ *
+ * Every new XID is stored into the transaction info structure of its
+ * transaction and the first one is returned. Invalid handles raise an error.
+ */
+GlobalTransactionId
+GTM_GetGlobalTransactionIdMulti(GTM_TransactionHandle handle[], int txn_count)
+{
+	GlobalTransactionId xid, start_xid = InvalidGlobalTransactionId;
+	GTM_TransactionInfo *gtm_txninfo = NULL;
+	int ii;
+
+	GTM_RWLockAcquire(&GTMTransactions.gt_XidGenLock, GTM_LOCKMODE_WRITE);
+
+	if (GTMTransactions.gt_gtm_state == GTM_SHUTTING_DOWN)
+	{
+		GTM_RWLockRelease(&GTMTransactions.gt_XidGenLock);
+		ereport(ERROR, (EINVAL, errmsg("GTM shutting down -- can not issue new transaction ids")));
+		return InvalidGlobalTransactionId;
+	}
+
+	/*
+	 * If we are allocating the first XID of a new page of the commit log, that
+	 * page would have to be zeroed here, while still holding XidGenLock, so
+	 * that no other xact can acquire and commit a later XID first. The call
+	 * (ExtendCLOG(xid)) is currently disabled.
+	 */
+
+	/*
+	 * Now advance the nextXid counter.  This must not happen until after we
+	 * have successfully completed ExtendCLOG() --- if that routine fails, we
+	 * want the next incoming transaction to try it again.	We cannot assign
+	 * more XIDs until there is CLOG space for them.
+	 */
+	for (ii = 0; ii < txn_count; ii++)
+	{
+		xid = GTMTransactions.gt_nextXid;
+
+		if (!GlobalTransactionIdIsValid(start_xid))
+			start_xid = xid;
+
+		/*----------
+		 * Check to see if it's safe to assign another XID.  This protects against
+		 * catastrophic data loss due to XID wraparound.  The basic rules are:
+		 *
+		 * If we're past xidVacLimit, start trying to force autovacuum cycles.
+		 * If we're past xidWarnLimit, start issuing warnings.
+		 * If we're past xidStopLimit, refuse to execute transactions, unless
+		 * we are running in a standalone backend (which gives an escape hatch
+		 * to the DBA who somehow got past the earlier defenses).
+		 *
+		 * Test is coded to fall out as fast as possible during normal operation,
+		 * ie, when the vac limit is set and we haven't violated it.
+		 *----------
+		 */
+		if (GlobalTransactionIdFollowsOrEquals(xid, GTMTransactions.gt_xidVacLimit) &&
+			GlobalTransactionIdIsValid(GTMTransactions.gt_xidVacLimit))
+		{
+			if (GlobalTransactionIdFollowsOrEquals(xid, GTMTransactions.gt_xidStopLimit))
+			{
+				GTM_RWLockRelease(&GTMTransactions.gt_XidGenLock);
+				ereport(ERROR,
+						(ERANGE,
+						 errmsg("database is not accepting commands to avoid wraparound data loss in database ")));
+			}
+			else if (GlobalTransactionIdFollowsOrEquals(xid, GTMTransactions.gt_xidWarnLimit))
+				ereport(WARNING,
+				(errmsg("database must be vacuumed within %u transactions",
+						GTMTransactions.gt_xidWrapLimit - xid)));
+		}
+
+		gtm_txninfo = GTM_HandleToTransactionInfo(handle[ii]);
+		if (gtm_txninfo == NULL)
+		{
+			GTM_RWLockRelease(&GTMTransactions.gt_XidGenLock);
+			ereport(ERROR, (EINVAL, errmsg("Invalid transaction handle")));
+		}
+		GlobalTransactionIdAdvance(GTMTransactions.gt_nextXid);
+		gtm_txninfo->gti_gxid = xid;
+	}
+
+	GTM_RWLockRelease(&GTMTransactions.gt_XidGenLock);
+
+	return start_xid;
+}
+
+/*
+ * Allocate the next XID for a single transaction: a one-element call to
+ * GTM_GetGlobalTransactionIdMulti(), whose result is returned.
+ */
+GlobalTransactionId
+GTM_GetGlobalTransactionId(GTM_TransactionHandle handle)
+{
+	GTM_TransactionHandle *single = &handle;
+
+	return GTM_GetGlobalTransactionIdMulti(single, 1);
+}
+
+/*
+ * Peek at the next XID to be assigned, without allocating it.
+ */
+GlobalTransactionId
+ReadNewGlobalTransactionId(void)
+{
+	GlobalTransactionId upcoming;
+
+	/* A read lock is taken: gt_nextXid is only copied here, not advanced */
+	GTM_RWLockAcquire(&GTMTransactions.gt_XidGenLock, GTM_LOCKMODE_READ);
+	upcoming = GTMTransactions.gt_nextXid;
+	GTM_RWLockRelease(&GTMTransactions.gt_XidGenLock);
+	return upcoming;
+}
+
+/*
+ * Set the nextXid and mark the GTM as running.
+ *
+ * The GXID is usually read from a control file and set when the GTM is
+ * started. When the GTM is finally shutdown, the next to-be-assigned GXID is
+ * stored in the control file.
+ *
+ * XXX We don't yet handle any crash recovery.
+ */
+void
+SetNextGlobalTransactionId(GlobalTransactionId gxid)
+{
+	/* Both fields are updated under the XID generation lock */
+	GTM_RWLockAcquire(&GTMTransactions.gt_XidGenLock, GTM_LOCKMODE_WRITE);
+	GTMTransactions.gt_gtm_state = GTM_RUNNING;
+	GTMTransactions.gt_nextXid = gxid;
+	GTM_RWLockRelease(&GTMTransactions.gt_XidGenLock);
+}
+
+
+/*
+ * Begin txn_count transactions in one go. A free slot of the global
+ * transaction array is claimed for each; the slot indexes go into txns[].
+ */
+int
+GTM_BeginTransactionMulti(GTM_CoordinatorId coord_id,
+					 GTM_IsolationLevel isolevel[],
+					 bool readonly[],
+					 GTMProxy_ConnID connid[],
+					 int txn_count,
+					 GTM_TransactionHandle txns[])
+{
+	GTM_TransactionInfo *gtm_txninfo[txn_count];
+	MemoryContext oldContext;
+	int kk;
+
+	memset(gtm_txninfo, 0, sizeof (gtm_txninfo));
+
+	/*
+	 * XXX We should allocate the transaction info structure in the
+	 * top-most memory context instead of a thread context. This is
+	 * necessary because the transaction may outlive the thread which
+	 * started the transaction. Also, since the structures are stored in
+	 * the global array, it's dangerous to free the structures themselves
+	 * without removing the corresponding references from the global array
+	 */
+	oldContext = MemoryContextSwitchTo(TopMostMemoryContext);
+
+	/* Take the array lock once for the whole batch; it is released once below */
+	GTM_RWLockAcquire(&GTMTransactions.gt_TransArrayLock, GTM_LOCKMODE_WRITE);
+
+	for (kk = 0; kk < txn_count; kk++)
+	{
+		int ii, jj, startslot;
+
+		/* Scan the array circularly, starting after the slot handed out last */
+		startslot = GTMTransactions.gt_lastslot + 1;
+		if (startslot >= GTM_MAX_GLOBAL_TRANSACTIONS)
+			startslot = 0;
+
+		for (ii = startslot, jj = 0;
+			 jj < GTM_MAX_GLOBAL_TRANSACTIONS;
+			 ii = (ii + 1) % GTM_MAX_GLOBAL_TRANSACTIONS, jj++)
+		{
+			if (GTMTransactions.gt_transactions_array[ii].gti_in_use == false)
+			{
+				gtm_txninfo[kk] = &GTMTransactions.gt_transactions_array[ii];
+				break;
+			}
+		}
+
+		/* A full scan found nothing: every slot is in use */
+		if (gtm_txninfo[kk] == NULL)
+		{
+			GTM_RWLockRelease(&GTMTransactions.gt_TransArrayLock);
+			ereport(ERROR,
+					(ERANGE, errmsg("Max transaction limit reached")));
+		}
+
+		gtm_txninfo[kk]->gti_gxid = InvalidGlobalTransactionId;
+		gtm_txninfo[kk]->gti_xmin = InvalidGlobalTransactionId;
+		gtm_txninfo[kk]->gti_state = GTM_TXN_STARTING;
+		gtm_txninfo[kk]->gti_coordid = coord_id;
+
+		gtm_txninfo[kk]->gti_isolevel = isolevel[kk];
+		gtm_txninfo[kk]->gti_readonly = readonly[kk];
+		gtm_txninfo[kk]->gti_backend_id = connid[kk];
+		gtm_txninfo[kk]->gti_in_use = true;
+
+		gtm_txninfo[kk]->gti_handle = ii;
+		gtm_txninfo[kk]->gti_vacuum = false;
+		gtm_txninfo[kk]->gti_thread_id = pthread_self();
+		GTMTransactions.gt_lastslot = ii;
+
+		txns[kk] = ii;
+
+		/*
+		 * Add the structure to the global list of open transactions. The
+		 * element must be added in the context of TopMostMemoryContext because
+		 * the list is global and any allocation must outlive the thread context
+		 */
+		GTMTransactions.gt_open_transactions = lappend(GTMTransactions.gt_open_transactions, gtm_txninfo[kk]);
+	}
+
+	GTM_RWLockRelease(&GTMTransactions.gt_TransArrayLock);
+
+	MemoryContextSwitchTo(oldContext);
+
+	return txn_count;
+}
+
+/* Begin one transaction: a batch of one, with proxy connection id -1 */
+GTM_TransactionHandle
+GTM_BeginTransaction(GTM_CoordinatorId coord_id,
+					 GTM_IsolationLevel isolevel,
+					 bool readonly)
+{
+	GTMProxy_ConnID conn = -1;
+	GTM_TransactionHandle new_txn;
+
+	GTM_BeginTransactionMulti(coord_id, &isolevel, &readonly, &conn, 1, &new_txn);
+	return new_txn;
+}
+
+/*
+ * Roll back the transaction identified by its GXID rather than its handle
+ */
+int
+GTM_RollbackTransactionGXID(GlobalTransactionId gxid)
+{
+	/* Translate the GXID into a handle and defer to the handle-based call */
+	return GTM_RollbackTransaction(GTM_GXIDToHandle(gxid));
+}
+
+/*
+ * Abort a batch of transactions. status[] receives STATUS_OK or STATUS_ERROR
+ * for each handle; the return value is always txn_count.
+ */
+int
+GTM_RollbackTransactionMulti(GTM_TransactionHandle txn[], int txn_count, int status[])
+{
+	GTM_TransactionInfo *gtm_txninfo[txn_count];
+	int idx;
+
+	for (idx = 0; idx < txn_count; idx++)
+	{
+		GTM_TransactionInfo *info = GTM_HandleToTransactionInfo(txn[idx]);
+
+		gtm_txninfo[idx] = info;
+		if (info != NULL)
+		{
+			/* Flag the transaction as aborting, under its own lock */
+			GTM_RWLockAcquire(&info->gti_lock, GTM_LOCKMODE_WRITE);
+			info->gti_state = GTM_TXN_ABORT_IN_PROGRESS;
+			GTM_RWLockRelease(&info->gti_lock);
+			status[idx] = STATUS_OK;
+		}
+		else
+			status[idx] = STATUS_ERROR;	/* handle did not resolve */
+	}
+
+	/* Entries that did not resolve are passed along as NULL */
+	GTM_RemoveTransInfoMulti(gtm_txninfo, txn_count);
+
+	return txn_count;
+}
+
+/*
+ * Roll back one transaction (a batch of one); returns its per-handle status
+ */
+int
+GTM_RollbackTransaction(GTM_TransactionHandle txn)
+{
+	int outcome;
+	GTM_RollbackTransactionMulti(&txn, 1, &outcome);
+	return outcome;
+}
+
+
+/*
+ * Commit the transaction identified by its GXID rather than its handle
+ */
+int
+GTM_CommitTransactionGXID(GlobalTransactionId gxid)
+{
+	/* Translate the GXID into a handle and defer to the handle-based call */
+	return GTM_CommitTransaction(GTM_GXIDToHandle(gxid));
+}
+
+/*
+ * Commit a batch of transactions. status[] receives STATUS_OK or STATUS_ERROR
+ * for each handle; the return value is always txn_count.
+ */
+int
+GTM_CommitTransactionMulti(GTM_TransactionHandle txn[], int txn_count, int status[])
+{
+	GTM_TransactionInfo *gtm_txninfo[txn_count];
+	int idx;
+
+	for (idx = 0; idx < txn_count; idx++)
+	{
+		GTM_TransactionInfo *info = GTM_HandleToTransactionInfo(txn[idx]);
+
+		gtm_txninfo[idx] = info;
+		if (info != NULL)
+		{
+			/* Flag the transaction as committing, under its own lock */
+			GTM_RWLockAcquire(&info->gti_lock, GTM_LOCKMODE_WRITE);
+			info->gti_state = GTM_TXN_COMMIT_IN_PROGRESS;
+			GTM_RWLockRelease(&info->gti_lock);
+			status[idx] = STATUS_OK;
+		}
+		else
+			status[idx] = STATUS_ERROR;	/* handle did not resolve */
+	}
+
+	/* Entries that did not resolve are passed along as NULL */
+	GTM_RemoveTransInfoMulti(gtm_txninfo, txn_count);
+	return txn_count;
+}
+
+/*
+ * Commit one transaction (a batch of one); returns its per-handle status
+ */
+int
+GTM_CommitTransaction(GTM_TransactionHandle txn)
+{
+	int outcome;
+	GTM_CommitTransactionMulti(&txn, 1, &outcome);
+	return outcome;
+}
+
+/*
+ * Prepare a transaction: record the participating nodes and mark it as
+ * GTM_TXN_PREPARE_IN_PROGRESS. Returns STATUS_ERROR for an unknown handle
+ * or when nodecnt exceeds GTM_MAX_2PC_NODES, STATUS_OK otherwise.
+ */
+int
+GTM_PrepareTransaction(GTM_TransactionHandle txn,
+					   uint32 nodecnt,
+					   PGXC_NodeId nodes[])
+{
+	GTM_TransactionInfo *gtm_txninfo = GTM_HandleToTransactionInfo(txn);
+
+	/* The node buffer allocated below has GTM_MAX_2PC_NODES entries */
+	if (gtm_txninfo == NULL || nodecnt > GTM_MAX_2PC_NODES)
+		return STATUS_ERROR;
+
+	/* Mark the transaction as being prepared */
+	GTM_RWLockAcquire(&gtm_txninfo->gti_lock, GTM_LOCKMODE_WRITE);
+	gtm_txninfo->gti_state = GTM_TXN_PREPARE_IN_PROGRESS;
+	gtm_txninfo->gti_nodecount = nodecnt;
+	if (gtm_txninfo->gti_nodes == NULL)
+		gtm_txninfo->gti_nodes = (PGXC_NodeId *)MemoryContextAlloc(TopMostMemoryContext, sizeof (PGXC_NodeId) * GTM_MAX_2PC_NODES);
+	memcpy(gtm_txninfo->gti_nodes, nodes, sizeof (PGXC_NodeId) * nodecnt);
+
+	GTM_RWLockRelease(&gtm_txninfo->gti_lock);
+
+	return STATUS_OK;
+}
+
+/*
+ * Prepare the transaction identified by its GXID rather than its handle
+ */
+int
+GTM_PrepareTransactionGXID(GlobalTransactionId gxid,
+					   uint32 nodecnt,
+					   PGXC_NodeId nodes[])
+{
+	/* Translate the GXID into a handle and defer to the handle-based call */
+	return GTM_PrepareTransaction(GTM_GXIDToHandle(gxid), nodecnt, nodes);
+}
+
+/*
+ * Return the gti_state recorded for the transaction behind the handle
+ */
+GTM_TransactionStates
+GTM_GetStatus(GTM_TransactionHandle txn)
+{
+	/* NOTE(review): the lookup result is not checked for NULL before use */
+	return GTM_HandleToTransactionInfo(txn)->gti_state;
+}
+
+/*
+ * Report the state of the transaction identified by its GXID
+ */
+GTM_TransactionStates
+GTM_GetStatusGXID(GlobalTransactionId gxid)
+{
+	/* Translate the GXID into a handle and defer to the handle-based call */
+	return GTM_GetStatus(GTM_GXIDToHandle(gxid));
+}
+
+/*
+ * Process MSG_TXN_BEGIN message
+ *
+ * Begins a transaction as described by the message and replies with its handle.
+ */
+void
+ProcessBeginTransactionCommand(Port *myport, StringInfo message)
+{
+	StringInfoData reply;
+	MemoryContext callerContext;
+	GTM_TransactionHandle handle;
+	GTM_IsolationLevel isolation;
+	bool read_only;
+
+	isolation = pq_getmsgint(message, sizeof (GTM_IsolationLevel));
+	read_only = pq_getmsgbyte(message);
+
+	callerContext = MemoryContextSwitchTo(TopMemoryContext);
+
+	/* XXX Port should contain Coordinator Id - replace 0 with that */
+	handle = GTM_BeginTransaction(0, isolation, read_only);
+	if (handle == InvalidTransactionHandle)
+		ereport(ERROR,
+				(EINVAL,
+				 errmsg("Failed to start a new transaction")));
+
+	MemoryContextSwitchTo(callerContext);
+
+	pq_beginmessage(&reply, 'S');
+	pq_sendint(&reply, TXN_BEGIN_RESULT, 4);
+	if (myport->is_proxy)
+	{
+		/* Proxied replies carry a header with the connection id first */
+		GTM_ProxyMsgHeader proxyhdr;
+
+		proxyhdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&reply, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+	}
+	pq_sendbytes(&reply, (char *)&handle, sizeof(handle));
+	pq_endmessage(myport, &reply);
+
+	/* Only direct (non-proxy) connections are flushed here */
+	if (!myport->is_proxy)
+		pq_flush(myport);
+}
+
+/*
+ * Process MSG_TXN_BEGIN_GETGXID message
+ *
+ * Begins a transaction, assigns it a GXID and replies with the GXID.
+ */
+void
+ProcessBeginTransactionGetGXIDCommand(Port *myport, StringInfo message)
+{
+	StringInfoData reply;
+	MemoryContext callerContext;
+	GTM_TransactionHandle handle;
+	GlobalTransactionId new_gxid;
+	GTM_IsolationLevel isolation;
+	bool read_only;
+
+	isolation = pq_getmsgint(message, sizeof (GTM_IsolationLevel));
+	read_only = pq_getmsgbyte(message);
+
+	callerContext = MemoryContextSwitchTo(TopMemoryContext);
+
+	/* XXX Port should contain Coordinator Id - replace 0 with that */
+	handle = GTM_BeginTransaction(0, isolation, read_only);
+	if (handle == InvalidTransactionHandle)
+		ereport(ERROR,
+				(EINVAL,
+				 errmsg("Failed to start a new transaction")));
+
+	new_gxid = GTM_GetGlobalTransactionId(handle);
+	if (new_gxid == InvalidGlobalTransactionId)
+		ereport(ERROR,
+				(EINVAL,
+				 errmsg("Failed to get a new transaction id")));
+
+	MemoryContextSwitchTo(callerContext);
+
+	elog(LOG, "Sending transaction id %u", new_gxid);
+
+	pq_beginmessage(&reply, 'S');
+	pq_sendint(&reply, TXN_BEGIN_GETGXID_RESULT, 4);
+	if (myport->is_proxy)
+	{
+		/* Proxied replies carry a header with the connection id first */
+		GTM_ProxyMsgHeader proxyhdr;
+
+		proxyhdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&reply, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+	}
+	pq_sendbytes(&reply, (char *)&new_gxid, sizeof(new_gxid));
+	pq_endmessage(myport, &reply);
+
+	/* Only direct (non-proxy) connections are flushed here */
+	if (!myport->is_proxy)
+		pq_flush(myport);
+}
+
+/*
+ * Process MSG_TXN_BEGIN_GETGXID_AUTOVACUUM message
+ *
+ * Begins a transaction, assigns it a GXID, flags it through
+ * GTM_SetDoVacuum() and replies with the GXID.
+ */
+void
+ProcessBeginTransactionGetGXIDAutovacuumCommand(Port *myport, StringInfo message)
+{
+	GTM_IsolationLevel txn_isolation_level;
+	bool txn_read_only;
+	StringInfoData buf;
+	GTM_TransactionHandle txn;
+	GlobalTransactionId gxid;
+	MemoryContext oldContext;
+
+	elog(DEBUG3, "Inside ProcessBeginTransactionGetGXIDAutovacuumCommand");
+
+	txn_isolation_level = pq_getmsgint(message, sizeof (GTM_IsolationLevel));
+	txn_read_only = pq_getmsgbyte(message);
+
+	oldContext = MemoryContextSwitchTo(TopMemoryContext);
+
+	/* XXX Port should contain Coordinator Id - replace 0 with that */
+	txn = GTM_BeginTransaction(0, txn_isolation_level, txn_read_only);
+	if (txn == InvalidTransactionHandle)
+		ereport(ERROR,
+				(EINVAL,
+				 errmsg("Failed to start a new transaction")));
+
+	gxid = GTM_GetGlobalTransactionId(txn);
+	if (gxid == InvalidGlobalTransactionId)
+		ereport(ERROR,
+				(EINVAL,
+				 errmsg("Failed to get a new transaction id")));
+
+	/* Indicate that it is for autovacuum */
+	GTM_SetDoVacuum(txn);
+
+	MemoryContextSwitchTo(oldContext);
+
+	/* GXIDs are unsigned, so print with %u as the sibling handlers do */
+	elog(DEBUG3, "Sending transaction id %u", gxid);
+
+	pq_beginmessage(&buf, 'S');
+	pq_sendint(&buf, TXN_BEGIN_GETGXID_AUTOVACUUM_RESULT, 4);
+	if (myport->is_proxy)
+	{
+		GTM_ProxyMsgHeader proxyhdr;
+		proxyhdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+	}
+	pq_sendbytes(&buf, (char *)&gxid, sizeof(gxid));
+	pq_endmessage(myport, &buf);
+
+	if (!myport->is_proxy)
+		pq_flush(myport);
+	return;
+}
+
+/*
+ * Process MSG_TXN_BEGIN_GETGXID_MULTI message
+ *
+ * Begins txn_count transactions and replies with the count and the first GXID.
+ */
+void
+ProcessBeginTransactionGetGXIDCommandMulti(Port *myport, StringInfo message)
+{
+	GTM_IsolationLevel txn_isolation_level[GTM_MAX_GLOBAL_TRANSACTIONS];
+	bool txn_read_only[GTM_MAX_GLOBAL_TRANSACTIONS];
+	int txn_count;
+	StringInfoData buf;
+	GTM_TransactionHandle txn[GTM_MAX_GLOBAL_TRANSACTIONS];
+	GlobalTransactionId gxid, end_gxid;
+	GTMProxy_ConnID txn_connid[GTM_MAX_GLOBAL_TRANSACTIONS];
+	MemoryContext oldContext;
+	int count;
+	int ii;
+
+	txn_count = pq_getmsgint(message, sizeof (int));
+
+	if (txn_count <= 0)
+		elog(PANIC, "Zero or less transaction count");
+	/* The per-transaction arrays above hold GTM_MAX_GLOBAL_TRANSACTIONS entries */
+	if (txn_count > GTM_MAX_GLOBAL_TRANSACTIONS)
+		ereport(ERROR, (EPROTO, errmsg("Too many transactions in one message")));
+
+	for (ii = 0; ii < txn_count; ii++)
+	{
+		txn_isolation_level[ii] = pq_getmsgint(message, sizeof (GTM_IsolationLevel));
+		txn_read_only[ii] = pq_getmsgbyte(message);
+		txn_connid[ii] = pq_getmsgint(message, sizeof (GTMProxy_ConnID));
+	}
+
+	oldContext = MemoryContextSwitchTo(TopMemoryContext);
+
+	/* XXX Port should contain Coordinator Id - replace 0 with that */
+	count = GTM_BeginTransactionMulti(0, txn_isolation_level, txn_read_only, txn_connid,
+									  txn_count, txn);
+	if (count != txn_count)
+		ereport(ERROR,
+				(EINVAL,
+				 errmsg("Failed to start %d new transactions", txn_count)));
+
+	gxid = GTM_GetGlobalTransactionIdMulti(txn, txn_count);
+	if (gxid == InvalidGlobalTransactionId)
+		ereport(ERROR,
+				(EINVAL,
+				 errmsg("Failed to get a new transaction id")));
+
+	MemoryContextSwitchTo(oldContext);
+
+	end_gxid = gxid + txn_count;
+	if (end_gxid < gxid)
+		end_gxid += FirstNormalGlobalTransactionId;
+
+	elog(LOG, "Sending transaction ids from %u to %u", gxid, end_gxid);
+
+	pq_beginmessage(&buf, 'S');
+	pq_sendint(&buf, TXN_BEGIN_GETGXID_MULTI_RESULT, 4);
+	if (myport->is_proxy)
+	{
+		GTM_ProxyMsgHeader proxyhdr;
+		proxyhdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+	}
+	pq_sendbytes(&buf, (char *)&txn_count, sizeof(txn_count));
+	pq_sendbytes(&buf, (char *)&gxid, sizeof(gxid));
+	pq_endmessage(myport, &buf);
+
+	if (!myport->is_proxy)
+		pq_flush(myport);
+}
+
+/*
+ * Process MSG_TXN_COMMIT message
+ *
+ * The transaction is named either by GXID or by handle, as told by the
+ * leading byte of the message. The reply carries the GXID and the status.
+ */
+void
+ProcessCommitTransactionCommand(Port *myport, StringInfo message)
+{
+	StringInfoData buf;
+	GTM_TransactionHandle txn;
+	GlobalTransactionId gxid = InvalidGlobalTransactionId;	/* echoed in the reply */
+	int isgxid = 0;
+	MemoryContext oldContext;
+	int status = STATUS_OK;
+
+	isgxid = pq_getmsgbyte(message);
+
+	if (isgxid)
+	{
+		const char *data = pq_getmsgbytes(message, sizeof (gxid));
+		if (data == NULL)
+			ereport(ERROR,
+					(EPROTO,
+					 errmsg("Message does not contain valid GXID")));
+		memcpy(&gxid, data, sizeof (gxid));
+		txn = GTM_GXIDToHandle(gxid);
+	}
+	else
+	{
+		const char *data = pq_getmsgbytes(message, sizeof (txn));
+		if (data == NULL)
+			ereport(ERROR,
+					(EPROTO,
+					 errmsg("Message does not contain valid Transaction Handle")));
+		memcpy(&txn, data, sizeof (txn));
+	}
+
+	pq_getmsgend(message);
+
+	oldContext = MemoryContextSwitchTo(TopMemoryContext);
+
+	/* Commit the transaction */
+	status = GTM_CommitTransaction(txn);
+
+	MemoryContextSwitchTo(oldContext);
+
+	pq_beginmessage(&buf, 'S');
+	pq_sendint(&buf, TXN_COMMIT_RESULT, 4);
+	if (myport->is_proxy)
+	{
+		GTM_ProxyMsgHeader proxyhdr;
+		proxyhdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+	}
+	pq_sendbytes(&buf, (char *)&gxid, sizeof(gxid));
+	pq_sendint(&buf, status, sizeof(status));
+	pq_endmessage(myport, &buf);
+
+	if (!myport->is_proxy)
+		pq_flush(myport);
+}
+
+/*
+ * Process MSG_TXN_ROLLBACK message
+ *
+ * The transaction is named either by GXID or by handle, as told by the
+ * leading byte of the message. The reply carries the GXID and the status.
+ */
+void
+ProcessRollbackTransactionCommand(Port *myport, StringInfo message)
+{
+	StringInfoData buf;
+	GTM_TransactionHandle txn;
+	GlobalTransactionId gxid = InvalidGlobalTransactionId;	/* echoed in the reply */
+	int isgxid = 0;
+	MemoryContext oldContext;
+	int status = STATUS_OK;
+
+	isgxid = pq_getmsgbyte(message);
+
+	if (isgxid)
+	{
+		const char *data = pq_getmsgbytes(message, sizeof (gxid));
+		if (data == NULL)
+			ereport(ERROR,
+					(EPROTO,
+					 errmsg("Message does not contain valid GXID")));
+		memcpy(&gxid, data, sizeof (gxid));
+		txn = GTM_GXIDToHandle(gxid);
+	}
+	else
+	{
+		const char *data = pq_getmsgbytes(message, sizeof (txn));
+		if (data == NULL)
+			ereport(ERROR,
+					(EPROTO,
+					 errmsg("Message does not contain valid Transaction Handle")));
+		memcpy(&txn, data, sizeof (txn));
+	}
+
+	pq_getmsgend(message);
+
+	oldContext = MemoryContextSwitchTo(TopMemoryContext);
+
+	/* Rollback the transaction */
+	status = GTM_RollbackTransaction(txn);
+
+	MemoryContextSwitchTo(oldContext);
+
+	pq_beginmessage(&buf, 'S');
+	pq_sendint(&buf, TXN_ROLLBACK_RESULT, 4);
+	if (myport->is_proxy)
+	{
+		GTM_ProxyMsgHeader proxyhdr;
+		proxyhdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+	}
+	pq_sendbytes(&buf, (char *)&gxid, sizeof(gxid));
+	pq_sendint(&buf, status, sizeof(status));
+	pq_endmessage(myport, &buf);
+
+	if (!myport->is_proxy)
+		pq_flush(myport);
+}
+
+
+/*
+ * Process MSG_TXN_COMMIT_MULTI message
+ *
+ * Commits a batch of transactions and replies with one status for each.
+ */
+void
+ProcessCommitTransactionCommandMulti(Port *myport, StringInfo message)
+{
+	StringInfoData buf;
+	GTM_TransactionHandle txn[GTM_MAX_GLOBAL_TRANSACTIONS];
+	GlobalTransactionId gxid[GTM_MAX_GLOBAL_TRANSACTIONS];
+	int isgxid[GTM_MAX_GLOBAL_TRANSACTIONS];
+	MemoryContext oldContext;
+	int status[GTM_MAX_GLOBAL_TRANSACTIONS];
+	int txn_count, ii;
+
+	txn_count = pq_getmsgint(message, sizeof (int));
+
+	/* The arrays above hold GTM_MAX_GLOBAL_TRANSACTIONS entries each */
+	if (txn_count < 0 || txn_count > GTM_MAX_GLOBAL_TRANSACTIONS)
+		ereport(ERROR, (EPROTO, errmsg("Invalid transaction count %d", txn_count)));
+
+	for (ii = 0; ii < txn_count; ii++)
+	{
+		isgxid[ii] = pq_getmsgbyte(message);
+		if (isgxid[ii])
+		{
+			const char *data = pq_getmsgbytes(message, sizeof (gxid[ii]));
+			if (data == NULL)
+				ereport(ERROR,
+						(EPROTO,
+						 errmsg("Message does not contain valid GXID")));
+			memcpy(&gxid[ii], data, sizeof (gxid[ii]));
+			txn[ii] = GTM_GXIDToHandle(gxid[ii]);
+			elog(DEBUG1, "ProcessCommitTransactionCommandMulti: gxid(%u), handle(%u)", gxid[ii], txn[ii]);
+		}
+		else
+		{
+			const char *data = pq_getmsgbytes(message, sizeof (txn[ii]));
+			if (data == NULL)
+				ereport(ERROR,
+						(EPROTO,
+						 errmsg("Message does not contain valid Transaction Handle")));
+			memcpy(&txn[ii], data, sizeof (txn[ii]));
+			elog(DEBUG1, "ProcessCommitTransactionCommandMulti: handle(%u)", txn[ii]);
+		}
+	}
+
+	pq_getmsgend(message);
+
+	oldContext = MemoryContextSwitchTo(TopMemoryContext);
+	/* Commit the transactions; status[] gets one entry per transaction */
+	GTM_CommitTransactionMulti(txn, txn_count, status);
+	MemoryContextSwitchTo(oldContext);
+
+	pq_beginmessage(&buf, 'S');
+	pq_sendint(&buf, TXN_COMMIT_MULTI_RESULT, 4);
+	if (myport->is_proxy)
+	{
+		GTM_ProxyMsgHeader proxyhdr;
+		proxyhdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+	}
+	pq_sendbytes(&buf, (char *)&txn_count, sizeof(txn_count));
+	pq_sendbytes(&buf, (char *)status, sizeof(int) * txn_count);
+	pq_endmessage(myport, &buf);
+
+	if (!myport->is_proxy)
+		pq_flush(myport);
+}
+
+/*
+ * Process MSG_TXN_ROLLBACK_MULTI message
+ *
+ * Rolls back a batch of transactions and replies with one status for each.
+ */
+void
+ProcessRollbackTransactionCommandMulti(Port *myport, StringInfo message)
+{
+	StringInfoData buf;
+	GTM_TransactionHandle txn[GTM_MAX_GLOBAL_TRANSACTIONS];
+	GlobalTransactionId gxid[GTM_MAX_GLOBAL_TRANSACTIONS];
+	int isgxid[GTM_MAX_GLOBAL_TRANSACTIONS];
+	MemoryContext oldContext;
+	int status[GTM_MAX_GLOBAL_TRANSACTIONS];
+	int txn_count, ii;
+
+	txn_count = pq_getmsgint(message, sizeof (int));
+
+	/* The arrays above hold GTM_MAX_GLOBAL_TRANSACTIONS entries each */
+	if (txn_count < 0 || txn_count > GTM_MAX_GLOBAL_TRANSACTIONS)
+		ereport(ERROR, (EPROTO, errmsg("Invalid transaction count %d", txn_count)));
+
+	for (ii = 0; ii < txn_count; ii++)
+	{
+		isgxid[ii] = pq_getmsgbyte(message);
+		if (isgxid[ii])
+		{
+			const char *data = pq_getmsgbytes(message, sizeof (gxid[ii]));
+			if (data == NULL)
+				ereport(ERROR,
+						(EPROTO,
+						 errmsg("Message does not contain valid GXID")));
+			memcpy(&gxid[ii], data, sizeof (gxid[ii]));
+			txn[ii] = GTM_GXIDToHandle(gxid[ii]);
+			elog(DEBUG1, "ProcessRollbackTransactionCommandMulti: gxid(%u), handle(%u)", gxid[ii], txn[ii]);
+		}
+		else
+		{
+			const char *data = pq_getmsgbytes(message, sizeof (txn[ii]));
+			if (data == NULL)
+				ereport(ERROR,
+						(EPROTO,
+						 errmsg("Message does not contain valid Transaction Handle")));
+			memcpy(&txn[ii], data, sizeof (txn[ii]));
+			elog(DEBUG1, "ProcessRollbackTransactionCommandMulti: handle(%u)", txn[ii]);
+		}
+	}
+
+	pq_getmsgend(message);
+
+	oldContext = MemoryContextSwitchTo(TopMemoryContext);
+	/* Roll back the transactions; status[] gets one entry per transaction */
+	GTM_RollbackTransactionMulti(txn, txn_count, status);
+	MemoryContextSwitchTo(oldContext);
+
+	pq_beginmessage(&buf, 'S');
+	pq_sendint(&buf, TXN_ROLLBACK_MULTI_RESULT, 4);
+	if (myport->is_proxy)
+	{
+		GTM_ProxyMsgHeader proxyhdr;
+		proxyhdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+	}
+	pq_sendbytes(&buf, (char *)&txn_count, sizeof(txn_count));
+	pq_sendbytes(&buf, (char *)status, sizeof(int) * txn_count);
+	pq_endmessage(myport, &buf);
+
+	if (!myport->is_proxy)
+		pq_flush(myport);
+}
+
+/*
+ * Process MSG_TXN_PREPARE message
+ *
+ * The transaction is named by GXID or by handle; the reply carries the GXID.
+ */
+void
+ProcessPrepareTransactionCommand(Port *myport, StringInfo message)
+{
+	StringInfoData buf;
+	GTM_TransactionHandle txn;
+	GlobalTransactionId gxid = InvalidGlobalTransactionId;	/* echoed in the reply */
+	int isgxid = 0;
+	int nodecnt;
+	PGXC_NodeId *nodes;
+	MemoryContext oldContext;
+
+	isgxid = pq_getmsgbyte(message);
+
+	if (isgxid)
+	{
+		const char *data = pq_getmsgbytes(message, sizeof (gxid));
+		if (data == NULL)
+			ereport(ERROR,
+					(EPROTO,
+					 errmsg("Message does not contain valid GXID")));
+		memcpy(&gxid, data, sizeof (gxid));
+		txn = GTM_GXIDToHandle(gxid);
+	}
+	else
+	{
+		const char *data = pq_getmsgbytes(message, sizeof (txn));
+		if (data == NULL)
+			ereport(ERROR,
+					(EPROTO,
+					 errmsg("Message does not contain valid Transaction Handle")));
+		memcpy(&txn, data, sizeof (txn));
+	}
+
+	/* nodecnt comes off the wire; the per-transaction node buffer has GTM_MAX_2PC_NODES entries */
+	nodecnt = pq_getmsgint(message, sizeof (nodecnt));
+	if (nodecnt < 0 || nodecnt > GTM_MAX_2PC_NODES)
+		ereport(ERROR, (EPROTO, errmsg("Invalid node count %d", nodecnt)));
+	nodes = (PGXC_NodeId *) palloc(sizeof (PGXC_NodeId) * nodecnt);
+	memcpy(nodes, pq_getmsgbytes(message, sizeof (PGXC_NodeId) * nodecnt),
+			sizeof (PGXC_NodeId) * nodecnt);
+
+	pq_getmsgend(message);
+
+	oldContext = MemoryContextSwitchTo(TopMemoryContext);
+	/* Prepare the transaction */
+	if (GTM_PrepareTransaction(txn, nodecnt, nodes) != STATUS_OK)
+		ereport(ERROR,
+				(EINVAL,
+				 errmsg("Failed to prepare the transaction")));
+
+	MemoryContextSwitchTo(oldContext);
+	pfree(nodes);
+
+	pq_beginmessage(&buf, 'S');
+	pq_sendint(&buf, TXN_PREPARE_RESULT, 4);
+	if (myport->is_proxy)
+	{
+		GTM_ProxyMsgHeader proxyhdr;
+		proxyhdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+	}
+	pq_sendbytes(&buf, (char *)&gxid, sizeof(gxid));
+	pq_endmessage(myport, &buf);
+
+	if (!myport->is_proxy)
+		pq_flush(myport);
+}
+
+/*
+ * Process MSG_TXN_GET_GXID message
+ *
+ * Assigns a GXID to the transaction named by handle; replies with both.
+ */
+void
+ProcessGetGXIDTransactionCommand(Port *myport, StringInfo message)
+{
+	StringInfoData buf;
+	GTM_TransactionHandle txn;
+	GlobalTransactionId gxid;
+	const char *data;
+	MemoryContext oldContext;
+
+	elog(DEBUG3, "Inside ProcessGetGXIDTransactionCommand");
+
+	data = pq_getmsgbytes(message, sizeof (txn));
+	if (data == NULL)
+		ereport(ERROR,
+				(EPROTO,
+				 errmsg("Message does not contain valid Transaction Handle")));
+	memcpy(&txn, data, sizeof (txn));
+
+	pq_getmsgend(message);
+
+	oldContext = MemoryContextSwitchTo(TopMemoryContext);
+
+	/* Get the transaction id for the given transaction; an invalid GXID is a failure */
+	gxid = GTM_GetGlobalTransactionId(txn);
+	if (!GlobalTransactionIdIsValid(gxid))
+		ereport(ERROR,
+				(EINVAL,
+				 errmsg("Failed to get the transaction id")));
+
+	MemoryContextSwitchTo(oldContext);
+
+	elog(DEBUG3, "Sending transaction id %u", gxid);
+
+	pq_beginmessage(&buf, 'S');
+	pq_sendint(&buf, TXN_GET_GXID_RESULT, 4);
+	if (myport->is_proxy)
+	{
+		GTM_ProxyMsgHeader proxyhdr;
+		proxyhdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+	}
+	pq_sendbytes(&buf, (char *)&txn, sizeof(txn));
+	pq_sendbytes(&buf, (char *)&gxid, sizeof(gxid));
+	pq_endmessage(myport, &buf);
+
+	if (!myport->is_proxy)
+		pq_flush(myport);
+	return;
+}
+
+
+/*
+ * Mark GTM as shutting down. From this point onwards no new GXIDs are issued,
+ * to ensure that the last GXID recorded in the control file remains sane
+ */
+void
+GTM_SetShuttingDown(void)
+{
+	GTM_RWLockAcquire(&GTMTransactions.gt_XidGenLock, GTM_LOCKMODE_WRITE);
+	GTMTransactions.gt_gtm_state = GTM_SHUTTING_DOWN;	/* changed under the XID generation lock */
+	GTM_RWLockRelease(&GTMTransactions.gt_XidGenLock);
+}
+
+/* Restore the next GXID at startup: from next_gxid if valid, else from ctlfd */
+void
+GTM_RestoreTxnInfo(int ctlfd, GlobalTransactionId next_gxid)
+{
+	GlobalTransactionId saved_gxid;
+
+	if (ctlfd != -1)
+	{
+		if ((read(ctlfd, &saved_gxid, sizeof (saved_gxid)) != sizeof (saved_gxid)) &&
+			(!GlobalTransactionIdIsValid(next_gxid)))
+			return;	/* nothing read and nothing given: leave the state untouched */
+		if (!GlobalTransactionIdIsValid(next_gxid))	/* a caller-supplied GXID wins */
+			next_gxid = saved_gxid;
+	}
+	/* NOTE(review): with ctlfd == -1 and an invalid next_gxid we still get here */
+	elog(LOG, "Restoring last GXID to %u\n", next_gxid);
+	if (GlobalTransactionIdIsValid(next_gxid))
+		SetNextGlobalTransactionId(next_gxid);
+	/* Set this otherwise a strange snapshot might be returned for the first one */
+	GTMTransactions.gt_latestCompletedXid = next_gxid - 1;
+	return;
+}
+
+/* Write the next to-be-assigned GXID to the control file open on ctlfd */
+void
+GTM_SaveTxnInfo(int ctlfd)
+{
+	GlobalTransactionId next_gxid = ReadNewGlobalTransactionId();
+
+	elog(LOG, "Saving transaction info - next_gxid: %u", next_gxid);
+	/* Do not silently ignore a failed or short write */
+	if (write(ctlfd, &next_gxid, sizeof (next_gxid)) != sizeof (next_gxid))
+		elog(LOG, "Failed to save next_gxid %u to the control file", next_gxid);
+}
+/*
+ * TODO
+ */
+int GTM_GetAllTransactions(GTM_TransactionInfo txninfo[], uint32 txncnt);
+
+/*
+ * TODO
+ */
+uint32 GTM_GetAllPrepared(GlobalTransactionId gxids[], uint32 gxidcnt);
+
diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c
new file mode 100644
index 0000000000..0ef09c436a
--- /dev/null
+++ b/src/gtm/main/main.c
@@ -0,0 +1,1370 @@
+/*-------------------------------------------------------------------------
+ *
+ * main.c
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <time.h>
+#include <unistd.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <sys/select.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <getopt.h>
+#include <stdio.h>
+
+#include "gtm/gtm_c.h"
+#include "gtm/gtm.h"
+#include "gtm/elog.h"
+#include "gtm/memutils.h"
+#include "gtm/gtm_list.h"
+#include "gtm/libpq.h"
+#include "gtm/libpq-be.h"
+#include "gtm/pqsignal.h"
+#include "gtm/pqformat.h"
+#include "gtm/assert.h"
+#include "gtm/gtm_txn.h"
+#include "gtm/gtm_seq.h"
+#include "gtm/gtm_msg.h"
+
+extern int	optind;
+extern char *optarg;
+
+#define GTM_MAX_PATH			1024
+#define GTM_DEFAULT_HOSTNAME	"*"
+#define GTM_DEFAULT_PORT		6666
+#define GTM_CONTROL_FILE		"gtm.control"
+#define GTM_PID_FILE			"gtm.pid"
+#define GTM_LOG_FILE			"gtm.log"
+
+static char *progname = "gtm";
+char	   *ListenAddresses;
+int			GTMPortNumber;
+char		GTMControlFile[GTM_MAX_PATH];
+char		*GTMDataDir;
+
+/* The socket(s) we're listening to. */
+#define MAXLISTEN	64
+static int	ListenSocket[MAXLISTEN];
+
+pthread_key_t	threadinfo_key;
+static bool		GTMAbortPending = false;
+
+static Port *ConnCreate(int serverFd);
+static int ServerLoop(void);
+static int initMasks(fd_set *rmask);
+void *GTM_ThreadMain(void *argp);
+static int GTMAddConnection(Port *port);
+static int ReadCommand(Port *myport, StringInfo inBuf);
+
+static void ProcessCommand(Port *myport, StringInfo input_message);
+static void ProcessCoordinatorCommand(Port *myport, GTM_MessageType mtype, StringInfo message);
+static void ProcessTransactionCommand(Port *myport, GTM_MessageType mtype, StringInfo message);
+static void ProcessSnapshotCommand(Port *myport, GTM_MessageType mtype, StringInfo message);
+static void ProcessSeqeunceCommand(Port *myport, GTM_MessageType mtype, StringInfo message);
+static void ProcessQueryCommand(Port *myport, GTM_MessageType mtype, StringInfo message);
+
+static void GTM_RegisterCoordinator(Port *myport, GTM_CoordinatorId coordinator_id);
+static void GTM_UnregisterCoordinator(Port *myport, GTM_CoordinatorId coordinator_id);
+	
+static bool CreateOptsFile(int argc, char *argv[]);
+static void CreateDataDirLockFile(void);
+static void CreateLockFile(const char *filename, const char *refName);
+static void ChangeToDataDir(void);
+static void checkDataDir(void);
+static void DeleteLockFile(const char *filename);
+
+/*
+ * One-time initialization. It's called immediately after the main process
+ * starts, and exits if the thread info cannot be allocated or registered
+ */
+static GTM_ThreadInfo *
+MainThreadInit()
+{
+	GTM_ThreadInfo *thrinfo;
+
+	pthread_key_create(&threadinfo_key, NULL);
+
+	/* Initialize the lock protecting the global threads info */
+	GTM_RWLockInit(&GTMThreads->gt_lock);
+
+	/*
+	 * We are called even before memory context management is setup. We must
+	 * use malloc
+	 */
+	thrinfo = (GTM_ThreadInfo *)malloc(sizeof (GTM_ThreadInfo));
+
+	if (thrinfo == NULL)
+	{
+		fprintf(stderr, "malloc failed: %d", errno);
+		fflush(stdout);
+		fflush(stderr);
+		exit(1);		/* nothing can work without the thread info */
+	}
+
+	if (SetMyThreadInfo(thrinfo))
+	{
+		fprintf(stderr, "SetMyThreadInfo failed: %d", errno);
+		fflush(stdout);
+		fflush(stderr);
+		exit(1);
+	}
+
+	return thrinfo;
+}
+
+/* Set up the main thread, memory contexts, data directory and log file */
+static void
+BaseInit()
+{
+	GTM_ThreadInfo *thrinfo;
+
+	thrinfo = MainThreadInit();
+	MyThreadID = pthread_self();
+
+	MemoryContextInit();
+
+	checkDataDir();
+	ChangeToDataDir();
+	CreateDataDirLockFile();
+
+	/* Both destinations are GTM_MAX_PATH bytes; an overlong path is truncated, not overrun */
+	snprintf(GTMControlFile, GTM_MAX_PATH, "%s/%s", GTMDataDir, GTM_CONTROL_FILE);
+	if (GTMLogFile == NULL)
+	{
+		GTMLogFile = (char *) malloc(GTM_MAX_PATH);
+		snprintf(GTMLogFile, GTM_MAX_PATH, "%s/%s", GTMDataDir, GTM_LOG_FILE);
+	}
+
+	DebugFileOpen();
+
+	GTM_InitTxnManager();
+	GTM_InitSeqManager();
+
+	/*
+	 * The memory context is now set up; add thrinfo to the global array
+	 */
+	if (GTM_ThreadAdd(thrinfo) == -1)
+	{
+		fprintf(stderr, "GTM_ThreadAdd for main thread failed: %d", errno);
+		fflush(stdout);
+		fflush(stderr);
+	}
+}
+
+/* Signal handler: removes the pid file and flags a pending abort */
+static void
+GTM_SigleHandler(int signal)
+{
+	fprintf(stderr, "Received signal %d", signal);	/* NOTE(review): fprintf() is not async-signal-safe */
+	switch (signal)
+	{
+		case SIGKILL:	/* NOTE(review): SIGKILL cannot be caught, so this is never hit */
+		case SIGTERM:
+		case SIGQUIT:
+		case SIGINT:
+		case SIGHUP:
+			break;
+
+		default:
+			fprintf(stderr, "Unknown signal %d\n", signal);
+			return;
+	}
+
+	/*
+	 * XXX We should do a clean shutdown here.
+	 */
+	/* Delete pid file before shutting down */
+	DeleteLockFile(GTM_PID_FILE);
+
+	PG_SETMASK(&BlockSig);
+	GTMAbortPending = true;
+
+	return;
+}
+
+/*
+ * Print the usage summary; it should match the options accepted by main()
+ */
+static void
+help(const char *progname)
+{
+	printf(_("This is the GTM server.\n\n"));
+	printf(_("Usage:\n  %s [OPTION]...\n\n"), progname);
+	printf(_("Options:\n"));
+	printf(_("  -h hostname     GTM server hostname/IP\n"));
+	printf(_("  -p port			GTM server port number\n"));
+	printf(_("  -x xid			Starting GXID \n"));
+	printf(_("  -D directory	GTM working directory\n"));
+	printf(_("  -l filename		GTM server log file name \n"));
+	printf(_("  --help          show this help, then exit\n"));
+}
+
+/* GTM server entry point: parse options, restore saved state, then serve */
+int
+main(int argc, char *argv[])
+{
+	int			opt;
+	int			status;
+	int			i;
+	GlobalTransactionId next_gxid = InvalidGlobalTransactionId;
+	int			ctlfd;
+
+	/*
+	 * Catch standard options before doing much else
+	 */
+	if (argc > 1)
+	{
+		if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
+		{
+			help(argv[0]);
+			exit(0);
+		}
+	}
+
+	ListenAddresses = GTM_DEFAULT_HOSTNAME;
+	GTMPortNumber = GTM_DEFAULT_PORT;
+
+	/*
+	 * Parse the command line options and set variables
+	 */
+	while ((opt = getopt(argc, argv, "h:p:x:D:l:")) != -1)
+	{
+		switch (opt)
+		{
+			case 'h':
+				ListenAddresses = strdup(optarg);
+				break;
+
+			case 'p':
+				GTMPortNumber = atoi(optarg);
+				break;
+
+			case 'x':
+				next_gxid = (GlobalTransactionId) atoll(optarg);
+				break;
+
+			case 'D':
+				GTMDataDir = strdup(optarg);
+				canonicalize_path(GTMDataDir);
+				break;
+
+			case 'l':
+				GTMLogFile = strdup(optarg);
+				break;
+
+			default:
+				write_stderr("Try \"%s --help\" for more information.\n",
+							 progname);
+				exit(1);		/* do not start up with a bad command line */
+		}
+	}
+
+	if (GTMDataDir == NULL)
+	{
+		write_stderr("GTM data directory must be specified\n");
+		write_stderr("Try \"%s --help\" for more information.\n",
+					 progname);
+		exit(1);
+	}
+	/*
+	 * GTM accepts no non-option switch arguments.
+	 */
+	if (optind < argc)
+	{
+		write_stderr("%s: invalid argument: \"%s\"\n",
+					 progname, argv[optind]);
+		write_stderr("Try \"%s --help\" for more information.\n",
+					 progname);
+		exit(1);
+	}
+
+	/*
+	 * Some basic initialization must happen before we do anything
+	 * useful
+	 */
+	BaseInit();
+
+	elog(DEBUG3, "Starting GTM server at (%s:%d) -- control file %s", ListenAddresses, GTMPortNumber, GTMControlFile);
+
+	/*
+	 * Read the last GXID and start from there
+	 */
+	ctlfd = open(GTMControlFile, O_RDONLY);
+
+	GTM_RestoreTxnInfo(ctlfd, next_gxid);
+	GTM_RestoreSeqInfo(ctlfd);
+
+	/* open() leaves ctlfd at -1 on failure; only close a valid descriptor */
+	if (ctlfd != -1)
+		close(ctlfd);
+
+	/*
+	 * Establish input sockets.
+	 */
+	for (i = 0; i < MAXLISTEN; i++)
+		ListenSocket[i] = -1;
+
+	if (ListenAddresses)
+	{
+		status = StreamServerPort(AF_UNSPEC, ListenAddresses,
+								  (unsigned short) GTMPortNumber,
+								  ListenSocket, MAXLISTEN);
+		if (status != STATUS_OK)
+			ereport(FATAL,
+					(errmsg("could not create listen socket for \"%s\"",
+							ListenAddresses)));
+	}
+
+	/*
+	 * check that we have some socket to listen on
+	 */
+	if (ListenSocket[0] == -1)
+		ereport(FATAL,
+				(errmsg("no socket created for listening")));
+
+	/*
+	 * Record gtm options.  We delay this till now to avoid recording
+	 * bogus options
+	 */
+	if (!CreateOptsFile(argc, argv))
+		exit(1);
+
+	pqsignal(SIGHUP, GTM_SigleHandler);
+	pqsignal(SIGKILL, GTM_SigleHandler);	/* NOTE(review): SIGKILL cannot be caught */
+	pqsignal(SIGQUIT, GTM_SigleHandler);
+	pqsignal(SIGTERM, GTM_SigleHandler);
+	pqsignal(SIGINT, GTM_SigleHandler);
+
+	pqinitmask();
+
+	/*
+	 * Accept any new connections. Fork a new thread for each incoming
+	 * connection
+	 */
+	status = ServerLoop();
+
+	/*
+	 * ServerLoop probably shouldn't ever return, but if it does, close down.
+	 */
+	exit(status != STATUS_OK);
+
+	return 0;					/* not reached */
+}
+
+/*
+ * ConnCreate -- create a local connection data structure
+ *
+ * Allocates a zeroed Port and accepts the pending connection on the listen
+ * socket serverFd into it.
+ *
+ * Returns the new Port with conn_id set to InvalidGTMProxyConnID, or NULL
+ * when the connection could not be accepted (the Port is released in that
+ * case).  Running out of memory terminates the process.
+ */
+static Port *
+ConnCreate(int serverFd)
+{
+	Port	   *port;
+
+	if (!(port = (Port *) calloc(1, sizeof(Port))))
+	{
+		ereport(LOG,
+				(ENOMEM,
+				 errmsg("out of memory")));
+		exit(1);
+	}
+
+	if (StreamConnection(serverFd, port) != STATUS_OK)
+	{
+		if (port->sock >= 0)
+			StreamClose(port->sock);
+		ConnFree(port);
+		/* bail out here: the Port is gone and must not be touched again */
+		return NULL;
+	}
+
+	port->conn_id = InvalidGTMProxyConnID;
+	return port;
+}
+
+/*
+ * ConnFree -- free a local connection data structure
+ *
+ * Releases the memory of conn with free().  Nothing else is done here; in
+ * particular the socket is not closed (callers in this file call
+ * StreamClose() themselves before freeing).
+ */
+void
+ConnFree(Port *conn)
+{
+	free(conn);
+}
+
+/*
+ * ServerLoop -- main idle loop of the GTM server
+ *
+ * Waits with select() on the listen sockets, hands every accepted connection
+ * to GTMAddConnection(), and checks GTMAbortPending once per iteration.  When
+ * an abort is pending, the transaction and sequence state is written to the
+ * control file and the process exits.
+ *
+ * Returns STATUS_ERROR if select() fails with anything other than EINTR or
+ * EWOULDBLOCK; otherwise it does not return.
+ */
+static int
+ServerLoop(void)
+{
+	fd_set		readmask;
+	int			nSockets;
+
+	nSockets = initMasks(&readmask);
+
+	for (;;)
+	{
+		fd_set		rmask;
+		int			selres;
+
+		//MemoryContextStats(TopMostMemoryContext);
+		
+		/*
+		 * Wait for a connection request to arrive.
+		 *
+		 * We wait at most one minute, so that the GTMAbortPending check
+		 * below is reached even when no requests are arriving.
+		 *
+		 * select() modifies its fd_set argument, so work on a fresh copy of
+		 * the master mask each time round.
+		 */
+		memcpy((char *) &rmask, (char *) &readmask, sizeof(fd_set));
+
+		PG_SETMASK(&UnBlockSig);
+
+		if (GTMAbortPending)
+		{
+			int ctlfd;
+
+			/*
+			 * XXX We should do a clean shutdown here. For the time being, just
+			 * write the next GXID to be issued in the control file and exit
+			 * gracefully
+			 */
+
+			/*
+			 * Tell GTM that we are shutting down so that no new GXIDs are
+			 * issued this point onwards
+			 */
+			GTM_SetShuttingDown();
+
+			/* the control file is rewritten from scratch, owner-only access */
+			ctlfd = open(GTMControlFile, O_WRONLY | O_TRUNC | O_CREAT,
+						 S_IRUSR | S_IWUSR);
+			if (ctlfd == -1)
+			{
+				fprintf(stderr, "Failed to create/open the control file\n");
+				exit(2);
+			}
+
+			GTM_SaveTxnInfo(ctlfd);
+			GTM_SaveSeqInfo(ctlfd);
+
+			close(ctlfd);
+
+			exit(1);
+		}
+
+		{
+			/* must set timeout each time; some OSes change it! */
+			struct timeval timeout;
+
+			timeout.tv_sec = 60;
+			timeout.tv_usec = 0;
+
+			selres = select(nSockets, &rmask, NULL, NULL, &timeout);
+		}
+
+		/*
+		 * Block all signals until we wait again.  (This makes it safe for our
+		 * signal handlers to do nontrivial work.)
+		 */
+		PG_SETMASK(&BlockSig);
+
+		/* Now check the select() result */
+		if (selres < 0)
+		{
+			/* an interrupted or would-block select() is simply retried */
+			if (errno != EINTR && errno != EWOULDBLOCK)
+			{
+				ereport(LOG,
+						(EACCES,
+						 errmsg("select() failed in postmaster: %m")));
+				return STATUS_ERROR;
+			}
+		}
+
+		/*
+		 * New connection pending on any of our sockets? If so, accept it and
+		 * hand it over to GTMAddConnection().
+		 */
+		if (selres > 0)
+		{
+			int			i;
+
+			for (i = 0; i < MAXLISTEN; i++)
+			{
+				/* the listen socket array is filled from the front */
+				if (ListenSocket[i] == -1)
+					break;
+				if (FD_ISSET(ListenSocket[i], &rmask))
+				{
+					Port	   *port;
+
+					port = ConnCreate(ListenSocket[i]);
+					if (port)
+					{
+						if (GTMAddConnection(port) != STATUS_OK)
+						{
+							/*
+							 * NOTE(review): if elog(ERROR) does not return
+							 * here (as in the PostgreSQL error model), the
+							 * cleanup below is never reached -- confirm.
+							 */
+							elog(ERROR, "Too many connections");
+							StreamClose(port->sock);
+							ConnFree(port);
+						}
+					}
+				}
+			}
+		}
+	}
+}
+
+/*
+ * initMasks -- build the select() read mask for the listen sockets.
+ *
+ * Clears *rmask and adds every descriptor found in ListenSocket[], stopping
+ * at the first unused (-1) slot.  The return value is the highest descriptor
+ * plus one, i.e. the "nfds" argument select() expects (0 if there is no
+ * listen socket at all).
+ */
+static int
+initMasks(fd_set *rmask)
+{
+	int			highest = -1;
+	int			slot;
+
+	FD_ZERO(rmask);
+
+	for (slot = 0; slot < MAXLISTEN && ListenSocket[slot] != -1; slot++)
+	{
+		int			sock = ListenSocket[slot];
+
+		FD_SET(sock, rmask);
+		highest = (sock > highest) ? sock : highest;
+	}
+
+	return highest + 1;
+}
+
+
+/*
+ * GTM_ThreadMain -- start routine of a connection helper thread.
+ *
+ * argp is the GTM_ThreadInfo of the thread; the connection it serves is
+ * reached through thrinfo->thr_conn->con_port.
+ *
+ * The thread first performs the start-up handshake (an 'A' message carrying a
+ * GTM_StartupPacket, answered with an empty 'R' message) and then loops
+ * reading messages: 'C' is dispatched to ProcessCommand(), 'F' flushes the
+ * outgoing data, and 'X' or EOF removes the thread's transactions and ends
+ * the thread with pthread_exit().
+ */
+void *
+GTM_ThreadMain(void *argp)
+{
+	GTM_ThreadInfo *thrinfo = (GTM_ThreadInfo *)argp;
+	int qtype;
+	StringInfoData input_message;
+	sigjmp_buf  local_sigjmp_buf;
+
+	elog(DEBUG3, "Starting the connection helper thread");
+	
+
+	/*
+	 * Create the memory context we will use in the main loop.
+	 *
+	 * MessageContext is reset once per iteration of the main loop, ie, upon
+	 * completion of processing of each command message from the client.
+	 *
+	 * This context is thread-specific
+	 */
+	MessageContext = AllocSetContextCreate(TopMemoryContext,
+										   "MessageContext",
+										   ALLOCSET_DEFAULT_MINSIZE,
+										   ALLOCSET_DEFAULT_INITSIZE,
+										   ALLOCSET_DEFAULT_MAXSIZE,
+										   false);
+	
+
+	{
+		/*
+		 * We expect a startup message at the very start: the type byte 'A',
+		 * followed by a message whose body is a GTM_StartupPacket (the
+		 * coordinator ID and the is-proxy flag are taken from it below).
+		 *
+		 * NOTE(review): the exception stack is only set up further down, so
+		 * the ereport(ERROR) calls in this block run without this function's
+		 * recovery point -- confirm that this is intended.
+		 */
+		char startup_type;
+		GTM_StartupPacket sp;
+		StringInfoData inBuf;
+
+		startup_type = pq_getbyte(thrinfo->thr_conn->con_port);
+
+		if (startup_type != 'A')
+			ereport(ERROR,
+					(EPROTO,
+					 errmsg("Expecting a startup message, but received %c",
+						 startup_type)));
+
+		initStringInfo(&inBuf);
+		
+		/*
+		 * All frontend messages have a length word next
+		 * after the type code; we can read the message contents independently of
+		 * the type.
+		 */
+		if (pq_getmessage(thrinfo->thr_conn->con_port, &inBuf, 0))
+			ereport(ERROR,
+					(EPROTO,
+					 errmsg("Expecting coordinator ID, but received EOF")));
+
+		/* the packet is copied out of the message buffer as raw bytes */
+		memcpy(&sp,
+			   pq_getmsgbytes(&inBuf, sizeof (GTM_StartupPacket)),
+			   sizeof (GTM_StartupPacket));
+		pq_getmsgend(&inBuf);
+
+		GTM_RegisterCoordinator(thrinfo->thr_conn->con_port, sp.sp_cid);
+		thrinfo->thr_conn->con_port->is_proxy = sp.sp_isproxy;
+	}
+
+	{
+		/*
+		 * Send a dummy authentication request message 'R' as the client
+		 * expects that in the current protocol
+		 */
+		StringInfoData buf;
+		pq_beginmessage(&buf, 'R');
+		pq_endmessage(thrinfo->thr_conn->con_port, &buf);
+		pq_flush(thrinfo->thr_conn->con_port);
+
+		elog(DEBUG3, "Sent connection authentication message to the client");
+	}
+
+	/*
+	 * Initialise input_message once, outside the loop, so that we don't need
+	 * to free/palloc it for every incoming message. Unlike Postgres, we don't
+	 * expect the incoming messages to be of arbitrary sizes
+	 */
+
+	initStringInfo(&input_message);
+
+	/*
+	 * The main processing loop of the thread begins here
+	 *
+	 * If an exception is encountered, processing resumes here: the error is
+	 * reported and the loop below is entered again.
+	 *
+	 * You might wonder why this isn't coded as an infinite loop around a
+	 * PG_TRY construct.  The reason is that this is the bottom of the
+	 * exception stack, and so with PG_TRY there would be no exception handler
+	 * in force at all during the CATCH part.  By leaving the outermost setjmp
+	 * always active, we have at least some chance of recovering from an error
+	 * during error recovery.  (If we get into an infinite loop thereby, it
+	 * will soon be stopped by overflow of elog.c's internal state stack.)
+	 */
+
+	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
+	{
+		/*
+		 * NOTE: only stuff that is guaranteed to apply *only* for
+		 * outer-level error recovery belongs in this if-block, such as
+		 * reporting the error and resetting the error state.
+		 */
+		
+		/* Report the error to the client and/or server log */
+		if (thrinfo->thr_conn)
+			EmitErrorReport(thrinfo->thr_conn->con_port);
+		else
+			EmitErrorReport(NULL);
+
+		/*
+		 * Now return to normal top-level context and clear ErrorContext for
+		 * next time.
+		 */
+		MemoryContextSwitchTo(TopMemoryContext);
+		FlushErrorState();
+	}
+
+	/* We can now handle ereport(ERROR) */
+	PG_exception_stack = &local_sigjmp_buf;
+
+
+	for (;;)
+	{
+		/*
+		 * Release storage left over from the prior message cycle by
+		 * resetting MessageContext, and make it the current context.
+		 */
+		MemoryContextSwitchTo(MessageContext);
+		MemoryContextResetAndDeleteChildren(MessageContext);
+
+		/*
+		 * Just reset the input buffer to avoid repeated palloc/pfrees
+		 *
+		 * XXX We should consider resetting the MessageContext periodically to
+		 * handle any memory leaks
+		 */
+		resetStringInfo(&input_message);
+
+		/*
+		 * Read the next message (the loop blocks here)
+		 */
+		qtype = ReadCommand(thrinfo->thr_conn->con_port, &input_message);
+
+		switch(qtype)
+		{
+			case 'C':
+				ProcessCommand(thrinfo->thr_conn->con_port, &input_message);
+				break;
+			
+			case 'X':
+			case EOF:
+				/*
+				 * Connection termination request
+				 * Remove all transactions opened within the thread 
+				 */
+				GTM_RemoveAllTransInfos(-1);
+				pthread_exit(thrinfo);
+				break;
+			
+			case 'F':
+				/*
+				 * Flush all the outgoing data on the wire. Consume the message
+				 * type field for sanity
+				 */
+				pq_getmsgint(&input_message, sizeof (GTM_MessageType));
+				pq_getmsgend(&input_message);
+				pq_flush(thrinfo->thr_conn->con_port);
+				break;
+
+			default:
+				/*
+				 * Remove all transactions opened within the thread 
+				 */
+				GTM_RemoveAllTransInfos(-1);
+
+				ereport(FATAL,
+						(EPROTO,
+						 errmsg("invalid frontend message type %d",
+								qtype)));
+				break;
+		}
+		
+	}
+
+	/* can't get here because the above loop never exits */
+	Assert(false);
+
+	return thrinfo;
+}
+
+/*
+ * ProcessCommand -- decode one 'C' message and route it to its handler.
+ *
+ * For a proxy connection the message starts with a GTM_ProxyMsgHeader, whose
+ * connection id is stored in myport->conn_id; for a direct connection the id
+ * is InvalidGTMProxyConnID.  The message type that follows selects the
+ * handler group.  An unknown type is reported as FATAL.
+ */
+void
+ProcessCommand(Port *myport, StringInfo input_message)
+{
+	GTM_ProxyMsgHeader hdr;
+	GTM_MessageType msgtype;
+
+	if (!myport->is_proxy)
+		hdr.ph_conid = InvalidGTMProxyConnID;
+	else
+		pq_copymsgbytes(input_message, (char *) &hdr,
+						sizeof (GTM_ProxyMsgHeader));
+	myport->conn_id = hdr.ph_conid;
+
+	msgtype = pq_getmsgint(input_message, sizeof (GTM_MessageType));
+
+	switch (msgtype)
+	{
+		/* coordinator registration */
+		case MSG_UNREGISTER_COORD:
+			ProcessCoordinatorCommand(myport, msgtype, input_message);
+			break;
+
+		/* transaction management */
+		case MSG_TXN_BEGIN:
+		case MSG_TXN_BEGIN_GETGXID:
+		case MSG_TXN_BEGIN_GETGXID_AUTOVACUUM:
+		case MSG_TXN_BEGIN_GETGXID_MULTI:
+		case MSG_TXN_GET_GXID:
+		case MSG_TXN_PREPARE:
+		case MSG_TXN_COMMIT:
+		case MSG_TXN_COMMIT_MULTI:
+		case MSG_TXN_ROLLBACK:
+		case MSG_TXN_ROLLBACK_MULTI:
+			ProcessTransactionCommand(myport, msgtype, input_message);
+			break;
+
+		/* snapshots */
+		case MSG_SNAPSHOT_GET:
+		case MSG_SNAPSHOT_GET_MULTI:
+		case MSG_SNAPSHOT_GXID_GET:
+			ProcessSnapshotCommand(myport, msgtype, input_message);
+			break;
+
+		/* sequences */
+		case MSG_SEQUENCE_INIT:
+		case MSG_SEQUENCE_GET_CURRENT:
+		case MSG_SEQUENCE_GET_NEXT:
+		case MSG_SEQUENCE_RESET:
+		case MSG_SEQUENCE_CLOSE:
+			ProcessSeqeunceCommand(myport, msgtype, input_message);
+			break;
+
+		/* status queries */
+		case MSG_TXN_GET_STATUS:
+		case MSG_TXN_GET_ALL_PREPARED:
+			ProcessQueryCommand(myport, msgtype, input_message);
+			break;
+
+		/* a backend went away: drop its transactions */
+		case MSG_BACKEND_DISCONNECT:
+			GTM_RemoveAllTransInfos(hdr.ph_conid);
+			break;
+
+		default:
+			ereport(FATAL,
+					(EPROTO,
+					 errmsg("invalid frontend message type %d",
+							msgtype)));
+	}
+}
+
+/*
+ * GTMAddConnection -- start a helper thread for a freshly accepted Port.
+ *
+ * Allocates a GTM_ConnectionInfo, attaches port to it and asks
+ * GTM_ThreadCreate() to run GTM_ThreadMain() for it.
+ *
+ * Returns STATUS_OK when the thread was created, STATUS_ERROR otherwise.
+ *
+ * NOTE(review): on thread-creation failure the connection record allocated
+ * here is not released in this function -- confirm who owns it.
+ */
+static int
+GTMAddConnection(Port *port)
+{
+	GTM_ConnectionInfo *newconn;
+
+	newconn = (GTM_ConnectionInfo *) palloc(sizeof (GTM_ConnectionInfo));
+	if (!newconn)
+	{
+		ereport(ERROR,
+				(ENOMEM,
+				 errmsg("Out of memory")));
+		return STATUS_ERROR;
+	}
+
+	elog(DEBUG3, "Started new connection");
+	newconn->con_port = port;
+
+	/* hand the connection over to its own thread */
+	if (GTM_ThreadCreate(newconn, GTM_ThreadMain) != NULL)
+		return STATUS_OK;
+
+	elog(ERROR, "failed to create a new thread");
+	return STATUS_ERROR;
+}
+
+/* ----------------
+ *		ReadCommand reads one message from the connection myport,
+ *		places its body in inBuf, and returns the message type code
+ *		(first byte of the message).
+ *		EOF is returned if the connection is closed, either before
+ *		the type byte or while the body is being read.
+ * ----------------
+ */
+static int
+ReadCommand(Port *myport, StringInfo inBuf)
+{
+	/* the type code comes first */
+	int			msgcode = pq_getbyte(myport);
+
+	if (msgcode == EOF)
+	{
+		/* frontend disconnected */
+		ereport(COMMERROR,
+				(EPROTO,
+				 errmsg("unexpected EOF on client connection")));
+		return EOF;
+	}
+
+	/*
+	 * Check the type code before touching the body: if message boundary sync
+	 * has been lost, it is better to complain about an unknown message than
+	 * to interpret garbage as a length word.  Only 'C', 'X' and 'F' are
+	 * accepted.
+	 */
+	if (msgcode != 'C' && msgcode != 'X' && msgcode != 'F')
+		ereport(ERROR,
+				(EPROTO,
+				 errmsg("invalid frontend message type %d", msgcode)));
+
+	/*
+	 * Every message carries a length word after the type code, so the body
+	 * can be read without knowing the type.  On failure the message has
+	 * already been logged.
+	 */
+	return pq_getmessage(myport, inBuf, 0) ? EOF : msgcode;
+}
+
+/*
+ * ProcessCoordinatorCommand -- handle coordinator (un)registration messages.
+ *
+ * Reads the coordinator ID from the message and, for MSG_UNREGISTER_COORD,
+ * passes it to GTM_UnregisterCoordinator().  Any other type is a caller bug.
+ */
+static void
+ProcessCoordinatorCommand(Port *myport, GTM_MessageType mtype, StringInfo message)
+{
+	GTM_CoordinatorId coord_id = pq_getmsgint(message, sizeof (GTM_CoordinatorId));
+
+	if (mtype == MSG_UNREGISTER_COORD)
+	{
+		GTM_UnregisterCoordinator(myport, coord_id);
+	}
+	else
+	{
+		Assert(0);				/* the dispatcher never sends anything else */
+	}
+
+	/* nothing may follow the coordinator ID */
+	pq_getmsgend(message);
+}
+
+/*
+ * ProcessTransactionCommand -- route a transaction message to its handler.
+ *
+ * mtype must be one of the MSG_TXN_* types listed below; each one maps to
+ * exactly one handler, which receives the port and the remaining message.
+ */
+static void
+ProcessTransactionCommand(Port *myport, GTM_MessageType mtype, StringInfo message)
+{
+	elog(DEBUG1, "ProcessTransactionCommand: mtype:%d", mtype);
+
+	switch (mtype)
+	{
+		/* starting transactions / obtaining GXIDs */
+		case MSG_TXN_BEGIN:
+			ProcessBeginTransactionCommand(myport, message);
+			break;
+		case MSG_TXN_BEGIN_GETGXID:
+			ProcessBeginTransactionGetGXIDCommand(myport, message);
+			break;
+		case MSG_TXN_BEGIN_GETGXID_AUTOVACUUM:
+			ProcessBeginTransactionGetGXIDAutovacuumCommand(myport, message);
+			break;
+		case MSG_TXN_BEGIN_GETGXID_MULTI:
+			ProcessBeginTransactionGetGXIDCommandMulti(myport, message);
+			break;
+		case MSG_TXN_GET_GXID:
+			ProcessGetGXIDTransactionCommand(myport, message);
+			break;
+
+		/* finishing transactions */
+		case MSG_TXN_PREPARE:
+			ProcessPrepareTransactionCommand(myport, message);
+			break;
+		case MSG_TXN_COMMIT:
+			ProcessCommitTransactionCommand(myport, message);
+			break;
+		case MSG_TXN_COMMIT_MULTI:
+			ProcessCommitTransactionCommandMulti(myport, message);
+			break;
+		case MSG_TXN_ROLLBACK:
+			ProcessRollbackTransactionCommand(myport, message);
+			break;
+		case MSG_TXN_ROLLBACK_MULTI:
+			ProcessRollbackTransactionCommandMulti(myport, message);
+			break;
+
+		default:
+			Assert(0);			/* unreachable; keeps the compiler quiet */
+	}
+}
+
+/*
+ * ProcessSnapshotCommand -- route a snapshot message to its handler.
+ *
+ * MSG_SNAPSHOT_GET and MSG_SNAPSHOT_GXID_GET share one handler and differ
+ * only in its last argument (false and true respectively);
+ * MSG_SNAPSHOT_GET_MULTI has a handler of its own.
+ */
+static void
+ProcessSnapshotCommand(Port *myport, GTM_MessageType mtype, StringInfo message)
+{
+	if (mtype == MSG_SNAPSHOT_GET_MULTI)
+	{
+		ProcessGetSnapshotCommandMulti(myport, message);
+	}
+	else if (mtype == MSG_SNAPSHOT_GET)
+	{
+		ProcessGetSnapshotCommand(myport, message, false);
+	}
+	else if (mtype == MSG_SNAPSHOT_GXID_GET)
+	{
+		ProcessGetSnapshotCommand(myport, message, true);
+	}
+	else
+	{
+		Assert(0);				/* unreachable; keeps the compiler quiet */
+	}
+}
+
+/*
+ * ProcessSeqeunceCommand -- route a sequence message to its handler.
+ *
+ * mtype must be one of the MSG_SEQUENCE_* types listed below.
+ */
+static void
+ProcessSeqeunceCommand(Port *myport, GTM_MessageType mtype, StringInfo message)
+{
+	switch (mtype)
+	{
+		/* life cycle of a sequence */
+		case MSG_SEQUENCE_INIT:
+			ProcessSequenceInitCommand(myport, message);
+			break;
+		case MSG_SEQUENCE_RESET:
+			ProcessSequenceResetCommand(myport, message);
+			break;
+		case MSG_SEQUENCE_CLOSE:
+			ProcessSequenceCloseCommand(myport, message);
+			break;
+
+		/* reading values */
+		case MSG_SEQUENCE_GET_CURRENT:
+			ProcessSequenceGetCurrentCommand(myport, message);
+			break;
+		case MSG_SEQUENCE_GET_NEXT:
+			ProcessSequenceGetNextCommand(myport, message);
+			break;
+
+		default:
+			Assert(0);			/* unreachable; keeps the compiler quiet */
+	}
+}
+
+/*
+ * ProcessQueryCommand -- placeholder for the status query messages.
+ *
+ * MSG_TXN_GET_STATUS and MSG_TXN_GET_ALL_PREPARED are accepted but nothing
+ * is done for them here; any other type is a caller bug.
+ */
+static void
+ProcessQueryCommand(Port *myport, GTM_MessageType mtype, StringInfo message)
+{
+	if (mtype != MSG_TXN_GET_STATUS && mtype != MSG_TXN_GET_ALL_PREPARED)
+	{
+		Assert(0);				/* unreachable; keeps the compiler quiet */
+	}
+}
+
+/*
+ * GTM_RegisterCoordinator -- remember which coordinator owns a connection.
+ *
+ * Stores cid in myport->coordinator_id and logs it at DEBUG3.
+ */
+static void
+GTM_RegisterCoordinator(Port *myport, GTM_CoordinatorId cid)
+{
+	myport->coordinator_id = cid;
+	elog(DEBUG3, "Registering coordinator with cid %d", cid);
+}
+
+
+/*
+ * GTM_UnregisterCoordinator -- stub; currently does nothing.
+ *
+ * The intention is to do a clean shutdown for coordinator cid here, but no
+ * work is implemented yet and both arguments are ignored.
+ */
+static void
+GTM_UnregisterCoordinator(Port *myport, GTM_CoordinatorId cid)
+{
+}
+
+/*
+ * checkDataDir -- validate the proposed data directory (GTMDataDir)
+ *
+ * A missing directory is created with mode 0700.  Any failure is reported
+ * as FATAL: the directory cannot be created or inspected, the path is not a
+ * directory, or (outside Windows/Cygwin) it is not owned by the effective
+ * user.
+ */
+static void
+checkDataDir(void)
+{
+	struct stat dirinfo;
+
+	Assert(GTMDataDir);
+
+	/* stat the directory, creating it first if it does not exist yet */
+	for (;;)
+	{
+		if (stat(GTMDataDir, &dirinfo) == 0)
+			break;
+
+		if (errno != ENOENT)
+		{
+			ereport(FATAL,
+					(EPERM,
+				 errmsg("could not read permissions of directory \"%s\": %m",
+						GTMDataDir)));
+			break;
+		}
+
+		if (mkdir(GTMDataDir, 0700) != 0)
+		{
+			ereport(FATAL,
+					(errno,
+					 errmsg("failed to create the directory \"%s\"",
+						 GTMDataDir)));
+		}
+		/* created: go round again to pick up its stat data */
+	}
+
+	/* a later chdir would fail anyway, but complain early and clearly */
+	if (!S_ISDIR(dirinfo.st_mode))
+		ereport(FATAL,
+				(EINVAL,
+				 errmsg("specified data directory \"%s\" is not a directory",
+						GTMDataDir)));
+
+	/*
+	 * Reject a directory that does not belong to our userid.
+	 *
+	 * The lock file logic in CreateLockFile() relies on this ownership
+	 * check to keep two servers out of the same directory, so it must not
+	 * be removed or weakened.
+	 *
+	 * XXX can we safely enable this check on Windows?
+	 */
+#if !defined(WIN32) && !defined(__CYGWIN__)
+	if (stat_buf_owner_mismatch(dirinfo))
+		ereport(FATAL,
+				(EINVAL,
+				 errmsg("data directory \"%s\" has wrong ownership",
+						GTMDataDir),
+				 errhint("The server must be started by the user that owns the data directory.")));
+#endif
+}
+
+/*
+ * ChangeToDataDir -- make GTMDataDir the current working directory.
+ *
+ * Once this has run, files in the data directory can be reached through
+ * relative paths.  A failing chdir() is reported as FATAL.
+ */
+static void
+ChangeToDataDir(void)
+{
+	if (chdir(GTMDataDir) >= 0)
+		return;
+
+	ereport(FATAL,
+			(EINVAL,
+			 errmsg("could not change directory to \"%s\": %m",
+					GTMDataDir)));
+}
+
+/*
+ * Create the data directory lockfile.
+ *
+ * The lockfile name GTM_PID_FILE is passed to CreateLockFile() as is, with
+ * GTMDataDir as the name shown in its error hint.
+ *
+ * When this is called, we must have already switched the working
+ * directory to DataDir, so we can just use a relative path.  This
+ * helps ensure that we are locking the directory we should be.
+ */
+static void
+CreateDataDirLockFile()
+{
+	CreateLockFile(GTM_PID_FILE, GTMDataDir);
+}
+
+/*
+ * Create a lockfile.
+ *
+ * filename is the name of the lockfile to create.
+ * refName is only used in the hint of the "already exists" error message.
+ *
+ * On success the file contains our PID on the first line and GTMDataDir on
+ * the second.  If the file already exists and names a process that appears
+ * to be alive, or on any I/O failure, the error is reported as FATAL.
+ */
+static void
+CreateLockFile(const char *filename, const char *refName)
+{
+	int			fd;
+	char		buffer[MAXPGPATH + 100];
+	int			ntries;
+	int			len;
+	int			encoded_pid;
+	pid_t		other_pid;
+	pid_t		my_pid = getpid();
+
+	/*
+	 * We need a loop here because of race conditions.	But don't loop forever
+	 * (for example, a non-writable $PGDATA directory might cause a failure
+	 * that won't go away).  100 tries seems like plenty.
+	 */
+	for (ntries = 0;; ntries++)
+	{
+		/*
+		 * Try to create the lock file --- O_EXCL makes this atomic.
+		 *
+		 * Think not to make the file protection weaker than 0600.	See
+		 * comments below.
+		 */
+		fd = open(filename, O_RDWR | O_CREAT | O_EXCL, 0600);
+		if (fd >= 0)
+			break;				/* Success; exit the retry loop */
+
+		/*
+		 * Couldn't create the pid file. Probably it already exists.
+		 */
+		if ((errno != EEXIST && errno != EACCES) || ntries > 100)
+			ereport(FATAL,
+					(EINVAL,
+					 errmsg("could not create lock file \"%s\": %m",
+							filename)));
+
+		/*
+		 * Read the file to get the old owner's PID.  Note race condition
+		 * here: file might have been deleted since we tried to create it.
+		 */
+		fd = open(filename, O_RDONLY, 0600);
+		if (fd < 0)
+		{
+			if (errno == ENOENT)
+				continue;		/* race condition; try again */
+			ereport(FATAL,
+					(EINVAL,
+					 errmsg("could not open lock file \"%s\": %m",
+							filename)));
+		}
+		if ((len = read(fd, buffer, sizeof(buffer) - 1)) < 0)
+			ereport(FATAL,
+					(EINVAL,
+					 errmsg("could not read lock file \"%s\": %m",
+							filename)));
+		close(fd);
+
+		/* terminate what was read; the PID is the leading number */
+		buffer[len] = '\0';
+		encoded_pid = atoi(buffer);
+		other_pid = (pid_t) encoded_pid;
+
+		if (other_pid <= 0)
+			elog(FATAL, "bogus data in lock file \"%s\": \"%s\"",
+				 filename, buffer);
+
+		/*
+		 * Check to see if the other process still exists
+		 *
+		 * If the PID in the lockfile is our own PID or our parent's PID, then
+		 * the file must be stale (probably left over from a previous system
+		 * boot cycle).  We need this test because of the likelihood that a
+		 * reboot will assign exactly the same PID as we had in the previous
+		 * reboot.	Also, if there is just one more process launch in this
+		 * reboot than in the previous one, the lockfile might mention our
+		 * parent's PID.  We can reject that since we'd never be launched
+		 * directly by a competing postmaster.	We can't detect grandparent
+		 * processes unfortunately, but if the init script is written
+		 * carefully then all but the immediate parent shell will be
+		 * root-owned processes and so the kill test will fail with EPERM.
+		 *
+		 * We can treat the EPERM-error case as okay because that error
+		 * implies that the existing process has a different userid than we
+		 * do, which means it cannot be a competing postmaster.  A postmaster
+		 * cannot successfully attach to a data directory owned by a userid
+		 * other than its own.	(This is now checked directly in
+		 * checkDataDir(), but has been true for a long time because of the
+		 * restriction that the data directory isn't group- or
+		 * world-accessible.)  Also, since we create the lockfiles mode 600,
+		 * we'd have failed above if the lockfile belonged to another userid
+		 * --- which means that whatever process kill() is reporting about
+		 * isn't the one that made the lockfile.  (NOTE: this last
+		 * consideration is the only one that keeps us from blowing away a
+		 * Unix socket file belonging to an instance of Postgres being run by
+		 * someone else, at least on machines where /tmp hasn't got a
+		 * stickybit.)
+		 *
+		 * Windows hasn't got getppid(), but doesn't need it since it's not
+		 * using real kill() either...
+		 *
+		 * Normally kill() will fail with ESRCH if the given PID doesn't
+		 * exist.
+		 */
+		if (other_pid != my_pid
+#ifndef WIN32
+			&& other_pid != getppid()
+#endif
+			)
+		{
+			if (kill(other_pid, 0) == 0 ||
+				(errno != ESRCH && errno != EPERM))
+			{
+				/* lockfile belongs to a live process */
+				ereport(FATAL,
+						(EINVAL,
+						 errmsg("lock file \"%s\" already exists",
+								filename),
+						  errhint("Is another GTM (PID %d) running in data directory \"%s\"?",
+								  (int) other_pid, refName)));
+			}
+		}
+
+		/*
+		 * Looks like nobody's home.  Unlink the file and try again to create
+		 * it.	Need a loop because of possible race condition against other
+		 * would-be creators.
+		 */
+		if (unlink(filename) < 0)
+			ereport(FATAL,
+					(EACCES,
+					 errmsg("could not remove old lock file \"%s\": %m",
+							filename),
+					 errhint("The file seems accidentally left over, but "
+						   "it could not be removed. Please remove the file "
+							 "by hand and try again.")));
+	}
+
+	/*
+	 * Successfully created the file, now fill it: our PID on the first line,
+	 * the data directory on the second.
+	 */
+	snprintf(buffer, sizeof(buffer), "%d\n%s\n",
+			 (int) my_pid, GTMDataDir);
+	errno = 0;
+	if (write(fd, buffer, strlen(buffer)) != strlen(buffer))
+	{
+		int			save_errno = errno;
+
+		/* don't leave a half-written lockfile behind */
+		close(fd);
+		unlink(filename);
+		/* if write didn't set errno, assume problem is no disk space */
+		errno = save_errno ? save_errno : ENOSPC;
+		ereport(FATAL,
+				(EACCES,
+				 errmsg("could not write lock file \"%s\": %m", filename)));
+	}
+	if (close(fd))
+	{
+		int			save_errno = errno;
+
+		/* unlink() may change errno, so restore it for the %m below */
+		unlink(filename);
+		errno = save_errno;
+		ereport(FATAL,
+				(EACCES,
+				 errmsg("could not write lock file \"%s\": %m", filename)));
+	}
+}
+
+/*
+ * CreateOptsFile -- record the command-line arguments in "gtm.opts".
+ *
+ * Every argument after argv[0] is written on a single line, each preceded by
+ * a space and wrapped in double quotes.  The file is created relative to the
+ * current working directory.
+ *
+ * Returns true on success; on failure a LOG message is emitted and false is
+ * returned.
+ */
+static bool
+CreateOptsFile(int argc, char *argv[])
+{
+	FILE	   *optsfp;
+	int			argno;
+
+#define OPTS_FILE	"gtm.opts"
+
+	optsfp = fopen(OPTS_FILE, "w");
+	if (optsfp == NULL)
+	{
+		elog(LOG, "could not create file \"%s\": %m", OPTS_FILE);
+		return false;
+	}
+
+	argno = 1;
+	while (argno < argc)
+	{
+		fprintf(optsfp, " \"%s\"", argv[argno]);
+		argno++;
+	}
+	fputs("\n", optsfp);
+
+	/* buffered write errors only show up when the stream is closed */
+	if (fclose(optsfp) != 0)
+	{
+		elog(LOG, "could not write file \"%s\": %m", OPTS_FILE);
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * DeleteLockFile -- remove the pid/lock file named filename.
+ *
+ * A failing unlink() is reported as FATAL.
+ */
+static void
+DeleteLockFile(const char *filename)
+{
+	if (unlink(filename) >= 0)
+		return;
+
+	ereport(FATAL,
+			(EACCES,
+			 errmsg("could not remove old lock file \"%s\": %m",
+					filename),
+			 errhint("The file seems accidentally left over, but "
+					 "it could not be removed. Please remove the file "
+					 "by hand and try again.")));
+}
diff --git a/src/gtm/path/Makefile b/src/gtm/path/Makefile
new file mode 100644
index 0000000000..802ae3b9f9
--- /dev/null
+++ b/src/gtm/path/Makefile
@@ -0,0 +1,21 @@
+# Builds the gtmpath library from path.o through the all-lib target of
+# Makefile.shlib.
+top_build_dir=../..
+include $(top_build_dir)/gtm/Makefile.global
+
+NAME=gtmpath
+SO_MAJOR_VERSION= 1
+SO_MINOR_VERSION= 0
+
+OBJS=path.o
+
+.PHONY: all clean distclean maintainer-clean
+
+all:all-lib
+
+include $(top_build_dir)/Makefile.shlib
+
+# Remove the objects and every flavour of the library.  The static archive
+# is included because the proxy links against ../path/libgtmpath.a; the
+# names are derived from NAME and the SO_* versions so they cannot drift.
+clean:
+	rm -f $(OBJS)
+	rm -f lib$(NAME).a lib$(NAME).so lib$(NAME).so.$(SO_MAJOR_VERSION) lib$(NAME).so.$(SO_MAJOR_VERSION).$(SO_MINOR_VERSION)
+
+distclean: clean
+
+maintainer-clean: distclean
+
diff --git a/src/gtm/path/path.c b/src/gtm/path/path.c
new file mode 100644
index 0000000000..ea0eb6dbf2
--- /dev/null
+++ b/src/gtm/path/path.c
@@ -0,0 +1,177 @@
+/*-------------------------------------------------------------------------
+ *
+ * path.c
+ *	  portable path handling routines
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "gtm/gtm_c.h"
+
+#include <ctype.h>
+#include <sys/stat.h>
+#include <string.h>
+#include <stdio.h>
+
+#include <gtm/path.h>
+
+#define IS_DIR_SEP(ch)	((ch) == '/' || (ch) == '\\')
+
+#define skip_drive(path)	(path)
+
+static void trim_directory(char *path);
+static void trim_trailing_separator(char *path);
+
+/*
+ *	Clean up path by:
+ *		o  remove trailing slash
+ *		o  remove duplicate adjacent separators
+ *		o  remove trailing '.'
+ *		o  process trailing '..' ourselves
+ *
+ *	The path is modified in place; nothing is returned.  Trailing ".."
+ *	components that cannot be cancelled against a directory name are put
+ *	back with strcat().
+ *	NOTE(review): presumably the result never outgrows the input, so the
+ *	caller's buffer suffices -- confirm.
+ */
+void
+canonicalize_path(char *path)
+{
+	char	   *p,
+			   *to_p;
+	char	   *spath;
+	bool		was_sep = false;
+	int			pending_strips;
+
+	/*
+	 * Removing the trailing slash on a path means we never get ugly double
+	 * trailing slashes. Also, Win32 can't stat() a directory with a trailing
+	 * slash. Don't remove a leading slash, though.
+	 */
+	trim_trailing_separator(path);
+
+	/*
+	 * Remove duplicate adjacent separators.  p reads ahead while to_p writes
+	 * the compacted result into the same buffer.
+	 */
+	p = path;
+
+	to_p = p;
+	for (; *p; p++, to_p++)
+	{
+		/* Handle many adjacent slashes, like "/a///b" */
+		while (*p == '/' && was_sep)
+			p++;
+		if (to_p != p)
+			*to_p = *p;
+		was_sep = (*p == '/');
+	}
+	*to_p = '\0';
+
+	/*
+	 * Remove any trailing uses of "." and process ".." ourselves
+	 *
+	 * Note that "/../.." should reduce to just "/", while "../.." has to be
+	 * kept as-is.	In the latter case we put back mistakenly trimmed ".."
+	 * components below.  Also note that we want a Windows drive spec to be
+	 * visible to trim_directory(), but it's not part of the logic that's
+	 * looking at the name components; hence distinction between path and
+	 * spath.  (skip_drive() is the identity in this file, so the two are the
+	 * same pointer here.)
+	 */
+	spath = skip_drive(path);
+	pending_strips = 0;
+	for (;;)
+	{
+		int			len = strlen(spath);
+
+		if (len >= 2 && strcmp(spath + len - 2, "/.") == 0)
+			trim_directory(path);
+		else if (strcmp(spath, ".") == 0)
+		{
+			/* Want to leave "." alone, but "./.." has to become ".." */
+			if (pending_strips > 0)
+				*spath = '\0';
+			break;
+		}
+		else if ((len >= 3 && strcmp(spath + len - 3, "/..") == 0) ||
+				 strcmp(spath, "..") == 0)
+		{
+			/* drop the ".." now and remember it still has to eat a name */
+			trim_directory(path);
+			pending_strips++;
+		}
+		else if (pending_strips > 0 && *spath != '\0')
+		{
+			/* trim a regular directory name cancelled by ".." */
+			trim_directory(path);
+			pending_strips--;
+			/* foo/.. should become ".", not empty */
+			if (*spath == '\0')
+				strcpy(spath, ".");
+		}
+		else
+			break;
+	}
+
+	if (pending_strips > 0)
+	{
+		/*
+		 * We could only get here if path is now totally empty (other than a
+		 * possible drive specifier on Windows). We have to put back one or
+		 * more ".."'s that we took off.
+		 */
+		while (--pending_strips > 0)
+			strcat(path, "../");
+		strcat(path, "..");
+	}
+}
+
+/*
+ *	trim_directory
+ *
+ *	Cut the last component off path, in place: trailing separators, the
+ *	final name and the separator(s) in front of it are removed.  A
+ *	separator in first position is always kept.  An empty path is left
+ *	untouched.
+ */
+static void
+trim_directory(char *path)
+{
+	char	   *cut;
+
+	path = skip_drive(path);
+
+	if (*path == '\0')
+		return;
+
+	cut = path + strlen(path) - 1;
+
+	/* step back over any separators that end the path */
+	while (IS_DIR_SEP(*cut) && cut > path)
+		cut--;
+	/* step back over the characters of the final component */
+	while (!IS_DIR_SEP(*cut) && cut > path)
+		cut--;
+	/* swallow a run of separators in front of that component */
+	while (cut > path && IS_DIR_SEP(*(cut - 1)))
+		cut--;
+	/* a separator in first position must survive */
+	if (cut == path && IS_DIR_SEP(*cut))
+		cut++;
+	*cut = '\0';
+}
+
+/*
+ *	trim_trailing_separator
+ *
+ * Overwrite every separator at the end of path with a terminator, in place.
+ * The first character is never touched, so a leading slash survives.
+ */
+static void
+trim_trailing_separator(char *path)
+{
+	char	   *tail;
+
+	path = skip_drive(path);
+	tail = path + strlen(path);
+
+	/* nothing to do for an empty string */
+	if (tail <= path)
+		return;
+
+	tail--;
+	while (tail > path && IS_DIR_SEP(*tail))
+	{
+		*tail = '\0';
+		tail--;
+	}
+}
diff --git a/src/gtm/proxy/Makefile b/src/gtm/proxy/Makefile
new file mode 100644
index 0000000000..3ed6ccce13
--- /dev/null
+++ b/src/gtm/proxy/Makefile
@@ -0,0 +1,22 @@
+# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+#
+# Builds the gtm_proxy executable from the proxy objects and the static
+# archives found in the sibling common, libpq, client and path directories.
+
+top_build_dir=../..
+include $(top_build_dir)/gtm/Makefile.global
+
+OBJS=proxy_main.o proxy_thread.o  ../common/libgtm.a ../libpq/libpqcomm.a ../client/libgtmclient.a ../path/libgtmpath.a
+LDFLAGS=-L$(top_build_dir)/common -L$(top_build_dir)/libpq
+
+LIBS=-lpthread
+
+# None of these names is a file produced by this makefile.
+.PHONY: all clean distclean maintainer-clean
+
+# Libraries go after the objects and archives that reference them; the
+# linker resolves symbols from left to right.
+gtm_proxy:$(OBJS)
+	$(CC) $(CFLAGS) $(LDFLAGS) $^ $(LIBS) -o $@
+
+all:gtm_proxy
+
+# Remove only what this directory built; the ../*/lib*.a archives listed in
+# OBJS are left alone because no rule here can rebuild them.
+clean:
+	rm -f $(filter %.o,$(OBJS))
+	rm -f gtm_proxy
+
+distclean: clean
+
+maintainer-clean: distclean
diff --git a/src/gtm/proxy/proxy_main.c b/src/gtm/proxy/proxy_main.c
new file mode 100644
index 0000000000..75c7baf063
--- /dev/null
+++ b/src/gtm/proxy/proxy_main.c
@@ -0,0 +1,2016 @@
+/*-------------------------------------------------------------------------
+ *
+ * proxy_main.c
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <time.h>
+#include <unistd.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <sys/select.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <getopt.h>
+
+#include "gtm/gtm_c.h"
+#include "gtm/gtm_proxy.h"
+#include "gtm/elog.h"
+#include "gtm/memutils.h"
+#include "gtm/gtm_list.h"
+#include "gtm/libpq.h"
+#include "gtm/libpq-be.h"
+#include "gtm/libpq-fe.h"
+#include "gtm/pqsignal.h"
+#include "gtm/pqformat.h"
+#include "gtm/assert.h"
+#include "gtm/gtm_txn.h"
+#include "gtm/gtm_seq.h"
+#include "gtm/gtm_msg.h"
+#include "gtm/libpq-int.h"
+
+extern int	optind;
+extern char *optarg;
+
+#define GTM_MAX_PATH			1024
+#define GTM_PROXY_DEFAULT_HOSTNAME	"*"
+#define GTM_PROXY_DEFAULT_PORT		6666
+#define GTM_PROXY_DEFAULT_WORKERS	2
+#define GTM_PID_FILE			"gtm_proxy.pid"
+#define GTM_LOG_FILE			"gtm_proxy.log"
+
+static char *progname = "gtm_proxy";
+char	   *ListenAddresses;
+int			GTMProxyPortNumber;
+int			GTMProxyWorkerThreads;
+char		*GTMProxyDataDir;
+
+char		*GTMServerHost;
+int			GTMServerPortNumber;
+
+/* The socket(s) we're listening to. */
+#define MAXLISTEN	64
+static int	ListenSocket[MAXLISTEN];
+
+pthread_key_t	threadinfo_key;
+static bool		GTMProxyAbortPending = false;
+
+static Port *ConnCreate(int serverFd);
+static void ConnFree(Port *conn);
+static int ServerLoop(void);
+static int initMasks(fd_set *rmask);
+void *GTMProxy_ThreadMain(void *argp);
+static int GTMProxyAddConnection(Port *port);
+static int ReadCommand(GTMProxy_ConnectionInfo *conninfo, StringInfo inBuf);
+static void GTMProxy_HandshakeConnection(GTMProxy_ConnectionInfo *conninfo);
+static void GTMProxy_HandleDisconnect(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn);
+
+static void GTMProxy_ProxyCommand(GTMProxy_ConnectionInfo *conninfo,
+		GTM_Conn *gtm_conn, GTM_MessageType mtype, StringInfo message);
+
+static void ProcessCommand(GTMProxy_ConnectionInfo *conninfo,
+		GTM_Conn *gtm_conn, StringInfo input_message);
+static void ProcessCoordinatorCommand(GTMProxy_ConnectionInfo *conninfo,
+		GTM_Conn *gtm_conn, GTM_MessageType mtype, StringInfo message);
+static void ProcessTransactionCommand(GTMProxy_ConnectionInfo *conninfo,
+		GTM_Conn *gtm_conn, GTM_MessageType mtype, StringInfo message);
+static void ProcessSnapshotCommand(GTMProxy_ConnectionInfo *conninfo,
+	   	GTM_Conn *gtm_conn, GTM_MessageType mtype, StringInfo message);
+static void ProcessSeqeunceCommand(GTMProxy_ConnectionInfo *conninfo,
+	   	GTM_Conn *gtm_conn, GTM_MessageType mtype, StringInfo message);
+
+static void GTMProxy_RegisterCoordinator(GTMProxy_ConnectionInfo *conninfo,
+		GTM_CoordinatorId coordinator_id);
+static void GTMProxy_UnregisterCoordinator(GTMProxy_ConnectionInfo *conninfo,
+		GTM_CoordinatorId coordinator_id);
+
+static void ProcessResponse(GTMProxy_ThreadInfo *thrinfo,
+		GTMProxy_CommandInfo *cmdinfo, GTM_Result *res);
+
+static void GTMProxy_ProcessPendingCommands(GTMProxy_ThreadInfo *thrinfo);
+static void GTMProxy_CommandPending(GTMProxy_ConnectionInfo *conninfo,
+		GTM_MessageType mtype, GTMProxy_CommandData cmd_data);
+
+static bool CreateOptsFile(int argc, char *argv[]);
+static void CreateDataDirLockFile(void);
+static void CreateLockFile(const char *filename, const char *refName);
+static void ChangeToDataDir(void);
+static void checkDataDir(void);
+static void DeleteLockFile(const char *filename);
+
+/*
+ * One-time initialization. It's called immediately after the main process
+ * starts, before memory context management is available.
+ *
+ * Creates the thread-specific data key, initializes the lock protecting the
+ * global thread table, allocates the main thread's GTMProxy_ThreadInfo with
+ * malloc and passes it to SetMyThreadInfo().
+ *
+ * Returns the newly allocated thread info.  Allocation failure terminates
+ * the process; a failure reported by SetMyThreadInfo() is only logged.
+ */
+static GTMProxy_ThreadInfo *
+MainThreadInit()
+{
+	GTMProxy_ThreadInfo *thrinfo;
+
+	pthread_key_create(&threadinfo_key, NULL);
+
+	/*
+	 * Initialize the lock protecting the global threads info
+	 */
+	GTM_RWLockInit(&GTMProxyThreads->gt_lock);
+
+	/*
+	 * We are called even before memory context management is setup. We must
+	 * use malloc
+	 */
+	thrinfo = (GTMProxy_ThreadInfo *)malloc(sizeof (GTMProxy_ThreadInfo));
+
+	if (thrinfo == NULL)
+	{
+		/*
+		 * Nothing useful can be done without the main thread's info: going
+		 * on would hand a NULL pointer to SetMyThreadInfo() and to the
+		 * caller, so give up here.
+		 */
+		fprintf(stderr, "malloc failed: %d\n", errno);
+		fflush(stdout);
+		fflush(stderr);
+		exit(1);
+	}
+
+	if (SetMyThreadInfo(thrinfo))
+	{
+		fprintf(stderr, "SetMyThreadInfo failed: %d", errno);
+		fflush(stdout);
+		fflush(stderr);
+	}
+
+	return thrinfo;
+}
+
+/*
+ * Basic process setup, run by main() once the command line is parsed.
+ *
+ * Initializes the main thread's info and the memory context machinery,
+ * validates and enters the data directory, creates its lock file, derives
+ * the default log file name when none was given with -l, opens the log, and
+ * finally adds the main thread to the global thread array.
+ *
+ * Exits with status 1 if the default log file name cannot be built.
+ */
+static void
+BaseInit()
+{
+	GTMProxy_ThreadInfo *thrinfo;
+
+	thrinfo = MainThreadInit();
+
+	MyThreadID = pthread_self();
+
+	MemoryContextInit();
+
+	checkDataDir();
+	ChangeToDataDir();
+	CreateDataDirLockFile();
+
+	if (GTMLogFile == NULL)
+	{
+		int			needed;
+
+		GTMLogFile = (char *) malloc(GTM_MAX_PATH);
+		if (GTMLogFile == NULL)
+		{
+			fprintf(stderr, "malloc failed: %d\n", errno);
+			fflush(stdout);
+			fflush(stderr);
+			exit(1);
+		}
+
+		/*
+		 * The data directory comes from the command line, so its length is
+		 * not bounded by GTM_MAX_PATH; refuse to run rather than overflow
+		 * the buffer or log to a truncated path.
+		 */
+		needed = snprintf(GTMLogFile, GTM_MAX_PATH, "%s/%s",
+						  GTMProxyDataDir, GTM_LOG_FILE);
+		if (needed < 0 || needed >= GTM_MAX_PATH)
+		{
+			fprintf(stderr, "log file path is too long\n");
+			fflush(stdout);
+			fflush(stderr);
+			exit(1);
+		}
+	}
+
+	DebugFileOpen();
+
+	/*
+	 * The memory context is now set up.
+	 * Add the thrinfo structure in the global array
+	 */
+	if (GTMProxy_ThreadAdd(thrinfo) == -1)
+	{
+		fprintf(stderr, "GTMProxy_ThreadAdd for main thread failed: %d", errno);
+		fflush(stdout);
+		fflush(stderr);
+	}
+}
+
+/*
+ * Handler installed by main() for the termination-type signals.
+ *
+ * Logs the signal number.  For a signal it recognizes, it removes the pid
+ * file, blocks further signals and raises GTMProxyAbortPending, which
+ * ServerLoop() checks on every iteration.  Any other signal is reported as
+ * unknown and otherwise ignored.
+ */
+static void
+GTMProxy_SigleHandler(int signal)
+{
+	bool		recognized;
+
+	fprintf(stderr, "Received signal %d", signal);
+
+	recognized = (signal == SIGKILL ||
+				  signal == SIGTERM ||
+				  signal == SIGQUIT ||
+				  signal == SIGINT ||
+				  signal == SIGHUP);
+
+	if (!recognized)
+	{
+		fprintf(stderr, "Unknown signal %d\n", signal);
+		return;
+	}
+
+	/*
+	 * XXX We should do a clean shutdown here.
+	 *
+	 * For now: delete the pid file before shutting down, mask signals and
+	 * flag the abort for the main loop.
+	 */
+	DeleteLockFile(GTM_PID_FILE);
+	PG_SETMASK(&BlockSig);
+	GTMProxyAbortPending = true;
+}
+
+/*
+ * Print the usage message to stdout.
+ *
+ * The option list shown here should match the option string passed to
+ * getopt() in main().  The progname parameter (which shadows the file-level
+ * variable of the same name) is substituted into the usage line.
+ */
+static void
+help(const char *progname)
+{
+	printf(_("This is the GTM proxy.\n\n"));
+	printf(_("Usage:\n  %s [OPTION]...\n\n"), progname);
+	printf(_("Options:\n"));
+	printf(_("  -h hostname     GTM proxy hostname/IP\n"));
+	printf(_("  -p port			GTM proxy port number\n"));
+	printf(_("  -s hostname		GTM server hostname/IP \n"));
+	printf(_("  -t port			GTM server port number\n"));
+	printf(_("  -n count		Number of worker threads\n"));
+	printf(_("  -D directory	GTM proxy working directory\n"));
+	printf(_("  -l filename		GTM proxy log file name \n"));
+	printf(_("  --help          show this help, then exit\n"));
+}
+
+
+/*
+ * Entry point of the GTM proxy.
+ *
+ * Parses the command line, performs base initialization, opens the listen
+ * sockets, records the options, installs the signal handlers, starts
+ * GTMProxyWorkerThreads worker threads and finally runs ServerLoop().
+ *
+ * Exits with status 1 on a usage error (unknown option, missing data
+ * directory, stray non-option argument) or when the options file cannot be
+ * written.
+ */
+int
+main(int argc, char *argv[])
+{
+	int			opt;
+	int			status;
+	int			i;
+
+	/*
+	 * Catch standard options before doing much else
+	 */
+	if (argc > 1)
+	{
+		if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
+		{
+			help(argv[0]);
+			exit(0);
+		}
+	}
+
+	ListenAddresses = GTM_PROXY_DEFAULT_HOSTNAME;
+	GTMProxyPortNumber = GTM_PROXY_DEFAULT_PORT;
+	GTMProxyWorkerThreads = GTM_PROXY_DEFAULT_WORKERS;
+
+	/*
+	 * Parse the command line options and set variables
+	 */
+	while ((opt = getopt(argc, argv, "h:p:n:D:l:s:t:")) != -1)
+	{
+		switch (opt)
+		{
+			case 'h':
+				/* Listen address of the proxy */
+				ListenAddresses = strdup(optarg);
+				break;
+
+			case 'p':
+				/* Port number for the proxy to listen on */
+				GTMProxyPortNumber = atoi(optarg);
+				break;
+
+			case 'n':
+				/* Number of worker threads */
+				GTMProxyWorkerThreads = atoi(optarg);
+				break;
+
+			case 'D':
+				/* Working directory of the proxy */
+				GTMProxyDataDir = strdup(optarg);
+				canonicalize_path(GTMProxyDataDir);
+				break;
+
+			case 'l':
+				/* The log file */
+				GTMLogFile = strdup(optarg);
+				break;
+
+			case 's':
+				/* GTM server host name */
+				GTMServerHost = strdup(optarg);
+				break;
+
+			case 't':
+				/* GTM server port number */
+				GTMServerPortNumber = atoi(optarg);
+				break;
+
+			default:
+				/*
+				 * Unknown option or missing option argument: do not start
+				 * with a half-understood command line.
+				 */
+				write_stderr("Try \"%s --help\" for more information.\n",
+							 progname);
+				exit(1);
+		}
+	}
+
+	if (GTMProxyDataDir == NULL)
+	{
+		write_stderr("GTM Proxy data directory must be specified\n");
+		write_stderr("Try \"%s --help\" for more information.\n",
+					 progname);
+		exit(1);
+	}
+	/*
+	 * GTM accepts no non-option switch arguments.
+	 */
+	if (optind < argc)
+	{
+		write_stderr("%s: invalid argument: \"%s\"\n",
+					 progname, argv[optind]);
+		write_stderr("Try \"%s --help\" for more information.\n",
+					 progname);
+		exit(1);
+	}
+
+	/*
+	 * Some basic initialization must happen before we do anything
+	 * useful
+	 */
+	BaseInit();
+
+	elog(DEBUG3, "Starting GTM proxy at (%s:%d)", ListenAddresses, GTMProxyPortNumber);
+
+	/*
+	 * Establish input sockets.
+	 */
+	for (i = 0; i < MAXLISTEN; i++)
+		ListenSocket[i] = -1;
+
+	if (ListenAddresses)
+	{
+		status = StreamServerPort(AF_UNSPEC, ListenAddresses,
+								  (unsigned short) GTMProxyPortNumber,
+								  ListenSocket, MAXLISTEN);
+		if (status != STATUS_OK)
+			ereport(FATAL,
+					(errmsg("could not create listen socket for \"%s\"",
+							ListenAddresses)));
+	}
+
+	/*
+	 * check that we have some socket to listen on
+	 */
+	if (ListenSocket[0] == -1)
+		ereport(FATAL,
+				(errmsg("no socket created for listening")));
+
+	/*
+	 * Record gtm proxy options.  We delay this till now to avoid recording
+	 * bogus options
+	 */
+	if (!CreateOptsFile(argc, argv))
+		exit(1);
+
+	/*
+	 * SIGKILL is deliberately not listed: it cannot be caught, so a handler
+	 * for it would never be installed.
+	 */
+	pqsignal(SIGHUP, GTMProxy_SigleHandler);
+	pqsignal(SIGQUIT, GTMProxy_SigleHandler);
+	pqsignal(SIGTERM, GTMProxy_SigleHandler);
+	pqsignal(SIGINT, GTMProxy_SigleHandler);
+
+	pqinitmask();
+
+	/*
+	 * Pre-fork so many worker threads
+	 */
+
+	for (i = 0; i < GTMProxyWorkerThreads; i++)
+	{
+		/*
+		 * XXX Start the worker thread
+		 */
+		if (GTMProxy_ThreadCreate(GTMProxy_ThreadMain) == NULL)
+		{
+			elog(ERROR, "failed to create a new thread");
+			return STATUS_ERROR;
+		}
+	}
+
+	/*
+	 * Accept any new connections. Add for each incoming connection to one of
+	 * the pre-forked threads.
+	 */
+	status = ServerLoop();
+
+	/*
+	 * ServerLoop probably shouldn't ever return, but if it does, close down.
+	 */
+	exit(status != STATUS_OK);
+
+	return 0;					/* not reached */
+}
+
+/*
+ * ConnCreate -- create a local connection data structure
+ *
+ * Allocates a zeroed Port and accepts the pending connection on serverFd
+ * into it.
+ *
+ * Returns the new Port, with conn_id set to InvalidGTMProxyConnID, or NULL
+ * if the connection could not be accepted.  Exits the process when the Port
+ * cannot be allocated.
+ */
+static Port *
+ConnCreate(int serverFd)
+{
+	Port	   *port;
+
+	if (!(port = (Port *) calloc(1, sizeof(Port))))
+	{
+		ereport(LOG,
+				(ENOMEM,
+				 errmsg("out of memory")));
+		exit(1);
+	}
+
+	if (StreamConnection(serverFd, port) != STATUS_OK)
+	{
+		if (port->sock >= 0)
+			StreamClose(port->sock);
+		ConnFree(port);
+
+		/* the Port is gone; it must not be touched past this point */
+		return NULL;
+	}
+
+	port->conn_id = InvalidGTMProxyConnID;
+
+	return port;
+}
+
+/*
+ * ConnFree -- free a local connection data structure
+ *
+ * Releases the memory of a Port obtained from ConnCreate() with free().
+ * The socket is not closed here; callers close it themselves beforehand.
+ */
+static void
+ConnFree(Port *conn)
+{
+	free(conn);
+}
+
+/*
+ * Main idle loop of the GTM proxy's main thread.
+ *
+ * Waits with select() on the listen sockets, accepts every pending
+ * connection and hands it over with GTMProxyAddConnection().  Before each
+ * wait it checks GTMProxyAbortPending and exits the process when a shutdown
+ * has been requested by the signal handler.
+ *
+ * Returns STATUS_ERROR if select() fails for a reason other than EINTR or
+ * EWOULDBLOCK; otherwise it does not return.
+ */
+static int
+ServerLoop(void)
+{
+	fd_set		readmask;
+	int			nSockets;
+
+	nSockets = initMasks(&readmask);
+
+	for (;;)
+	{
+		fd_set		rmask;
+		int			selres;
+
+		/*
+		 * Wait for a connection request to arrive.
+		 *
+		 * We wait at most one minute, to ensure that the other background
+		 * tasks handled below get done even when no requests are arriving.
+		 */
+		memcpy((char *) &rmask, (char *) &readmask, sizeof(fd_set));
+
+		PG_SETMASK(&UnBlockSig);
+
+		if (GTMProxyAbortPending)
+		{
+			/*
+			 * Tell everybody that we are shutting down
+			 *
+			 * !! TODO
+			 */
+			exit(1);
+		}
+
+		{
+			/* must set timeout each time; some OSes change it! */
+			struct timeval timeout;
+
+			timeout.tv_sec = 60;
+			timeout.tv_usec = 0;
+
+			selres = select(nSockets, &rmask, NULL, NULL, &timeout);
+		}
+
+		/*
+		 * Block all signals until we wait again.  (This makes it safe for our
+		 * signal handlers to do nontrivial work.)
+		 */
+		PG_SETMASK(&BlockSig);
+
+		/* Now check the select() result */
+		if (selres < 0)
+		{
+			if (errno != EINTR && errno != EWOULDBLOCK)
+			{
+				ereport(LOG,
+						(EACCES,
+						 errmsg("select() failed in postmaster: %m")));
+				return STATUS_ERROR;
+			}
+		}
+
+		/*
+		 * New connection pending on any of our sockets? If so, accept the
+		 * connection and add it to one of the worker threads.
+		 */
+		if (selres > 0)
+		{
+			int			i;
+
+			for (i = 0; i < MAXLISTEN; i++)
+			{
+				if (ListenSocket[i] == -1)
+					break;
+				if (FD_ISSET(ListenSocket[i], &rmask))
+				{
+					Port	   *port;
+
+					port = ConnCreate(ListenSocket[i]);
+					if (port)
+					{
+						if (GTMProxyAddConnection(port) != STATUS_OK)
+						{
+							/*
+							 * Release the connection before reporting:
+							 * elog(ERROR) may not return to this point, and
+							 * the socket and Port would then be leaked.
+							 */
+							StreamClose(port->sock);
+							ConnFree(port);
+							elog(ERROR, "Too many connections");
+						}
+					}
+				}
+			}
+		}
+	}
+}
+
+/*
+ * Build the select() read mask covering every socket we listen on.
+ *
+ * ListenSocket[] is scanned up to its first -1 entry (or MAXLISTEN slots).
+ * The return value is the highest descriptor added plus one, i.e. the value
+ * to pass as select()'s first argument; it is 0 when no socket is open.
+ */
+static int
+initMasks(fd_set *rmask)
+{
+	int			highest = -1;
+	int			slot;
+
+	FD_ZERO(rmask);
+
+	for (slot = 0; slot < MAXLISTEN && ListenSocket[slot] != -1; slot++)
+	{
+		FD_SET(ListenSocket[slot], rmask);
+
+		if (ListenSocket[slot] > highest)
+			highest = ListenSocket[slot];
+	}
+
+	return highest + 1;
+}
+
+/*
+ * The main worker thread routine
+ *
+ * argp is the GTMProxy_ThreadInfo of this worker.  The thread opens its own
+ * connection to the GTM server and then loops forever:
+ *
+ *	- rebuild the poll() array whenever thr_seqno shows that the set of
+ *	  connections assigned to this thread has changed, completing the
+ *	  handshake of newly added connections;
+ *	- poll the client sockets and read one command from each readable one;
+ *	- send the grouped pending commands plus a flush marker to the GTM server;
+ *	- read the results back and route them to the originating clients;
+ *	- drop the connections marked as disconnected.
+ *
+ * An ereport(ERROR) raised inside the loop resumes at the sigsetjmp() point
+ * below.  The function does not return in normal operation.
+ *
+ * NOTE(review): saved_seqno and ii are modified after sigsetjmp() without
+ * being volatile, so their values after an error jump are presumably not
+ * guaranteed -- confirm whether that matters here.
+ */
+void *
+GTMProxy_ThreadMain(void *argp)
+{
+	GTMProxy_ThreadInfo *thrinfo = (GTMProxy_ThreadInfo *)argp;
+	int qtype;
+	StringInfoData input_message;
+	sigjmp_buf  local_sigjmp_buf;
+	int32 saved_seqno = -1;
+	int ii, nrfds;
+	char gtm_connect_string[1024];
+
+	elog(DEBUG3, "Starting the connection helper thread");
+
+
+	/*
+	 * Create the memory context we will use in the main loop.
+	 *
+	 * MessageContext is reset once per iteration of the main loop, ie, upon
+	 * completion of processing of each command message from the client.
+	 *
+	 * This context is thread-specific
+	 */
+	MessageContext = AllocSetContextCreate(TopMemoryContext,
+										   "MessageContext",
+										   ALLOCSET_DEFAULT_MINSIZE,
+										   ALLOCSET_DEFAULT_INITSIZE,
+										   ALLOCSET_DEFAULT_MAXSIZE,
+										   false);
+
+	/*
+	 * Set up connection with the GTM server.  The host name comes from the
+	 * command line, so the write into the fixed-size buffer is bounded.
+	 */
+	snprintf(gtm_connect_string, sizeof (gtm_connect_string),
+			 "host=%s port=%d coordinator_id=1 proxy=1",
+			 GTMServerHost, GTMServerPortNumber);
+
+	thrinfo->thr_gtm_conn = PQconnectGTM(gtm_connect_string);
+
+	if (thrinfo->thr_gtm_conn == NULL)
+		elog(FATAL, "GTM connection failed");
+
+	/*
+	 * Get the input_message in the TopMemoryContext so that we don't need to
+	 * free/palloc it for every incoming message. Unlike Postgres, we don't
+	 * expect the incoming messages to be of arbitrary sizes
+	 */
+
+	initStringInfo(&input_message);
+
+	/*
+	 * If an exception is encountered, processing resumes here so we abort the
+	 * current transaction and start a new one.
+	 *
+	 * You might wonder why this isn't coded as an infinite loop around a
+	 * PG_TRY construct.  The reason is that this is the bottom of the
+	 * exception stack, and so with PG_TRY there would be no exception handler
+	 * in force at all during the CATCH part.  By leaving the outermost setjmp
+	 * always active, we have at least some chance of recovering from an error
+	 * during error recovery.  (If we get into an infinite loop thereby, it
+	 * will soon be stopped by overflow of elog.c's internal state stack.)
+	 */
+
+	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
+	{
+		/*
+		 * NOTE: if you are tempted to add more code in this if-block,
+		 * consider the high probability that it should be in
+		 * AbortTransaction() instead.	The only stuff done directly here
+		 * should be stuff that is guaranteed to apply *only* for outer-level
+		 * error recovery, such as adjusting the FE/BE protocol status.
+		 */
+
+		/* Report the error to the client and/or server log */
+		if (thrinfo->thr_conn_count > 0)
+		{
+			for (ii = 0; ii < thrinfo->thr_conn_count; ii++)
+			{
+				GTMProxy_ConnectionInfo *conninfo = thrinfo->thr_all_conns[ii];
+				/*
+				 * Now clean up disconnected connections
+				 */
+				if (conninfo->con_disconnected)
+				{
+					GTMProxy_ThreadRemoveConnection(thrinfo, conninfo);
+					pfree(conninfo);
+					ii--;
+				}
+				else
+				{
+					/*
+					 * Consume all the pending data on this connection and send
+					 * error report
+					 */
+					if (conninfo->con_pending_msg != MSG_TYPE_INVALID)
+					{
+						conninfo->con_port->PqRecvPointer = conninfo->con_port->PqRecvLength = 0;
+						conninfo->con_pending_msg = MSG_TYPE_INVALID;
+						EmitErrorReport(conninfo->con_port);
+					}
+				}
+			}
+		}
+		else
+			EmitErrorReport(NULL);
+
+		/*
+		 * Now return to normal top-level context and clear ErrorContext for
+		 * next time.
+		 */
+		MemoryContextSwitchTo(TopMemoryContext);
+		FlushErrorState();
+	}
+
+	/* We can now handle ereport(ERROR) */
+	PG_exception_stack = &local_sigjmp_buf;
+
+	for (;;)
+	{
+		ListCell *elem = NULL;
+		GTM_Result *res = NULL;
+
+		/*
+		 * Release storage left over from prior query cycle, and create a new
+		 * query input buffer in the cleared MessageContext.
+		 */
+		MemoryContextSwitchTo(MessageContext);
+		MemoryContextResetAndDeleteChildren(MessageContext);
+
+		/*
+		 * Just reset the input buffer to avoid repeated palloc/pfrees
+		 *
+		 * XXX We should consider resetting the MessageContext periodically to
+		 * handle any memory leaks
+		 */
+		resetStringInfo(&input_message);
+
+		/*
+		 * Check if there are any changes to the connection array assigned to
+		 * this thread. If so, we need to rebuild the fd array.
+		 */
+		GTM_MutexLockAcquire(&thrinfo->thr_lock);
+		if (saved_seqno != thrinfo->thr_seqno)
+		{
+			saved_seqno = thrinfo->thr_seqno;
+
+			while (thrinfo->thr_conn_count <= 0)
+			{
+				/*
+				 * No connections assigned to the thread. Wait for at least one
+				 * connection to be assigned to us
+				 */
+				GTM_CVWait(&thrinfo->thr_cv, &thrinfo->thr_lock);
+			}
+
+			memset(thrinfo->thr_poll_fds, 0, sizeof (thrinfo->thr_poll_fds));
+
+			/*
+			 * Now grab all the open connections. We are holding the lock so no
+			 * new connections can be added.
+			 */
+			for (ii = 0; ii < thrinfo->thr_conn_count; ii++)
+			{
+				GTMProxy_ConnectionInfo *conninfo = thrinfo->thr_all_conns[ii];
+
+				/*
+				 * We detect if the connection has been dropped to avoid
+				 * a segmentation fault.
+				 *
+				 * Its slot gets a negative fd so that poll() ignores the
+				 * entry; leaving the zeroed value would make poll() watch
+				 * descriptor 0 on behalf of this dead connection.
+				 */
+				if (conninfo->con_port == NULL)
+				{
+					conninfo->con_disconnected = true;
+					thrinfo->thr_poll_fds[ii].fd = -1;
+					continue;
+				}
+
+				/*
+				 * If this is a newly added connection, complete the handshake
+				 */
+				if (!conninfo->con_authenticated)
+					GTMProxy_HandshakeConnection(conninfo);
+
+				thrinfo->thr_poll_fds[ii].fd = conninfo->con_port->sock;
+				thrinfo->thr_poll_fds[ii].events = POLLIN;
+				thrinfo->thr_poll_fds[ii].revents = 0;
+			}
+		}
+		GTM_MutexLockRelease(&thrinfo->thr_lock);
+
+		while (true)
+		{
+			nrfds = poll(thrinfo->thr_poll_fds, thrinfo->thr_conn_count, 1000);
+
+			if (nrfds < 0)
+			{
+				if (errno == EINTR)
+					continue;
+				/* report the cause; the return value itself is always -1 */
+				elog(FATAL, "poll returned with error %d", errno);
+			}
+			else
+				break;
+		}
+
+		/* timeout: nothing to read, start the next round */
+		if (nrfds == 0)
+			continue;
+
+		/*
+		 * Initialize the lists
+		 */
+		thrinfo->thr_processed_commands = NIL;
+		memset(thrinfo->thr_pending_commands, 0, sizeof (thrinfo->thr_pending_commands));
+
+		/*
+		 * Now, read command from each of the connections that has some data to
+		 * be read.
+		 */
+		for (ii = 0; ii < thrinfo->thr_conn_count; ii++)
+		{
+			GTMProxy_ConnectionInfo *conninfo = thrinfo->thr_all_conns[ii];
+			thrinfo->thr_conn = conninfo;
+
+			if (thrinfo->thr_poll_fds[ii].revents & POLLHUP)
+			{
+				/*
+				 * The fd has become invalid. The connection is broken. Add it
+				 * to the remove_list and cleanup at the end of this round of
+				 * cleanup.
+				 */
+				GTMProxy_HandleDisconnect(thrinfo->thr_conn, thrinfo->thr_gtm_conn);
+				continue;
+			}
+
+			if (thrinfo->thr_poll_fds[ii].revents & POLLIN)
+			{
+				/*
+				 * (3) read a command (loop blocks here)
+				 */
+				qtype = ReadCommand(thrinfo->thr_conn, &input_message);
+
+				switch(qtype)
+				{
+					case 'C':
+						ProcessCommand(thrinfo->thr_conn, thrinfo->thr_gtm_conn,
+								&input_message);
+						break;
+
+					case 'X':
+					case EOF:
+						/*
+						 * Connection termination request
+						 *
+						 * Close the socket and remember the connection
+						 * as disconnected. All such connections will be
+						 * removed after the command processing is over. We
+						 * can't remove it just yet because we pass the slot id
+						 * to the server to quickly find the backend connection
+						 * while processing proxied messages.
+						 */
+						GTMProxy_HandleDisconnect(thrinfo->thr_conn, thrinfo->thr_gtm_conn);
+						break;
+					default:
+						/*
+						 * Also disconnect if protocol error
+						 */
+						GTMProxy_HandleDisconnect(thrinfo->thr_conn, thrinfo->thr_gtm_conn);
+						elog(ERROR, "Unexpected message, or client disconnected abruptly.");
+						break;
+				}
+
+			}
+		}
+
+		/*
+		 * Ok. All the commands are processed. Commands which can be proxied
+		 * directly have been already sent to the GTM server. Now, group the
+		 * remaining commands, send them to the server and flush the data.
+		 */
+		GTMProxy_ProcessPendingCommands(thrinfo);
+
+		/*
+		 * Add a special marker to tell the GTM server that we are done with
+		 * one round of messages and the GTM server should flush all the
+		 * pending responses after seeing this message.
+		 */
+		if (gtmpqPutMsgStart('F', true, thrinfo->thr_gtm_conn) ||
+			gtmpqPutInt(MSG_DATA_FLUSH, sizeof (GTM_MessageType), thrinfo->thr_gtm_conn) ||
+			gtmpqPutMsgEnd(thrinfo->thr_gtm_conn))
+			elog(ERROR, "Error sending flush message");
+
+		/*
+		 * Make sure everything is on wire now
+		 */
+		gtmpqFlush(thrinfo->thr_gtm_conn);
+
+		/*
+		 * Read back the responses and put them on to the right backend
+		 * connection.
+		 */
+		foreach(elem, thrinfo->thr_processed_commands)
+		{
+			GTMProxy_CommandInfo *cmdinfo = (GTMProxy_CommandInfo *)lfirst(elem);
+
+			/*
+			 * If this is a continuation of a multi-part command response, we
+			 * don't need to read another result from the stream. The previous
+			 * result contains our response and we should just read from it.
+			 */
+			if (cmdinfo->ci_res_index == 0)
+			{
+				if ((res = GTMPQgetResult(thrinfo->thr_gtm_conn)) == NULL)
+					elog(ERROR, "GTMPQgetResult failed");
+			}
+
+			ProcessResponse(thrinfo, cmdinfo, res);
+		}
+
+		list_free_deep(thrinfo->thr_processed_commands);
+		thrinfo->thr_processed_commands = NIL;
+
+		/*
+		 * Now clean up disconnected connections
+		 */
+		for (ii = 0; ii < thrinfo->thr_conn_count; ii++)
+		{
+			GTMProxy_ConnectionInfo *conninfo = thrinfo->thr_all_conns[ii];
+			if (conninfo->con_disconnected)
+			{
+				GTMProxy_ThreadRemoveConnection(thrinfo, conninfo);
+				pfree(conninfo);
+				ii--;
+			}
+		}
+	}
+
+	/* can't get here because the above loop never exits */
+	Assert(false);
+
+	return thrinfo;
+}
+
+/*
+ * Wrap a freshly accepted Port in a GTMProxy_ConnectionInfo and hand it to
+ * GTMProxy_ThreadAddConnection(), which assigns it to a worker thread.
+ *
+ * Returns STATUS_OK on success, STATUS_ERROR when the connection info cannot
+ * be allocated.
+ */
+static int
+GTMProxyAddConnection(Port *port)
+{
+	GTMProxy_ConnectionInfo *newconn;
+
+	newconn = (GTMProxy_ConnectionInfo *) palloc0(sizeof (GTMProxy_ConnectionInfo));
+	if (!newconn)
+	{
+		ereport(ERROR,
+				(ENOMEM,
+				 errmsg("Out of memory")));
+		return STATUS_ERROR;
+	}
+
+	elog(DEBUG3, "Started new connection");
+
+	/* remember the client socket, then pass the connection on to a worker */
+	newconn->con_port = port;
+	GTMProxy_ThreadAddConnection(newconn);
+
+	return STATUS_OK;
+}
+
+/*
+ * Dispatch one 'C' (command) message read from a client connection.
+ *
+ * The GTM message type is read from the start of input_message and the
+ * message is passed to the handler for its family (coordinator,
+ * transaction, snapshot or sequence).  An unrecognized type is reported at
+ * FATAL level.  On return the message type is recorded in
+ * conninfo->con_pending_msg.
+ *
+ * Declared static to match the prototype at the top of the file.
+ */
+static void
+ProcessCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn,
+		StringInfo input_message)
+{
+	GTM_MessageType mtype;
+
+	mtype = pq_getmsgint(input_message, sizeof (GTM_MessageType));
+
+	switch (mtype)
+	{
+		case MSG_UNREGISTER_COORD:
+			ProcessCoordinatorCommand(conninfo, gtm_conn, mtype, input_message);
+			break;
+
+		case MSG_TXN_BEGIN:
+		case MSG_TXN_BEGIN_GETGXID:
+		case MSG_TXN_BEGIN_GETGXID_AUTOVACUUM:
+		case MSG_TXN_PREPARE:
+		case MSG_TXN_COMMIT:
+		case MSG_TXN_ROLLBACK:
+		case MSG_TXN_GET_GXID:
+			ProcessTransactionCommand(conninfo, gtm_conn, mtype, input_message);
+			break;
+
+		case MSG_SNAPSHOT_GET:
+		case MSG_SNAPSHOT_GXID_GET:
+			ProcessSnapshotCommand(conninfo, gtm_conn, mtype, input_message);
+			break;
+
+		case MSG_SEQUENCE_INIT:
+		case MSG_SEQUENCE_GET_CURRENT:
+		case MSG_SEQUENCE_GET_NEXT:
+		case MSG_SEQUENCE_RESET:
+		case MSG_SEQUENCE_CLOSE:
+			ProcessSeqeunceCommand(conninfo, gtm_conn, mtype, input_message);
+			break;
+
+		default:
+			ereport(FATAL,
+					(EPROTO,
+					 errmsg("invalid frontend message type %d",
+							mtype)));
+	}
+
+	/* remember what this connection is waiting on a response for */
+	conninfo->con_pending_msg = mtype;
+}
+
+/*
+ * Turn one result received from the GTM server into the response for the
+ * client command described by cmdinfo, and send it on that client's
+ * connection.
+ *
+ * thrinfo	- the worker thread owning the connection
+ * cmdinfo	- the processed command; ci_mtype selects the handling and
+ *			  ci_res_index is this command's position inside a grouped result
+ * res		- the result read from the GTM server (shared by all the commands
+ *			  of one group)
+ *
+ * Grouped commands (begin-with-GXID, commit, rollback, snapshot) pick their
+ * own entry out of the multi-result; all other commands are plain proxied
+ * messages whose payload is forwarded as is.  In every handled case the
+ * connection's con_pending_msg is cleared afterwards.
+ */
+static void
+ProcessResponse(GTMProxy_ThreadInfo *thrinfo, GTMProxy_CommandInfo *cmdinfo,
+		GTM_Result *res)
+{
+	StringInfoData buf;
+	GlobalTransactionId gxid;
+
+	switch (cmdinfo->ci_mtype)
+	{
+		case MSG_TXN_BEGIN_GETGXID:
+			/*
+			 * This is a grouped command. We send just the transaction count to
+			 * the GTM server which responds back with the start GXID. We
+			 * derive our GXID from the start GXID and the our position in the
+			 * command queue
+			 */
+			if (res->gr_status == 0)
+			{
+				if (res->gr_type != TXN_BEGIN_GETGXID_MULTI_RESULT)
+					elog(ERROR, "Wrong result");
+				if (cmdinfo->ci_res_index >= res->gr_resdata.grd_txn_get_multi.txn_count)
+					elog(ERROR, "Too few GXIDs");
+
+				gxid = res->gr_resdata.grd_txn_get_multi.start_gxid + cmdinfo->ci_res_index;
+
+				/* Handle wraparound */
+				if (gxid < res->gr_resdata.grd_txn_get_multi.start_gxid)
+					gxid += FirstNormalGlobalTransactionId;
+
+				pq_beginmessage(&buf, 'S');
+				pq_sendint(&buf, TXN_BEGIN_GETGXID_RESULT, 4);
+				pq_sendbytes(&buf, (char *)&gxid, sizeof (GlobalTransactionId));
+				pq_endmessage(cmdinfo->ci_conn->con_port, &buf);
+				pq_flush(cmdinfo->ci_conn->con_port);
+			}
+			else
+			{
+				pq_beginmessage(&buf, 'E');
+				pq_sendbytes(&buf, res->gr_proxy_data, res->gr_msglen);
+				pq_endmessage(cmdinfo->ci_conn->con_port, &buf);
+				pq_flush(cmdinfo->ci_conn->con_port);
+			}
+			cmdinfo->ci_conn->con_pending_msg = MSG_TYPE_INVALID;
+			break;
+
+		case MSG_TXN_COMMIT:
+			if (res->gr_type != TXN_COMMIT_MULTI_RESULT)
+				elog(ERROR, "Wrong result");
+			/*
+			 * These are grouped messages. We send an array of GXIDs to commit
+			 * or rollback and the server sends us back an array of status
+			 * codes.
+			 */
+			if (cmdinfo->ci_res_index >= res->gr_resdata.grd_txn_rc_multi.txn_count)
+				elog(ERROR, "Too few GXIDs");
+
+			if (res->gr_resdata.grd_txn_rc_multi.status[cmdinfo->ci_res_index] == STATUS_OK)
+			{
+				pq_beginmessage(&buf, 'S');
+				pq_sendint(&buf, TXN_COMMIT_RESULT, 4);
+				pq_sendbytes(&buf, (char *)&cmdinfo->ci_data.cd_rc.gxid, sizeof (GlobalTransactionId));
+				pq_endmessage(cmdinfo->ci_conn->con_port, &buf);
+				pq_flush(cmdinfo->ci_conn->con_port);
+			}
+			else
+				ereport(ERROR2, (EINVAL, errmsg("Transaction commit failed")));
+			cmdinfo->ci_conn->con_pending_msg = MSG_TYPE_INVALID;
+			break;
+
+		case MSG_TXN_ROLLBACK:
+			if (res->gr_type != TXN_ROLLBACK_MULTI_RESULT)
+				elog(ERROR, "Wrong result");
+			/*
+			 * These are grouped messages. We send an array of GXIDs to commit
+			 * or rollback and the server sends us back an array of status
+			 * codes.
+			 */
+			if (cmdinfo->ci_res_index >= res->gr_resdata.grd_txn_rc_multi.txn_count)
+				elog(ERROR, "Too few GXIDs");
+
+			if (res->gr_resdata.grd_txn_rc_multi.status[cmdinfo->ci_res_index] == STATUS_OK)
+			{
+				pq_beginmessage(&buf, 'S');
+				pq_sendint(&buf, TXN_ROLLBACK_RESULT, 4);
+				pq_sendbytes(&buf, (char *)&cmdinfo->ci_data.cd_rc.gxid, sizeof (GlobalTransactionId));
+				pq_endmessage(cmdinfo->ci_conn->con_port, &buf);
+				pq_flush(cmdinfo->ci_conn->con_port);
+			}
+			else
+				/* name the operation that actually failed */
+				ereport(ERROR2, (EINVAL, errmsg("Transaction rollback failed")));
+			cmdinfo->ci_conn->con_pending_msg = MSG_TYPE_INVALID;
+			break;
+
+		case MSG_SNAPSHOT_GET:
+			if ((res->gr_type != SNAPSHOT_GET_RESULT) &&
+				(res->gr_type != SNAPSHOT_GET_MULTI_RESULT))
+				elog(ERROR, "Wrong result");
+
+			if (cmdinfo->ci_res_index >= res->gr_resdata.grd_txn_snap_multi.txn_count)
+				elog(ERROR, "Too few GXIDs");
+
+			if (res->gr_resdata.grd_txn_snap_multi.status[cmdinfo->ci_res_index] == STATUS_OK)
+			{
+				/* each client gets a single-transaction snapshot reply */
+				int txn_count = 1;
+				int status = STATUS_OK;
+
+				pq_beginmessage(&buf, 'S');
+				pq_sendint(&buf, SNAPSHOT_GET_RESULT, 4);
+				pq_sendbytes(&buf, (char *)&cmdinfo->ci_data.cd_snap.gxid, sizeof (GlobalTransactionId));
+				pq_sendbytes(&buf, (char *)&txn_count, sizeof (txn_count));
+				pq_sendbytes(&buf, (char *)&status, sizeof (status));
+				pq_sendbytes(&buf, (char *)&res->gr_snapshot.sn_xmin, sizeof (GlobalTransactionId));
+				pq_sendbytes(&buf, (char *)&res->gr_snapshot.sn_xmax, sizeof (GlobalTransactionId));
+				pq_sendbytes(&buf, (char *)&res->gr_snapshot.sn_recent_global_xmin, sizeof (GlobalTransactionId));
+				pq_sendint(&buf, res->gr_snapshot.sn_xcnt, sizeof (int));
+				pq_sendbytes(&buf, (char *)res->gr_snapshot.sn_xip,
+							 sizeof(GlobalTransactionId) * res->gr_snapshot.sn_xcnt);
+				pq_endmessage(cmdinfo->ci_conn->con_port, &buf);
+				pq_flush(cmdinfo->ci_conn->con_port);
+			}
+			else
+				ereport(ERROR2, (EINVAL, errmsg("snapshot request failed")));
+			cmdinfo->ci_conn->con_pending_msg = MSG_TYPE_INVALID;
+			break;
+
+		case MSG_TXN_BEGIN:
+		case MSG_TXN_BEGIN_GETGXID_AUTOVACUUM:
+		case MSG_TXN_PREPARE:
+		case MSG_TXN_GET_GXID:
+		case MSG_SNAPSHOT_GXID_GET:
+		case MSG_SEQUENCE_INIT:
+		case MSG_SEQUENCE_GET_CURRENT:
+		case MSG_SEQUENCE_GET_NEXT:
+		case MSG_SEQUENCE_RESET:
+		case MSG_SEQUENCE_CLOSE:
+			/*
+			 * The connection id echoed by the server must be valid and must
+			 * designate the very connection this command came from.
+			 */
+			if ((res->gr_proxyhdr.ph_conid == InvalidGTMProxyConnID) ||
+				(res->gr_proxyhdr.ph_conid >= GTM_PROXY_MAX_CONNECTIONS) ||
+				(thrinfo->thr_all_conns[res->gr_proxyhdr.ph_conid] != cmdinfo->ci_conn))
+				elog(PANIC, "Invalid response or synchronization loss");
+
+			/*
+			 * These are just proxied messages.. so just forward the response
+			 * back after stripping the conid part.
+			 *
+			 * !!TODO As we start adding support for message grouping for
+			 * messages, those message types would be removed from the above
+			 * and handled separately.
+			 */
+			switch (res->gr_status)
+			{
+				case 0:
+					pq_beginmessage(&buf, 'S');
+					pq_sendint(&buf, res->gr_type, 4);
+					pq_sendbytes(&buf, res->gr_proxy_data, res->gr_msglen);
+					pq_endmessage(cmdinfo->ci_conn->con_port, &buf);
+					pq_flush(cmdinfo->ci_conn->con_port);
+					break;
+
+				default:
+					pq_beginmessage(&buf, 'E');
+					pq_sendbytes(&buf, res->gr_proxy_data, res->gr_msglen);
+					pq_endmessage(cmdinfo->ci_conn->con_port, &buf);
+					pq_flush(cmdinfo->ci_conn->con_port);
+					break;
+			}
+			cmdinfo->ci_conn->con_pending_msg = MSG_TYPE_INVALID;
+			break;
+
+		default:
+			ereport(FATAL,
+					(EPROTO,
+					 errmsg("invalid frontend message type %d",
+							cmdinfo->ci_mtype)));
+	}
+}
+
+/* ----------------
+ *		ReadCommand pulls the next message off a client connection:
+ *		the type byte is returned, the message body is stored in inBuf.
+ *		EOF is returned if the client disconnected or the body could
+ *		not be read.
+ * ----------------
+ */
+static int
+ReadCommand(GTMProxy_ConnectionInfo *conninfo, StringInfo inBuf)
+{
+	/* the message type code is the first byte sent by the frontend */
+	int			msgtype = pq_getbyte(conninfo->con_port);
+
+	/* frontend disconnected */
+	if (msgtype == EOF)
+	{
+		ereport(COMMERROR,
+				(EPROTO,
+				 errmsg("unexpected EOF on client connection")));
+		return EOF;
+	}
+
+	/*
+	 * Check the type code before reading the body.  Only 'C' and 'X' are
+	 * accepted; anything else is garbage, which most likely means message
+	 * boundary sync is lost, and it is better to complain now than to use a
+	 * bogus length word.
+	 */
+	if (msgtype != 'C' && msgtype != 'X')
+		ereport(ERROR,
+				(EPROTO,
+				 errmsg("invalid frontend message type %d", msgtype)));
+
+	/*
+	 * Every message carries a length word right after the type code, so the
+	 * body can be read without knowing the message type.
+	 */
+	if (pq_getmessage(conninfo->con_port, inBuf, 0) != 0)
+		return EOF;			/* suitable message already logged */
+
+	return msgtype;
+}
+
+/*
+ * Handle a coordinator-level message. The coordinator ID is read from the
+ * front of the message body; MSG_UNREGISTER_COORD is the only message type
+ * that is acted upon.
+ */
+static void
+ProcessCoordinatorCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn,
+		GTM_MessageType mtype, StringInfo message)
+{
+	GTM_CoordinatorId coord_id;
+
+	coord_id = pq_getmsgint(message, sizeof (GTM_CoordinatorId));
+
+	if (mtype == MSG_UNREGISTER_COORD)
+	{
+		GTMProxy_UnregisterCoordinator(conninfo, coord_id);
+	}
+	else
+	{
+		Assert(0);			/* Shouldn't come here.. keep compiler quiet */
+	}
+
+	/* The coordinator ID is the only payload expected in the message. */
+	pq_getmsgend(message);
+}
+
+/*
+ * Handle a transaction-related message from a client.
+ *
+ *	- MSG_TXN_BEGIN_GETGXID, MSG_TXN_COMMIT and MSG_TXN_ROLLBACK are decoded
+ *	  here and queued with GTMProxy_CommandPending().
+ *	- MSG_TXN_BEGIN_GETGXID_AUTOVACUUM and MSG_TXN_PREPARE are handed to
+ *	  GTMProxy_ProxyCommand() without being decoded.
+ *	- MSG_TXN_BEGIN and MSG_TXN_GET_GXID are not supported yet.
+ */
+static void
+ProcessTransactionCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn,
+		GTM_MessageType mtype, StringInfo message)
+{
+	GTMProxy_CommandData pending;
+
+	if (mtype == MSG_TXN_BEGIN_GETGXID)
+	{
+		/* Payload: isolation level followed by a one-byte read-only flag. */
+		pending.cd_beg.iso_level = pq_getmsgint(message, sizeof (GTM_IsolationLevel));
+		pending.cd_beg.rdonly = pq_getmsgbyte(message);
+		GTMProxy_CommandPending(conninfo, mtype, pending);
+	}
+	else if (mtype == MSG_TXN_COMMIT || mtype == MSG_TXN_ROLLBACK)
+	{
+		const char *payload;
+
+		/*
+		 * Payload: a one-byte flag telling whether a GXID or a transaction
+		 * handle follows, then the identifier itself.
+		 */
+		pending.cd_rc.isgxid = pq_getmsgbyte(message);
+		if (pending.cd_rc.isgxid)
+		{
+			payload = pq_getmsgbytes(message, sizeof (GlobalTransactionId));
+			if (payload == NULL)
+				ereport(ERROR,
+						(EPROTO,
+						 errmsg("Message does not contain valid GXID")));
+			memcpy(&pending.cd_rc.gxid, payload, sizeof (GlobalTransactionId));
+		}
+		else
+		{
+			payload = pq_getmsgbytes(message, sizeof (GTM_TransactionHandle));
+			if (payload == NULL)
+				ereport(ERROR,
+						(EPROTO,
+						 errmsg("Message does not contain valid Transaction Handle")));
+			memcpy(&pending.cd_rc.handle, payload, sizeof (GTM_TransactionHandle));
+		}
+		pq_getmsgend(message);
+		GTMProxy_CommandPending(conninfo, mtype, pending);
+	}
+	else if (mtype == MSG_TXN_BEGIN || mtype == MSG_TXN_GET_GXID)
+	{
+		elog(FATAL, "Support not yet added for these message types");
+	}
+	else if (mtype == MSG_TXN_BEGIN_GETGXID_AUTOVACUUM || mtype == MSG_TXN_PREPARE)
+	{
+		GTMProxy_ProxyCommand(conninfo, gtm_conn, mtype, message);
+	}
+	else
+	{
+		Assert(0);			/* Shouldn't come here.. keep compiler quiet */
+	}
+}
+
+/*
+ * Handle a snapshot-related message from a client.
+ *
+ * A MSG_SNAPSHOT_GET body starts with a one-byte "can be grouped" flag. When
+ * the flag is clear the rest of the message is passed to
+ * GTMProxy_ProxyCommand(); otherwise the transaction identifier (GXID or
+ * transaction handle) is decoded and the request is queued with
+ * GTMProxy_CommandPending().
+ *
+ * NOTE(review): the flag byte has already been consumed when the message is
+ * proxied, so it is presumably not part of what gets forwarded -- confirm
+ * that the GTM server expects that.
+ *
+ * MSG_SNAPSHOT_GXID_GET is not supported yet.
+ */
+static void
+ProcessSnapshotCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn,
+		GTM_MessageType mtype, StringInfo message)
+{
+	GTMProxy_CommandData pending;
+	bool		groupable = false;
+	const char *payload;
+
+	if (mtype == MSG_SNAPSHOT_GXID_GET)
+	{
+		elog(ERROR, "Message not yet support");
+		return;
+	}
+
+	if (mtype != MSG_SNAPSHOT_GET)
+	{
+		Assert(0);			/* Shouldn't come here.. keep compiler quiet */
+		return;
+	}
+
+	groupable = pq_getmsgbyte(message);
+	if (!groupable)
+	{
+		GTMProxy_ProxyCommand(conninfo, gtm_conn, mtype, message);
+		return;
+	}
+
+	/* Groupable request: pick up the transaction identifier. */
+	pending.cd_snap.isgxid = pq_getmsgbyte(message);
+	if (pending.cd_snap.isgxid)
+	{
+		payload = pq_getmsgbytes(message, sizeof (GlobalTransactionId));
+		if (payload == NULL)
+			ereport(ERROR,
+					(EPROTO,
+					 errmsg("Message does not contain valid GXID")));
+		memcpy(&pending.cd_snap.gxid, payload, sizeof (GlobalTransactionId));
+	}
+	else
+	{
+		payload = pq_getmsgbytes(message, sizeof (GTM_TransactionHandle));
+		if (payload == NULL)
+			ereport(ERROR,
+					(EPROTO,
+					 errmsg("Message does not contain valid Transaction Handle")));
+		memcpy(&pending.cd_snap.handle, payload, sizeof (GTM_TransactionHandle));
+	}
+	pq_getmsgend(message);
+	GTMProxy_CommandPending(conninfo, mtype, pending);
+}
+
+/*
+ * Handle a sequence-related message from a client.
+ *
+ * Sequence messages are proxied as they are: GTMProxy_ProxyCommand() adds
+ * the connection identifier so that the response can be quickly sent back
+ * to the right backend.
+ */
+static void
+ProcessSeqeunceCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn,
+		GTM_MessageType mtype, StringInfo message)
+{
+	/*
+	 * Plain call: a "return <expression>;" statement is not allowed in a
+	 * function returning void.
+	 */
+	GTMProxy_ProxyCommand(conninfo, gtm_conn, mtype, message);
+}
+
+/*
+ * Proxy the incoming message to the GTM server after adding our own identifier
+ * to it. The rest of the message is forwarded as it is without even reading
+ * its contents.
+ *
+ * conninfo	- client connection the message came from; its con_id is
+ *			  written into the proxy header
+ * gtm_conn	- connection to the GTM server the message is written to
+ * mtype	- message type, sent right after the proxy header
+ * message	- client message; everything not yet read from it is forwarded
+ *
+ * A GTMProxy_CommandInfo entry for the message is appended to the calling
+ * thread's thr_processed_commands list.
+ */
+static void
+GTMProxy_ProxyCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn,
+		GTM_MessageType mtype, StringInfo message)
+{
+	GTMProxy_CommandInfo *cmdinfo;
+	GTMProxy_ThreadInfo *thrinfo = GetMyThreadInfo;
+	GTM_ProxyMsgHeader proxyhdr;
+	int			unread_len;
+	const char *unread_data;
+
+	proxyhdr.ph_conid = conninfo->con_id;
+
+	/*
+	 * Determine the unread length BEFORE fetching the bytes. Fetching the
+	 * bytes consumes them, and C leaves the evaluation order of function
+	 * arguments unspecified, so computing both inside one argument list
+	 * could yield a length of zero.
+	 */
+	unread_len = pq_getmsgunreadlen(message);
+	unread_data = pq_getmsgbytes(message, unread_len);
+
+	 /* Start the message. */
+	if (gtmpqPutMsgStart('C', true, gtm_conn) ||
+		gtmpqPutnchar((char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader), gtm_conn) ||
+		gtmpqPutInt(mtype, sizeof (GTM_MessageType), gtm_conn) ||
+		gtmpqPutnchar(unread_data, unread_len, gtm_conn))
+		elog(ERROR, "Error proxing data");
+
+	/*
+	 * Add the message to the list of commands already handed to the GTM
+	 * server
+	 */
+	cmdinfo = palloc0(sizeof (GTMProxy_CommandInfo));
+	cmdinfo->ci_mtype = mtype;
+	cmdinfo->ci_conn = conninfo;
+	cmdinfo->ci_res_index = 0;
+	thrinfo->thr_processed_commands = lappend(thrinfo->thr_processed_commands, cmdinfo);
+
+	/* Finish the message. */
+	if (gtmpqPutMsgEnd(gtm_conn))
+		elog(ERROR, "Error finishing the message");
+
+	return;
+}
+
+
+/*
+ * Remember an incoming message, together with its decoded payload, on the
+ * calling thread's per-message-type pending list. The messages collected
+ * for a type are later sent to the GTM server as one grouped message.
+ */
+static void
+GTMProxy_CommandPending(GTMProxy_ConnectionInfo *conninfo, GTM_MessageType mtype,
+		GTMProxy_CommandData cmd_data)
+{
+	GTMProxy_ThreadInfo *thrinfo = GetMyThreadInfo;
+	GTMProxy_CommandInfo *entry = palloc0(sizeof (GTMProxy_CommandInfo));
+
+	/* Describe the command ... */
+	entry->ci_conn = conninfo;
+	entry->ci_mtype = mtype;
+	entry->ci_data = cmd_data;
+	entry->ci_res_index = 0;
+
+	/* ... and queue it on the list kept for its message type. */
+	thrinfo->thr_pending_commands[mtype] =
+		lappend(thrinfo->thr_pending_commands[mtype], entry);
+}
+/*
+ * Record the coordinator ID announced by the client on the connection's
+ * port structure.
+ */
+static void
+GTMProxy_RegisterCoordinator(GTMProxy_ConnectionInfo *conninfo, GTM_CoordinatorId cid)
+{
+	elog(DEBUG3, "Registering coordinator with cid %d", cid);
+	conninfo->con_port->coordinator_id = cid;
+}
+
+
+/*
+ * Unregister a coordinator.
+ *
+ * Placeholder: nothing is done yet and both arguments are unused.
+ */
+static void
+GTMProxy_UnregisterCoordinator(GTMProxy_ConnectionInfo *conninfo, GTM_CoordinatorId cid)
+{
+	/*
+	 * TODO Do a clean shutdown
+	 */
+	return;
+}
+
+
+/*
+ * Perform the initial handshake with a newly connected client.
+ *
+ * The client must start with a startup message: a type byte 'A' followed by
+ * a length-prefixed body holding a GTM_StartupPacket. The coordinator ID
+ * found in the packet (sp_cid) is registered on the connection, a dummy
+ * authentication request message 'R' is sent back and the connection is
+ * marked as authenticated.
+ *
+ * An ERROR is raised if the first byte is missing or is not 'A', or if the
+ * message body cannot be read.
+ */
+static void
+GTMProxy_HandshakeConnection(GTMProxy_ConnectionInfo *conninfo)
+{
+	/*
+	 * pq_getbyte() hands back an int so that EOF can be told apart from any
+	 * valid byte value; keep it in an int rather than truncating to char.
+	 */
+	int startup_type;
+	GTM_StartupPacket sp;
+	StringInfoData inBuf;
+	StringInfoData buf;
+
+	startup_type = pq_getbyte(conninfo->con_port);
+
+	/* Report a vanished client as such instead of printing EOF with %c */
+	if (startup_type == EOF)
+		ereport(ERROR,
+				(EPROTO,
+				 errmsg("Expecting a startup message, but received EOF")));
+
+	if (startup_type != 'A')
+		ereport(ERROR,
+				(EPROTO,
+				 errmsg("Expecting a startup message, but received %c",
+					 startup_type)));
+
+	initStringInfo(&inBuf);
+
+	/*
+	 * All frontend messages have a length word next
+	 * after the type code; we can read the message contents independently of
+	 * the type.
+	 */
+	if (pq_getmessage(conninfo->con_port, &inBuf, 0))
+		ereport(ERROR,
+				(EPROTO,
+				 errmsg("Expecting coordinator ID, but received EOF")));
+
+	/* The body is a GTM_StartupPacket and nothing else */
+	memcpy(&sp,
+		   pq_getmsgbytes(&inBuf, sizeof (GTM_StartupPacket)),
+		   sizeof (GTM_StartupPacket));
+	pq_getmsgend(&inBuf);
+
+	GTMProxy_RegisterCoordinator(conninfo, sp.sp_cid);
+
+	/*
+	 * Send a dummy authentication request message 'R' as the client
+	 * expects that in the current protocol
+	 */
+	pq_beginmessage(&buf, 'R');
+	pq_endmessage(conninfo->con_port, &buf);
+	pq_flush(conninfo->con_port);
+
+	conninfo->con_authenticated = true;
+
+	elog(DEBUG3, "Sent connection authentication message to the client");
+}
+
+/*
+ * Handle the disconnection of a client.
+ *
+ * The client side of the connection is torn down first (socket closed, port
+ * structure freed), then a MSG_BACKEND_DISCONNECT message carrying the
+ * connection identifier is written to the GTM server connection.
+ */
+static void
+GTMProxy_HandleDisconnect(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn)
+{
+	GTM_ProxyMsgHeader hdr;
+	int			failed;
+
+	/* Release everything attached to the client connection. */
+	conninfo->con_disconnected = true;
+	if (conninfo->con_port->sock > 0)
+		StreamClose(conninfo->con_port->sock);
+	ConnFree(conninfo->con_port);
+	conninfo->con_port = NULL;
+
+	hdr.ph_conid = conninfo->con_id;
+
+	/* Build the message piece by piece, stopping at the first failure. */
+	failed = gtmpqPutMsgStart('C', true, gtm_conn);
+	if (!failed)
+		failed = gtmpqPutnchar((char *)&hdr, sizeof (GTM_ProxyMsgHeader), gtm_conn);
+	if (!failed)
+		failed = gtmpqPutInt(MSG_BACKEND_DISCONNECT, sizeof (GTM_MessageType), gtm_conn);
+	if (failed)
+		elog(ERROR, "Error proxing data");
+
+	/* Close the message. */
+	if (gtmpqPutMsgEnd(gtm_conn))
+		elog(ERROR, "Error finishing the message");
+}
+
+/*
+ * Process all the pending messages now.
+ *
+ * For every message type that has commands queued on
+ * thrinfo->thr_pending_commands[], one grouped ("_MULTI") message is written
+ * to the GTM server connection. It consists of the proxy header with an
+ * invalid connection id, the grouped message type, the number of commands
+ * and then the per-command payload. Each command records its position
+ * inside the group in ci_res_index. Once the message is finished, the whole
+ * list is moved to thrinfo->thr_processed_commands.
+ *
+ * Only MSG_TXN_BEGIN_GETGXID, MSG_TXN_COMMIT, MSG_TXN_ROLLBACK and
+ * MSG_SNAPSHOT_GET can be grouped; pending commands of any other type raise
+ * an ERROR.
+ */
+static void
+GTMProxy_ProcessPendingCommands(GTMProxy_ThreadInfo *thrinfo)
+{
+	int ii;
+	GTMProxy_CommandInfo *cmdinfo = NULL;
+	GTM_ProxyMsgHeader proxyhdr;
+	GTM_Conn *gtm_conn = thrinfo->thr_gtm_conn;
+	ListCell *elem = NULL;
+
+	for (ii = 0; ii < MSG_TYPE_COUNT; ii++)
+	{
+		int res_index = 0;
+
+		/* Nothing queued for this message type */
+		if (list_length(thrinfo->thr_pending_commands[ii]) == 0)
+			continue;
+
+		/*
+		 * Start a new group message and fill in the headers
+		 */
+		proxyhdr.ph_conid = InvalidGTMProxyConnID;
+
+		if (gtmpqPutMsgStart('C', true, gtm_conn) ||
+			gtmpqPutnchar((char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader), gtm_conn))
+			elog(ERROR, "Error proxing data");
+
+		switch (ii)
+		{
+			case MSG_TXN_BEGIN_GETGXID:
+				if (gtmpqPutInt(MSG_TXN_BEGIN_GETGXID_MULTI, sizeof (GTM_MessageType), gtm_conn) ||
+					gtmpqPutInt(list_length(thrinfo->thr_pending_commands[ii]), sizeof(int), gtm_conn))
+					elog(ERROR, "Error sending data");
+
+				/* Per command: isolation level, read-only flag, connection id */
+				foreach (elem, thrinfo->thr_pending_commands[ii])
+				{
+					cmdinfo = (GTMProxy_CommandInfo *)lfirst(elem);
+					Assert(cmdinfo->ci_mtype == ii);
+					cmdinfo->ci_res_index = res_index++;
+					if (gtmpqPutInt(cmdinfo->ci_data.cd_beg.iso_level,
+								sizeof (GTM_IsolationLevel), gtm_conn) ||
+						gtmpqPutc(cmdinfo->ci_data.cd_beg.rdonly, gtm_conn) ||
+						gtmpqPutInt(cmdinfo->ci_conn->con_id, sizeof (GTMProxy_ConnID), gtm_conn))
+						elog(ERROR, "Error sending data");
+				}
+
+				/* Finish the message. */
+				if (gtmpqPutMsgEnd(gtm_conn))
+					elog(ERROR, "Error finishing the message");
+
+				/*
+				 * Move the entire list to the processed command
+				 */
+				thrinfo->thr_processed_commands = list_concat(thrinfo->thr_processed_commands,
+						thrinfo->thr_pending_commands[ii]);
+				thrinfo->thr_pending_commands[ii] = NIL;
+				break;
+
+			case MSG_TXN_COMMIT:
+			case MSG_TXN_ROLLBACK:
+				/*
+				 * Commit and rollback share the same payload layout; only
+				 * the grouped message type differs.
+				 */
+				if (gtmpqPutInt((ii == MSG_TXN_COMMIT) ?
+								MSG_TXN_COMMIT_MULTI : MSG_TXN_ROLLBACK_MULTI,
+								sizeof (GTM_MessageType), gtm_conn) ||
+					gtmpqPutInt(list_length(thrinfo->thr_pending_commands[ii]), sizeof(int), gtm_conn))
+					elog(ERROR, "Error sending data");
+
+				/* Per command: "is GXID" flag, then the GXID or the handle */
+				foreach (elem, thrinfo->thr_pending_commands[ii])
+				{
+					cmdinfo = (GTMProxy_CommandInfo *)lfirst(elem);
+					Assert(cmdinfo->ci_mtype == ii);
+					cmdinfo->ci_res_index = res_index++;
+					if (cmdinfo->ci_data.cd_rc.isgxid)
+					{
+						if (gtmpqPutc(true, gtm_conn) ||
+							gtmpqPutnchar((char *)&cmdinfo->ci_data.cd_rc.gxid,
+								sizeof (GlobalTransactionId), gtm_conn))
+							elog(ERROR, "Error sending data");
+					}
+					else
+					{
+						if (gtmpqPutc(false, gtm_conn) ||
+							gtmpqPutnchar((char *)&cmdinfo->ci_data.cd_rc.handle,
+								sizeof (GTM_TransactionHandle), gtm_conn))
+							elog(ERROR, "Error sending data");
+					}
+				}
+
+				/* Finish the message. */
+				if (gtmpqPutMsgEnd(gtm_conn))
+					elog(ERROR, "Error finishing the message");
+
+				/*
+				 * Move the entire list to the processed command
+				 */
+				thrinfo->thr_processed_commands = list_concat(thrinfo->thr_processed_commands,
+						thrinfo->thr_pending_commands[ii]);
+				thrinfo->thr_pending_commands[ii] = NIL;
+				break;
+
+			case MSG_SNAPSHOT_GET:
+				if (gtmpqPutInt(MSG_SNAPSHOT_GET_MULTI, sizeof (GTM_MessageType), gtm_conn) ||
+					gtmpqPutInt(list_length(thrinfo->thr_pending_commands[ii]), sizeof(int), gtm_conn))
+					elog(ERROR, "Error sending data");
+
+				/*
+				 * Per command: "is GXID" flag, then the GXID or the handle.
+				 * Snapshot requests are queued with their data in cd_snap,
+				 * so read it back from cd_snap as well.
+				 */
+				foreach (elem, thrinfo->thr_pending_commands[ii])
+				{
+					cmdinfo = (GTMProxy_CommandInfo *)lfirst(elem);
+					Assert(cmdinfo->ci_mtype == ii);
+					cmdinfo->ci_res_index = res_index++;
+					if (cmdinfo->ci_data.cd_snap.isgxid)
+					{
+						if (gtmpqPutc(true, gtm_conn) ||
+							gtmpqPutnchar((char *)&cmdinfo->ci_data.cd_snap.gxid,
+								sizeof (GlobalTransactionId), gtm_conn))
+							elog(ERROR, "Error sending data");
+					}
+					else
+					{
+						if (gtmpqPutc(false, gtm_conn) ||
+							gtmpqPutnchar((char *)&cmdinfo->ci_data.cd_snap.handle,
+								sizeof (GTM_TransactionHandle), gtm_conn))
+							elog(ERROR, "Error sending data");
+					}
+				}
+
+				/* Finish the message. */
+				if (gtmpqPutMsgEnd(gtm_conn))
+					elog(ERROR, "Error finishing the message");
+
+				/*
+				 * Move the entire list to the processed command
+				 */
+				thrinfo->thr_processed_commands = list_concat(thrinfo->thr_processed_commands,
+						thrinfo->thr_pending_commands[ii]);
+				thrinfo->thr_pending_commands[ii] = NIL;
+				break;
+
+			default:
+				elog(ERROR, "This message type (%d) can not be grouped together", ii);
+		}
+
+	}
+}
+
+/*
+ * Validate the proposed data directory (GTMProxyDataDir).
+ *
+ * A missing directory is created with mode 0700. The path must then turn
+ * out to be a directory and, except on Windows/Cygwin, must be owned by the
+ * effective user of this process. Any violation is reported at FATAL level.
+ */
+static void
+checkDataDir(void)
+{
+	struct stat dir_stat;
+
+	Assert(GTMProxyDataDir);
+
+	/* Probe the path until stat() succeeds, creating it when it is absent */
+	while (stat(GTMProxyDataDir, &dir_stat) != 0)
+	{
+		if (errno != ENOENT)
+		{
+			ereport(FATAL,
+					(EPERM,
+				 errmsg("could not read permissions of directory \"%s\": %m",
+						GTMProxyDataDir)));
+			break;
+		}
+
+		/* Not there yet: create it, then go around and stat() it again */
+		if (mkdir(GTMProxyDataDir, 0700) != 0)
+		{
+			ereport(FATAL,
+					(errno,
+					 errmsg("failed to create the directory \"%s\"",
+						 GTMProxyDataDir)));
+		}
+	}
+
+	/* A later chdir would fail on a non-directory anyway; complain early */
+	if (!S_ISDIR(dir_stat.st_mode))
+		ereport(FATAL,
+				(EINVAL,
+				 errmsg("specified data directory \"%s\" is not a directory",
+						GTMProxyDataDir)));
+
+	/*
+	 * The directory has to belong to our own userid. This ownership test is
+	 * part of the interlock that keeps two servers from starting in the same
+	 * directory (see CreateLockFile()), so it must not be removed or
+	 * weakened.
+	 *
+	 * XXX can we safely enable this check on Windows?
+	 */
+#if !defined(WIN32) && !defined(__CYGWIN__)
+	if (dir_stat.st_uid != geteuid())
+		ereport(FATAL,
+				(EINVAL,
+				 errmsg("data directory \"%s\" has wrong ownership",
+						GTMProxyDataDir),
+				 errhint("The server must be started by the user that owns the data directory.")));
+#endif
+}
+
+/*
+ * Make GTMProxyDataDir the current working directory, so that relative
+ * paths can be used for files in and under the data directory. A failing
+ * chdir() is reported at FATAL level.
+ */
+static void
+ChangeToDataDir(void)
+{
+	/* Done as soon as the directory change succeeded */
+	if (chdir(GTMProxyDataDir) >= 0)
+		return;
+
+	ereport(FATAL,
+			(EINVAL,
+			 errmsg("could not change directory to \"%s\": %m",
+					GTMProxyDataDir)));
+}
+
+/*
+ * Create the data directory lockfile.
+ *
+ * When this is called, we must have already switched the working
+ * directory to DataDir, so we can just use a relative path.  This
+ * helps ensure that we are locking the directory we should be.
+ *
+ * The lock file is named GTM_PID_FILE; GTMProxyDataDir is passed along as
+ * the reference name used in error hints.
+ */
+static void
+CreateDataDirLockFile(void)
+{
+	CreateLockFile(GTM_PID_FILE, GTMProxyDataDir);
+}
+
+/*
+ * Create a lockfile.
+ *
+ * filename is the name of the lockfile to create.
+ * refName is only used in the hint of the "lock file already exists" error
+ * (the caller in this file passes the data directory).
+ *
+ * The file is created atomically (O_EXCL). If it already exists, the PID
+ * stored in it is examined: a file left behind by a process that is no
+ * longer running is removed and creation is retried, whereas a file owned by
+ * a live process is a FATAL error. On success the file contains our PID and
+ * GTMProxyDataDir, one per line.
+ *
+ * Every failure is reported at FATAL level.
+ */
+static void
+CreateLockFile(const char *filename, const char *refName)
+{
+	int			fd;
+	char		buffer[MAXPGPATH + 100];
+	int			ntries;
+	int			len;
+	int			encoded_pid;
+	pid_t		other_pid;
+	pid_t		my_pid = getpid();
+
+	/*
+	 * We need a loop here because of race conditions.	But don't loop forever
+	 * (for example, a non-writable $PGDATA directory might cause a failure
+	 * that won't go away).  100 tries seems like plenty.
+	 */
+	for (ntries = 0;; ntries++)
+	{
+		/*
+		 * Try to create the lock file --- O_EXCL makes this atomic.
+		 *
+		 * Think not to make the file protection weaker than 0600.	See
+		 * comments below.
+		 */
+		fd = open(filename, O_RDWR | O_CREAT | O_EXCL, 0600);
+		if (fd >= 0)
+			break;				/* Success; exit the retry loop */
+
+		/*
+		 * Couldn't create the pid file. Probably it already exists.
+		 * Anything other than EEXIST/EACCES, or too many retries, is fatal.
+		 */
+		if ((errno != EEXIST && errno != EACCES) || ntries > 100)
+			ereport(FATAL,
+					(EINVAL,
+					 errmsg("could not create lock file \"%s\": %m",
+							filename)));
+
+		/*
+		 * Read the file to get the old owner's PID.  Note race condition
+		 * here: file might have been deleted since we tried to create it.
+		 */
+		fd = open(filename, O_RDONLY, 0600);
+		if (fd < 0)
+		{
+			if (errno == ENOENT)
+				continue;		/* race condition; try again */
+			ereport(FATAL,
+					(EINVAL,
+					 errmsg("could not open lock file \"%s\": %m",
+							filename)));
+		}
+		if ((len = read(fd, buffer, sizeof(buffer) - 1)) < 0)
+			ereport(FATAL,
+					(EINVAL,
+					 errmsg("could not read lock file \"%s\": %m",
+							filename)));
+		close(fd);
+
+		/* Terminate what was read and parse the leading PID out of it */
+		buffer[len] = '\0';
+		encoded_pid = atoi(buffer);
+		other_pid = (pid_t) encoded_pid;
+
+		if (other_pid <= 0)
+			elog(FATAL, "bogus data in lock file \"%s\": \"%s\"",
+				 filename, buffer);
+
+		/*
+		 * Check to see if the other process still exists
+		 *
+		 * If the PID in the lockfile is our own PID or our parent's PID, then
+		 * the file must be stale (probably left over from a previous system
+		 * boot cycle).  We need this test because of the likelihood that a
+		 * reboot will assign exactly the same PID as we had in the previous
+		 * reboot.	Also, if there is just one more process launch in this
+		 * reboot than in the previous one, the lockfile might mention our
+		 * parent's PID.  We can reject that since we'd never be launched
+		 * directly by a competing postmaster.	We can't detect grandparent
+		 * processes unfortunately, but if the init script is written
+		 * carefully then all but the immediate parent shell will be
+		 * root-owned processes and so the kill test will fail with EPERM.
+		 *
+		 * We can treat the EPERM-error case as okay because that error
+		 * implies that the existing process has a different userid than we
+		 * do, which means it cannot be a competing postmaster.  A postmaster
+		 * cannot successfully attach to a data directory owned by a userid
+		 * other than its own.	(This is now checked directly in
+		 * checkDataDir(), but has been true for a long time because of the
+		 * restriction that the data directory isn't group- or
+		 * world-accessible.)  Also, since we create the lockfiles mode 600,
+		 * we'd have failed above if the lockfile belonged to another userid
+		 * --- which means that whatever process kill() is reporting about
+		 * isn't the one that made the lockfile.  (NOTE: this last
+		 * consideration is the only one that keeps us from blowing away a
+		 * Unix socket file belonging to an instance of Postgres being run by
+		 * someone else, at least on machines where /tmp hasn't got a
+		 * stickybit.)
+		 *
+		 * Windows hasn't got getppid(), but doesn't need it since it's not
+		 * using real kill() either...
+		 *
+		 * Normally kill() will fail with ESRCH if the given PID doesn't
+		 * exist.
+		 */
+		if (other_pid != my_pid
+#ifndef WIN32
+			&& other_pid != getppid()
+#endif
+			)
+		{
+			if (kill(other_pid, 0) == 0 ||
+				(errno != ESRCH && errno != EPERM))
+			{
+				/* lockfile belongs to a live process */
+				ereport(FATAL,
+						(EINVAL,
+						 errmsg("lock file \"%s\" already exists",
+								filename),
+						  errhint("Is another GTM proxy (PID %d) running in data directory \"%s\"?",
+								  (int) other_pid, refName)));
+			}
+		}
+
+		/*
+		 * Looks like nobody's home.  Unlink the file and try again to create
+		 * it.	Need a loop because of possible race condition against other
+		 * would-be creators.
+		 */
+		if (unlink(filename) < 0)
+			ereport(FATAL,
+					(EACCES,
+					 errmsg("could not remove old lock file \"%s\": %m",
+							filename),
+					 errhint("The file seems accidentally left over, but "
+						   "it could not be removed. Please remove the file "
+							 "by hand and try again.")));
+	}
+
+	/*
+	 * Successfully created the file, now fill it: our PID on the first line,
+	 * GTMProxyDataDir on the second.
+	 */
+	snprintf(buffer, sizeof(buffer), "%d\n%s\n",
+			 (int) my_pid, GTMProxyDataDir);
+	/* Clear errno so that a short write which leaves it unset is detectable */
+	errno = 0;
+	if (write(fd, buffer, strlen(buffer)) != strlen(buffer))
+	{
+		/* close() and unlink() may clobber errno; keep the write's value */
+		int			save_errno = errno;
+
+		close(fd);
+		unlink(filename);
+		/* if write didn't set errno, assume problem is no disk space */
+		errno = save_errno ? save_errno : ENOSPC;
+		ereport(FATAL,
+				(EACCES,
+				 errmsg("could not write lock file \"%s\": %m", filename)));
+	}
+	if (close(fd))
+	{
+		/* unlink() may clobber errno; keep the value set by close() for %m */
+		int			save_errno = errno;
+
+		unlink(filename);
+		errno = save_errno;
+		ereport(FATAL,
+				(EACCES,
+				 errmsg("could not write lock file \"%s\": %m", filename)));
+	}
+
+}
+
+/*
+ * Create the opts file.
+ *
+ * Writes the command-line arguments argv[1] .. argv[argc - 1], each enclosed
+ * in double quotes and preceded by a space, as a single line to the file
+ * OPTS_FILE in the current directory.
+ *
+ * Returns true on success. If the file cannot be created or closed, the
+ * problem is logged and false is returned.
+ */
+static bool
+CreateOptsFile(int argc, char *argv[])
+{
+#define OPTS_FILE	"gtm_proxy.opts"
+
+	FILE	   *optsfp = fopen(OPTS_FILE, "w");
+	int			argno = 1;		/* argv[0] is not recorded */
+
+	if (optsfp == NULL)
+	{
+		elog(LOG, "could not create file \"%s\": %m", OPTS_FILE);
+		return false;
+	}
+
+	while (argno < argc)
+	{
+		fprintf(optsfp, " \"%s\"", argv[argno]);
+		argno++;
+	}
+	fputs("\n", optsfp);
+
+	/* Write errors may only surface when the file is closed */
+	if (fclose(optsfp) != 0)
+	{
+		elog(LOG, "could not write file \"%s\": %m", OPTS_FILE);
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Delete the pid (lock) file with the given name. Failure to remove it is
+ * reported at FATAL level.
+ */
+static void
+DeleteLockFile(const char *filename)
+{
+	/* Nothing more to do once the file is gone */
+	if (unlink(filename) >= 0)
+		return;
+
+	ereport(FATAL,
+			(EACCES,
+			 errmsg("could not remove old lock file \"%s\": %m",
+					filename),
+			 errhint("The file seems accidentally left over, but "
+					 "it could not be removed. Please remove the file "
+					 "by hand and try again.")));
+}
diff --git a/src/gtm/proxy/proxy_thread.c b/src/gtm/proxy/proxy_thread.c
new file mode 100644
index 0000000000..844f2f70b4
--- /dev/null
+++ b/src/gtm/proxy/proxy_thread.c
@@ -0,0 +1,451 @@
+/*-------------------------------------------------------------------------
+ *
+ * proxy_thread.c
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <pthread.h>
+#include "gtm/gtm_proxy.h"
+#include "gtm/memutils.h"
+#include "gtm/libpq.h"
+
+static void *GTMProxy_ThreadMainWrapper(void *argp);
+static void GTMProxy_ThreadCleanup(void *argp);
+
+GTMProxy_Threads	GTMProxyThreadsData;
+GTMProxy_Threads *GTMProxyThreads = &GTMProxyThreadsData;
+
+#define GTM_PROXY_MIN_THREADS 32			/* Provision for minimum threads */
+#define GTM_PROXY_MAX_THREADS 1024		/* Max threads allowed in the GTMProxy */
+#define GTMProxyThreadsFull	(GTMProxyThreads->gt_thread_count == GTMProxyThreads->gt_array_size)	
+
+/*
+ * Add the given thrinfo structure to the global array, expanding it if
+ * necessary
+ *
+ * The array starts with GTM_PROXY_MIN_THREADS slots and is doubled whenever
+ * it is full, up to GTM_PROXY_MAX_THREADS slots. The first free slot is
+ * handed to thrinfo; its index is stored in thrinfo->thr_localid and
+ * returned.
+ *
+ * If the array is already at its maximum size, an ERROR is raised after the
+ * array lock has been released; -1 is returned should the error report
+ * return control to us.
+ */
+int
+GTMProxy_ThreadAdd(GTMProxy_ThreadInfo *thrinfo)
+{
+	int ii;
+
+	GTM_RWLockAcquire(&GTMProxyThreads->gt_lock, GTM_LOCKMODE_WRITE);
+
+	if (GTMProxyThreadsFull)
+	{
+		GTMProxy_ThreadInfo **threads;
+		uint32 newsize;
+
+		/*
+		 * TODO Optimize lock management by not holding any locks during memory
+		 * allocation
+		 */
+		if (GTMProxyThreads->gt_array_size == GTM_PROXY_MAX_THREADS)
+		{
+			/*
+			 * Do not raise the error with the lock held: nobody would ever
+			 * release it, and every later user of the thread array would
+			 * block forever.
+			 */
+			GTM_RWLockRelease(&GTMProxyThreads->gt_lock);
+			elog(ERROR, "Too many threads active");
+			return -1;
+		}
+
+		if (GTMProxyThreads->gt_array_size == 0)
+			newsize = GTM_PROXY_MIN_THREADS;
+		else
+		{
+			/*
+			 * We ran out of the array size. Just double the size, bound by the
+			 * upper limit
+			 */
+			newsize = GTMProxyThreads->gt_array_size * 2;
+		}
+
+		/* Can't have more than GTM_PROXY_MAX_THREADS */
+		if (newsize > GTM_PROXY_MAX_THREADS)
+			newsize = GTM_PROXY_MAX_THREADS;
+
+		/* Allocate the zeroed, larger array and carry over existing entries */
+		threads = (GTMProxy_ThreadInfo **)palloc0(sizeof (GTMProxy_ThreadInfo *) * newsize);
+		if (GTMProxyThreads->gt_threads != NULL)
+		{
+			void *old_ptr = GTMProxyThreads->gt_threads;
+
+			memcpy(threads, old_ptr,
+					GTMProxyThreads->gt_array_size * sizeof (GTMProxy_ThreadInfo *));
+			pfree(old_ptr);
+		}
+
+		GTMProxyThreads->gt_threads = threads;
+		GTMProxyThreads->gt_array_size = newsize;
+	}
+
+	/*
+	 * Now that we have free entries in the array, find a free slot and add the
+	 * thrinfo pointer to it.
+	 *
+	 * TODO Optimize this later by tracking few free slots and reusing them.
+	 * The free slots can be updated when a thread exits and reused when a new
+	 * thread is added to the pool.
+	 */
+	for (ii = 0; ii < GTMProxyThreads->gt_array_size; ii++)
+	{
+		if (GTMProxyThreads->gt_threads[ii] == NULL)
+		{
+			GTMProxyThreads->gt_threads[ii] = thrinfo;
+			GTMProxyThreads->gt_thread_count++;
+			break;
+		}
+	}
+	GTM_RWLockRelease(&GTMProxyThreads->gt_lock);
+
+	/*
+	 * Track the slot information in the thrinfo. This is useful to quickly
+	 * find the slot given the thrinfo structure.
+	 */
+	thrinfo->thr_localid = ii;
+	return ii;
+}
+
+/*
+ * Remove the given thrinfo structure from the global array.
+ *
+ * Placeholder: nothing is removed yet; the function always returns 0.
+ */
+int
+GTMProxy_ThreadRemove(GTMProxy_ThreadInfo *thrinfo)
+{
+	/*
+	 * XXX To be implemented
+	 */
+	return 0;
+}
+
+/*
+ * Create a new proxy worker thread.
+ *
+ * A GTMProxy_ThreadInfo structure is allocated and initialised (lock,
+ * condition variable, status, memory contexts), registered in the global
+ * thread array and then passed to a new thread, which runs the given
+ * "startroutine" through GTMProxy_ThreadMainWrapper().
+ *
+ * Returns the thread information structure. An ERROR is raised when the
+ * structure cannot be registered or the thread cannot be created.
+ */
+GTMProxy_ThreadInfo *
+GTMProxy_ThreadCreate(void *(* startroutine)(void *))
+{
+	GTMProxy_ThreadInfo *thrinfo;
+	int			rc;
+
+	/*
+	 * This still runs in the creating thread, so the structure is allocated
+	 * in that thread's current memory context and stays around until that
+	 * context goes away or the memory is freed explicitly.
+	 */
+	thrinfo = (GTMProxy_ThreadInfo *)palloc0(sizeof (GTMProxy_ThreadInfo));
+
+	GTM_MutexLockInit(&thrinfo->thr_lock);
+	GTM_CVInit(&thrinfo->thr_cv);
+
+	/*
+	 * Start out as GTM_PROXY_THREAD_STARTING; the thread itself is expected
+	 * to change the status once it is really running.
+	 */
+	thrinfo->thr_status = GTM_PROXY_THREAD_STARTING;
+
+	/* Register the structure in the global array before the thread starts */
+	if (GTMProxy_ThreadAdd(thrinfo) == -1)
+		elog(ERROR, "Error starting a new thread");
+
+	/*
+	 * Memory contexts are prepared before the thread is started.
+	 *
+	 * The thread's top-level context is created as a child of
+	 * TopMemoryContext and is private to the thread.
+	 */
+	thrinfo->thr_thread_context = AllocSetContextCreate(TopMemoryContext,
+														"TopMemoryContext",
+														ALLOCSET_DEFAULT_MINSIZE,
+														ALLOCSET_DEFAULT_INITSIZE,
+														ALLOCSET_DEFAULT_MAXSIZE,
+														false);
+
+	/*
+	 * The new thread is not running yet, so TopMemoryContext still refers to
+	 * the context of the calling thread; remember it as the parent.
+	 */
+	thrinfo->thr_parent_context = TopMemoryContext;
+
+	/*
+	 * Likewise, every thread owns a private error context, created as a
+	 * child of the creating thread's ErrorContext.
+	 */
+	thrinfo->thr_error_context = AllocSetContextCreate(ErrorContext,
+													   "ErrorContext",
+													   8 * 1024,
+													   8 * 1024,
+													   8 * 1024,
+													   false);
+
+	thrinfo->thr_startroutine = startroutine;
+
+	/*
+	 * Launch the thread. It receives thrinfo as its only argument, so any
+	 * further parameters have to travel inside that structure.
+	 */
+	rc = pthread_create(&thrinfo->thr_id, NULL, GTMProxy_ThreadMainWrapper,
+						thrinfo);
+	if (rc != 0)
+		ereport(ERROR,
+				(rc,
+				 errmsg("Failed to create a new thread: error %d", rc)));
+
+	return thrinfo;
+}
+
+/*
+ * Exit the current thread
+ *
+ * Placeholder: the function currently does nothing.
+ */
+void
+GTMProxy_ThreadExit(void)
+{
+	/* XXX To be implemented */
+}
+
+/*
+ * Wait for the given thread to terminate.
+ *
+ * Returns the result of pthread_join(); the thread's return value is
+ * discarded.
+ *
+ * NOTE(review): GTMProxy_ThreadMainWrapper() detaches every thread it
+ * starts, and joining a detached thread is not valid -- confirm whether
+ * this function can be used on proxy worker threads at all.
+ */
+int
+GTMProxy_ThreadJoin(GTMProxy_ThreadInfo *thrinfo)
+{
+	void	   *retval;			/* filled in by pthread_join(), not used */
+
+	return pthread_join(thrinfo->thr_id, &retval);
+}
+
+/*
+ * Get thread information for the given thread, identified by the
+ * thread_id
+ *
+ * Placeholder: no lookup is performed yet; NULL is always returned.
+ */
+GTMProxy_ThreadInfo *
+GTMProxy_GetThreadInfo(GTM_ThreadID thrid)
+{
+
+	return NULL;
+}
+
+/*
+ * Cleanup routine for the thread
+ *
+ * Installed with pthread_cleanup_push() by GTMProxy_ThreadMainWrapper();
+ * argp is the thread's GTMProxy_ThreadInfo. Closes the socket of the
+ * thread's connection, deletes the thread's message, error and thread
+ * memory contexts and finally clears the thread-specific thread info
+ * pointer.
+ */
+static void
+GTMProxy_ThreadCleanup(void *argp)
+{
+	GTMProxy_ThreadInfo *thrinfo = (GTMProxy_ThreadInfo *)argp;
+
+	elog(LOG, "Cleaning up thread state");
+
+	/*
+	 * Close the socket of the thread's connection.
+	 *
+	 * NOTE(review): assumes thr_conn and its con_port are non-NULL here --
+	 * confirm against the thread start routine.
+	 */
+	StreamClose(thrinfo->thr_conn->con_port->sock);
+
+	/*
+	 * Switch to the memory context of the main process so that we can free up
+	 * our memory contexts easily.
+	 *
+	 * XXX We don't setup cleanup handlers for the main process. So this
+	 * routine would never be called for the main process/thread
+	 */
+	MemoryContextSwitchTo(thrinfo->thr_parent_context);
+
+	MemoryContextDelete(thrinfo->thr_message_context);
+	thrinfo->thr_message_context = NULL;
+
+	MemoryContextDelete(thrinfo->thr_error_context);
+	thrinfo->thr_error_context = NULL;
+
+	MemoryContextDelete(thrinfo->thr_thread_context);
+	thrinfo->thr_thread_context = NULL;
+
+	/*
+	 * TODO Now cleanup the thrinfo structure itself and remove it from the global
+	 * array.
+	 */
+
+
+	/*
+	 * Reset the thread-specific information. This should be done only after we
+	 * are sure that memory contexts are not required
+	 *
+	 * Note: elog calls need memory contexts, so no elog calls beyond this
+	 * point.
+	 */
+	SetMyThreadInfo(NULL);
+
+	return;
+}
+
+/*
+ * A wrapper around the start routine of the thread. It detaches the thread,
+ * publishes thrinfo as thread-specific data and installs the cleanup handler
+ * before the main routine is started. Returns the thrinfo it was given.
+ */
+void *
+GTMProxy_ThreadMainWrapper(void *argp)
+{
+	GTMProxy_ThreadInfo *thrinfo = (GTMProxy_ThreadInfo *)argp;
+
+	/* Use pthread_self(): thr_id may not be stored yet by pthread_create() */
+	pthread_detach(pthread_self());
+	SetMyThreadInfo(thrinfo);
+	MemoryContextSwitchTo(TopMemoryContext);
+	
+	pthread_cleanup_push(GTMProxy_ThreadCleanup, thrinfo);
+	thrinfo->thr_startroutine(thrinfo);
+	pthread_cleanup_pop(1);
+
+	return thrinfo;
+}
+
+/*
+ * Add the given connection info structure to a worker thread selected in a
+ * round-robin manner. The caller is responsible only for accepting the
+ * connection; the rest, including authentication, is done by the worker
+ * thread when it finds a new entry in its connection list.
+ *
+ * Returns the GTMProxy_ThreadInfo of the thread which will serve this
+ * connection. Raises ERROR if that thread's connection array is already full.
+ */
+GTMProxy_ThreadInfo *
+GTMProxy_ThreadAddConnection(GTMProxy_ConnectionInfo *conninfo)
+{
+	GTMProxy_ThreadInfo *thrinfo = NULL;
+
+	/*
+	 * Get the next thread in the queue
+	 */
+	GTM_RWLockAcquire(&GTMProxyThreads->gt_lock, GTM_LOCKMODE_WRITE);
+
+	/*
+	 * Always start with thread 1 because thread 0 is the main thread
+	 */
+	if (GTMProxyThreads->gt_next_worker == 0)
+		GTMProxyThreads->gt_next_worker = 1;
+
+	thrinfo = GTMProxyThreads->gt_threads[GTMProxyThreads->gt_next_worker];
+
+	/*
+	 * Set the next worker thread before releasing the lock
+	 */
+	GTMProxyThreads->gt_next_worker++;
+	if (GTMProxyThreads->gt_next_worker == GTMProxyThreads->gt_thread_count)
+	   GTMProxyThreads->gt_next_worker = 1;
+
+	GTM_RWLockRelease(&GTMProxyThreads->gt_lock);	
+
+	/*
+	 * Lock the thrinfo structure to safely add the new connection to the
+	 * thread structure. The thread will see the connection when it queries the
+	 * socket descriptor in the next cycle
+	 */
+	GTM_MutexLockAcquire(&thrinfo->thr_lock);
+	
+	if (thrinfo->thr_conn_count >= GTM_PROXY_MAX_CONNECTIONS)
+	{
+		GTM_MutexLockRelease(&thrinfo->thr_lock);
+		elog(ERROR, "Too many connections");
+	}
+
+	/*
+	 * Save the array slotid in the conninfo structure. We send this to the GTM
+	 * server as an identifier which the GTM server sends us back in the
+	 * response. We use that information to route the response back to the
+	 * appropriate connection
+	 */
+	conninfo->con_id = thrinfo->thr_conn_count;
+	thrinfo->thr_all_conns[thrinfo->thr_conn_count] = conninfo;
+	thrinfo->thr_conn_count++;
+
+	/*
+	 * Now increment the seqno since a new connection is added to the array.
+	 * Before we do the next poll(), the fd array will be forced to be
+	 * reconstructed.
+	 */
+   	thrinfo->thr_seqno++;
+
+	/*
+	 * Signal the worker thread if it is waiting for connections to be added
+	 * to its queue
+	 *
+	 * XXX Maybe we can first check the condition that this is the first
+	 * connection in the array and also use signal instead of a bcast since
+	 * only one thread is waiting on the cv.
+	 */
+	GTM_CVBcast(&thrinfo->thr_cv);
+	GTM_MutexLockRelease(&thrinfo->thr_lock);
+
+	return thrinfo;
+}
+
+/*
+ * Remove conninfo from thrinfo's connection array and compact the array.
+ * Returns 0 on success; raises ERROR ("No such connection") if not found.
+ */
+int
+GTMProxy_ThreadRemoveConnection(GTMProxy_ThreadInfo *thrinfo, GTMProxy_ConnectionInfo *conninfo)
+{
+	int ii;
+
+	/*
+	 * Lock the thrinfo structure to safely remove the connection from the
+	 * thread structure.
+	 */
+	GTM_MutexLockAcquire(&thrinfo->thr_lock);
+	
+	for (ii = 0; ii < thrinfo->thr_conn_count; ii++)
+	{
+		if (thrinfo->thr_all_conns[ii] == conninfo)
+			break;
+	}
+
+	if (ii >= thrinfo->thr_conn_count)
+	{
+		GTM_MutexLockRelease(&thrinfo->thr_lock);
+		elog(ERROR, "No such connection");
+	}
+
+	/*
+	 * Is this the last entry in the array? If not, then copy the last
+	 * entry into this slot and mark the last slot as empty
+	 */
+	if ((ii + 1) < thrinfo->thr_conn_count)
+	{
+		/* Copy the last entry in this slot */
+		thrinfo->thr_all_conns[ii] = thrinfo->thr_all_conns[thrinfo->thr_conn_count - 1];
+
+		/* Mark the last slot free */
+		thrinfo->thr_all_conns[thrinfo->thr_conn_count - 1] = NULL;
+
+		/* Adjust the con_id to reflect the current slot in the array */
+		thrinfo->thr_all_conns[ii]->con_id = ii;
+	}
+	else
+	{
+		/* This is the last entry in the array. Just mark it free */
+		thrinfo->thr_all_conns[ii] = NULL;
+	}
+
+	thrinfo->thr_conn_count--;
+
+	/*
+	 * Bump the seqno so that the fd array is rebuilt before the next poll.
+	 */
+	thrinfo->thr_seqno++;
+	GTM_MutexLockRelease(&thrinfo->thr_lock);
+
+	return 0;
+}
diff --git a/src/include/access/gtm.h b/src/include/access/gtm.h
new file mode 100644
index 0000000000..66ca3f12c6
--- /dev/null
+++ b/src/include/access/gtm.h
@@ -0,0 +1,33 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm.h
+ * 
+ *	  Module interfacing with GTM definitions
+ *
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef ACCESS_GTM_H
+#define ACCESS_GTM_H
+
+#include "gtm/gtm_c.h"
+
+/* Configuration variables */
+extern char *GtmHost;
+extern int GtmPort;
+extern int GtmCoordinatorId;
+
+extern bool IsGTMConnected(void);
+extern void InitGTM(void);
+extern void CloseGTM(void);
+extern GlobalTransactionId BeginTranGTM(void);
+extern GlobalTransactionId BeginTranAutovacuumGTM(void);
+extern int CommitTranGTM(GlobalTransactionId gxid);
+extern int RollbackTranGTM(GlobalTransactionId gxid);
+extern GTM_Snapshot GetSnapshotGTM(GlobalTransactionId gxid, bool canbe_grouped);
+extern GTM_Sequence GetNextValGTM(char *seqname);
+extern int CreateSequenceGTM(char *seqname, GTM_Sequence increment, 
+		GTM_Sequence minval, GTM_Sequence maxval, GTM_Sequence startval,
+		bool cycle);
+extern int DropSequenceGTM(char *seqname);
+#endif /* ACCESS_GTM_H */
diff --git a/src/include/access/transam.h b/src/include/access/transam.h
index b23a663c53..a7a8230595 100644
--- a/src/include/access/transam.h
+++ b/src/include/access/transam.h
@@ -6,6 +6,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * $PostgreSQL: pgsql/src/include/access/transam.h,v 1.68 2009/05/08 03:21:35 momjian Exp $
  *
@@ -152,6 +153,11 @@ extern TransactionId TransactionIdLatest(TransactionId mainxid,
 extern XLogRecPtr TransactionIdGetCommitLSN(TransactionId xid);
 
 /* in transam/varsup.c */
+#ifdef PGXC  /* PGXC_DATANODE */
+extern void SetNextTransactionId(TransactionId xid);
+extern void SetForceXidFromGTM(bool value);
+extern bool GetForceXidFromGTM(void);
+#endif /* PGXC */
 extern TransactionId GetNewTransactionId(bool isSubXact);
 extern TransactionId ReadNewTransactionId(void);
 extern void SetTransactionIdLimit(TransactionId oldest_datfrozenxid,
diff --git a/src/include/access/xact.h b/src/include/access/xact.h
index 880b41b707..7cd8e165ec 100644
--- a/src/include/access/xact.h
+++ b/src/include/access/xact.h
@@ -6,6 +6,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * $PostgreSQL: pgsql/src/include/access/xact.h,v 1.98 2009/06/11 14:49:09 momjian Exp $
  *
@@ -18,7 +19,9 @@
 #include "nodes/pg_list.h"
 #include "storage/relfilenode.h"
 #include "utils/timestamp.h"
-
+#ifdef PGXC  /* PGXC_COORD */
+#include "gtm/gtm_c.h"
+#endif
 
 /*
  * Xact isolation levels
@@ -145,6 +148,9 @@ extern TransactionId GetTopTransactionId(void);
 extern TransactionId GetTopTransactionIdIfAny(void);
 extern TransactionId GetCurrentTransactionId(void);
 extern TransactionId GetCurrentTransactionIdIfAny(void);
+#ifdef PGXC  /* PGXC_COORD */
+extern GlobalTransactionId GetCurrentGlobalTransactionId(void);
+#endif
 extern SubTransactionId GetCurrentSubTransactionId(void);
 extern CommandId GetCurrentCommandId(bool used);
 extern TimestampTz GetCurrentTransactionStartTimestamp(void);
diff --git a/src/include/bootstrap/bootstrap.h b/src/include/bootstrap/bootstrap.h
index ab549eabb1..e8f96604ad 100644
--- a/src/include/bootstrap/bootstrap.h
+++ b/src/include/bootstrap/bootstrap.h
@@ -6,6 +6,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * $PostgreSQL: pgsql/src/include/bootstrap/bootstrap.h,v 1.51 2009/01/01 17:23:56 momjian Exp $
  *
@@ -71,6 +72,9 @@ typedef enum
 	StartupProcess,
 	BgWriterProcess,
 	WalWriterProcess
+#ifdef PGXC
+	,PoolerProcess
+#endif
 } AuxProcType;
 
 #endif   /* BOOTSTRAP_H */
diff --git a/src/include/catalog/dependency.h b/src/include/catalog/dependency.h
index fe04aab964..b2af292585 100644
--- a/src/include/catalog/dependency.h
+++ b/src/include/catalog/dependency.h
@@ -6,6 +6,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * $PostgreSQL: pgsql/src/include/catalog/dependency.h,v 1.40 2009/06/11 14:49:09 momjian Exp $
  *
@@ -146,6 +147,9 @@ typedef enum ObjectClass
 	OCLASS_FDW,					/* pg_foreign_data_wrapper */
 	OCLASS_FOREIGN_SERVER,		/* pg_foreign_server */
 	OCLASS_USER_MAPPING,		/* pg_user_mapping */
+#ifdef PGXC
+	OCLASS_PGXC_CLASS,		/* pgxc_class */
+#endif
 	MAX_OCLASS					/* MUST BE LAST */
 } ObjectClass;
 
diff --git a/src/include/catalog/heap.h b/src/include/catalog/heap.h
index 2d6eb3c34a..baa9ecaf49 100644
--- a/src/include/catalog/heap.h
+++ b/src/include/catalog/heap.h
@@ -6,6 +6,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * $PostgreSQL: pgsql/src/include/catalog/heap.h,v 1.91 2009/06/11 14:49:09 momjian Exp $
  *
@@ -107,4 +108,11 @@ extern void CheckAttributeNamesTypes(TupleDesc tupdesc, char relkind);
 
 extern void CheckAttributeType(const char *attname, Oid atttypid);
 
+#ifdef PGXC
+extern void AddRelationDistribution (Oid relid, 
+				DistributeBy *distributeby,
+				List 		 *parentOids,
+				TupleDesc	 descriptor);
+#endif
+
 #endif   /* HEAP_H */
diff --git a/src/include/catalog/indexing.h b/src/include/catalog/indexing.h
index ce117a8eec..5557021e30 100644
--- a/src/include/catalog/indexing.h
+++ b/src/include/catalog/indexing.h
@@ -7,6 +7,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * $PostgreSQL: pgsql/src/include/catalog/indexing.h,v 1.108 2009/06/11 14:49:09 momjian Exp $
  *
@@ -267,6 +268,11 @@ DECLARE_UNIQUE_INDEX(pg_user_mapping_oid_index, 174, on pg_user_mapping using bt
 DECLARE_UNIQUE_INDEX(pg_user_mapping_user_server_index, 175, on pg_user_mapping using btree(umuser oid_ops, umserver oid_ops));
 #define UserMappingUserServerIndexId	175
 
+#ifdef PGXC
+DECLARE_UNIQUE_INDEX(pgxc_class_pcrelid_index, 9002, on pgxc_class using btree(pcrelid oid_ops));
+#define PgxcClassPgxcRelIdIndexId 	9002
+#endif
+
 /* last step of initialization script: build the indexes declared above */
 BUILD_INDICES
 
diff --git a/src/include/catalog/pgxc_class.h b/src/include/catalog/pgxc_class.h
new file mode 100644
index 0000000000..2104e53e42
--- /dev/null
+++ b/src/include/catalog/pgxc_class.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2004-2007 EnterpriseDB Corporation. All Rights Reserved.
+ */
+#ifndef PGXC_CLASS_H
+#define PGXC_CLASS_H
+
+#include "nodes/parsenodes.h"
+
+#define PgxcClassRelationId  9001
+
+CATALOG(pgxc_class,9001) BKI_WITHOUT_OIDS
+{
+	Oid			pcrelid;
+	char		pclocatortype;
+	int2		pcattnum;
+	int2 		pchashalgorithm;
+	int2 		pchashbuckets;
+} FormData_pgxc_class;
+
+typedef FormData_pgxc_class *Form_pgxc_class;
+
+#define Natts_pgxc_class					5
+
+#define Anum_pgxc_class_pcrelid			1
+#define Anum_pgxc_class_pclocatortype	2
+#define Anum_pgxc_class_pcattnum			3
+#define Anum_pgxc_class_pchashalgorithm	4
+#define Anum_pgxc_class_pchashbuckets	5
+
+extern void PgxcClassCreate(Oid pcrelid,
+					char  pclocatortype,
+					int pcattnum,
+					int pchashalgorithm,
+					int pchashbuckets);
+
+extern void RemovePgxcClass(Oid pcrelid);
+
+#endif   /* PGXC_CLASS_H */
+
diff --git a/src/include/gtm/assert.h b/src/include/gtm/assert.h
new file mode 100644
index 0000000000..5c71363832
--- /dev/null
+++ b/src/include/gtm/assert.h
@@ -0,0 +1,72 @@
+/*-------------------------------------------------------------------------
+ *
+ * assert.h
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GTM_ASSERT_H
+#define GTM_ASSERT_H
+
+extern bool assert_enabled;
+
+/*
+ * USE_ASSERT_CHECKING, if defined, turns on all the assertions.
+ * - plai  9/5/90
+ *
+ * It should _NOT_ be defined in releases or in benchmark copies
+ */
+
+/*
+ * Trap
+ *		Generates an exception if the given condition is true.
+ */
+#define Trap(condition, errorType) \
+	do { \
+		if ((assert_enabled) && (condition)) \
+			ExceptionalCondition(CppAsString(condition), (errorType), \
+								 __FILE__, __LINE__); \
+	} while (0)
+
+/*
+ *	TrapMacro is the same as Trap but it's intended for use in macros:
+ *
+ *		#define foo(x) (AssertMacro(x != 0) && bar(x))
+ *
+ *	Isn't CPP fun?
+ */
+#define TrapMacro(condition, errorType) \
+	((bool) ((! assert_enabled) || ! (condition) || \
+			 (ExceptionalCondition(CppAsString(condition), (errorType), \
+								   __FILE__, __LINE__))))
+
+#ifndef USE_ASSERT_CHECKING
+#define Assert(condition)
+#define AssertMacro(condition)	((void)true)
+#define AssertArg(condition)
+#define AssertState(condition)
+#else
+#define Assert(condition) \
+		Trap(!(condition), "FailedAssertion")
+
+#define AssertMacro(condition) \
+		((void) TrapMacro(!(condition), "FailedAssertion"))
+
+#define AssertArg(condition) \
+		Trap(!(condition), "BadArgument")
+
+#define AssertState(condition) \
+		Trap(!(condition), "BadState")
+#endif   /* USE_ASSERT_CHECKING */
+
+extern int ExceptionalCondition(const char *conditionName,
+					 const char *errorType,
+					 const char *fileName, int lineNumber);
+
+#endif
diff --git a/src/include/gtm/elog.h b/src/include/gtm/elog.h
new file mode 100644
index 0000000000..49c463fa3e
--- /dev/null
+++ b/src/include/gtm/elog.h
@@ -0,0 +1,253 @@
+/*-------------------------------------------------------------------------
+ *
+ * elog.h
+ *	  POSTGRES error reporting/logging definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/utils/elog.h,v 1.98 2009/01/01 17:24:02 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef ELOG_H
+#define ELOG_H
+
+/* Error level codes */
+#define DEBUG5		10			/* Debugging messages, in categories of
+								 * decreasing detail. */
+#define DEBUG4		11
+#define DEBUG3		12
+#define DEBUG2		13
+#define DEBUG1		14			/* used by GUC debug_* variables */
+#define LOG			15			/* Server operational messages; sent only to
+								 * server log by default. */
+#define COMMERROR	16			/* Client communication problems; same as LOG
+								 * for server reporting, but never sent to
+								 * client. */
+#define INFO		17			/* Messages specifically requested by user
+								 * (eg VACUUM VERBOSE output); always sent to
+								 * client regardless of client_min_messages,
+								 * but by default not sent to server log. */
+#define NOTICE		18			/* Helpful messages to users about query
+								 * operation; sent to client and server log
+								 * by default. */
+#define WARNING		19			/* Warnings.  NOTICE is for expected messages
+								 * like implicit sequence creation by SERIAL.
+								 * WARNING is for unexpected messages. */
+#define ERROR		20			/* user error - abort transaction; return to
+								 * known state */
+#define ERROR2		21			/* user error - only send error message to the
+								 * client */
+#define FATAL		22			/* fatal error - abort process */
+#define PANIC		23			/* take down the other backends with me */
+
+ /* #define DEBUG DEBUG1 */	/* Backward compatibility with pre-7.3 */
+
+
+/* Which __func__ symbol do we have, if any? */
+#ifdef HAVE_FUNCNAME__FUNC
+#define PG_FUNCNAME_MACRO	__func__
+#else
+#ifdef HAVE_FUNCNAME__FUNCTION
+#define PG_FUNCNAME_MACRO	__FUNCTION__
+#else
+#define PG_FUNCNAME_MACRO	NULL
+#endif
+#endif
+
+/*
+ * ErrorData holds the data accumulated during any one ereport() cycle.
+ * Any non-NULL pointers must point to palloc'd data.
+ * (The const pointers are an exception; we assume they point at non-freeable
+ * constant strings.)
+ */
+typedef struct ErrorData
+{
+	int			elevel;			/* error level */
+	bool		output_to_server;		/* will report to server log? */
+	bool		output_to_client;		/* will report to client? */
+	bool		show_funcname;	/* true to force funcname inclusion */
+	const char *filename;		/* __FILE__ of ereport() call */
+	int			lineno;			/* __LINE__ of ereport() call */
+	const char *funcname;		/* __func__ of ereport() call */
+	const char *domain;			/* message domain */
+	char	   *message;		/* primary error message */
+	char	   *detail;			/* detail error message */
+	char	   *detail_log;		/* detail error message for server log only */
+	char	   *hint;			/* hint message */
+	char	   *context;		/* context message */
+	int			saved_errno;	/* errno at entry */
+} ErrorData;
+
+
+/*----------
+ * New-style error reporting API: to be used in this way:
+ *		ereport(ERROR,
+ *				(errcode(ERRCODE_UNDEFINED_CURSOR),
+ *				 errmsg("portal \"%s\" not found", stmt->portalname),
+ *				 ... other errxxx() fields as needed ...));
+ *
+ * The error level is required, and so is a primary error message (errmsg
+ * or errmsg_internal).  All else is optional.	errcode() defaults to
+ * ERRCODE_INTERNAL_ERROR if elevel is ERROR or more, ERRCODE_WARNING
+ * if elevel is WARNING, or ERRCODE_SUCCESSFUL_COMPLETION if elevel is
+ * NOTICE or below.
+ *
+ * ereport_domain() allows a message domain to be specified, for modules that
+ * wish to use a different message catalog from the backend's.	To avoid having
+ * one copy of the default text domain per .o file, we define it as NULL here
+ * and have errstart insert the default text domain.  Modules can either use
+ * ereport_domain() directly, or preferably they can override the TEXTDOMAIN
+ * macro.
+ *----------
+ */
+#define TEXTDOMAIN "GTM"
+
+#define ereport_domain(elevel, domain, rest)	\
+	(errstart(elevel, __FILE__, __LINE__, PG_FUNCNAME_MACRO, domain) ? \
+	 (errfinish rest) : (void) 0)
+
+#define ereport(level, rest)	\
+	ereport_domain(level, TEXTDOMAIN, rest)
+
+
+#define PG_RE_THROW()		pg_re_throw()
+
+extern bool errstart(int elevel, const char *filename, int lineno,
+		 const char *funcname, const char *domain);
+extern void errfinish(int dummy,...);
+
+extern int
+errmsg(const char *fmt,...)
+/* This extension allows gcc to check the format string for consistency with
+   the supplied arguments. */
+__attribute__((format(printf, 1, 2)));
+
+extern int
+errmsg_internal(const char *fmt,...)
+/* This extension allows gcc to check the format string for consistency with
+   the supplied arguments. */
+__attribute__((format(printf, 1, 2)));
+
+extern int
+errdetail(const char *fmt,...)
+/* This extension allows gcc to check the format string for consistency with
+   the supplied arguments. */
+__attribute__((format(printf, 1, 2)));
+
+extern int
+errdetail_log(const char *fmt,...)
+/* This extension allows gcc to check the format string for consistency with
+   the supplied arguments. */
+__attribute__((format(printf, 1, 2)));
+
+extern int
+errhint(const char *fmt,...)
+/* This extension allows gcc to check the format string for consistency with
+   the supplied arguments. */
+__attribute__((format(printf, 1, 2)));
+
+/*----------
+ * Old-style error reporting API: to be used in this way:
+ *		elog(ERROR, "portal \"%s\" not found", stmt->portalname);
+ *----------
+ */
+#define elog	elog_start(__FILE__, __LINE__, PG_FUNCNAME_MACRO), elog_finish
+
+extern void elog_start(const char *filename, int lineno, const char *funcname);
+extern void
+elog_finish(int elevel, const char *fmt,...)
+/* This extension allows gcc to check the format string for consistency with
+   the supplied arguments. */
+__attribute__((format(printf, 2, 3)));
+
+/*----------
+ * API for catching ereport(ERROR) exits.  Use these macros like so:
+ *
+ *		PG_TRY();
+ *		{
+ *			... code that might throw ereport(ERROR) ...
+ *		}
+ *		PG_CATCH();
+ *		{
+ *			... error recovery code ...
+ *		}
+ *		PG_END_TRY();
+ *
+ * (The braces are not actually necessary, but are recommended so that
+ * pg_indent will indent the construct nicely.)  The error recovery code
+ * can optionally do PG_RE_THROW() to propagate the same error outwards.
+ *
+ * Note: while the system will correctly propagate any new ereport(ERROR)
+ * occurring in the recovery section, there is a small limit on the number
+ * of levels this will work for.  It's best to keep the error recovery
+ * section simple enough that it can't generate any new errors, at least
+ * not before popping the error stack.
+ *
+ * Note: an ereport(FATAL) will not be caught by this construct; control will
+ * exit straight through proc_exit().  Therefore, do NOT put any cleanup
+ * of non-process-local resources into the error recovery section, at least
+ * not without taking thought for what will happen during ereport(FATAL).
+ * The PG_ENSURE_ERROR_CLEANUP macros provided by storage/ipc.h may be
+ * helpful in such cases.
+ *----------
+ */
+#define PG_TRY()  \
+	do { \
+		sigjmp_buf *save_exception_stack = PG_exception_stack; \
+		sigjmp_buf local_sigjmp_buf; \
+		if (sigsetjmp(local_sigjmp_buf, 0) == 0) \
+		{ \
+			PG_exception_stack = &local_sigjmp_buf
+
+#define PG_CATCH()	\
+		} \
+		else \
+		{ \
+			PG_exception_stack = save_exception_stack; \
+
+#define PG_END_TRY()  \
+		} \
+		PG_exception_stack = save_exception_stack; \
+	} while (0)
+
+int errfunction(const char *funcname);
+
+extern void EmitErrorReport(void *port);
+
+/* GUC-configurable parameters */
+
+typedef enum
+{
+	PGERROR_TERSE,				/* single-line error messages */
+	PGERROR_DEFAULT,			/* recommended style */
+	PGERROR_VERBOSE				/* all the facts, ma'am */
+} PGErrorVerbosity;
+
+/* Log destination bitmap */
+#define LOG_DESTINATION_STDERR	 1
+#define LOG_DESTINATION_SYSLOG	 2
+#define LOG_DESTINATION_EVENTLOG 4
+#define LOG_DESTINATION_CSVLOG	 8
+
+/* Other exported functions */
+extern void pg_re_throw(void);
+extern void DebugFileOpen(void);
+extern void FlushErrorState(void);
+
+
+/*
+ * Write errors to stderr (or by equal means when stderr is
+ * not available). Used before ereport/elog can be used
+ * safely (memory context, GUC load etc)
+ */
+extern void
+write_stderr(const char *fmt,...)
+/* This extension allows gcc to check the format string for consistency with
+   the supplied arguments. */
+__attribute__((format(printf, 1, 2)));
+
+#endif   /* ELOG_H */
diff --git a/src/include/gtm/gtm.h b/src/include/gtm/gtm.h
new file mode 100644
index 0000000000..37e23a7ffa
--- /dev/null
+++ b/src/include/gtm/gtm.h
@@ -0,0 +1,140 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm.h
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef _GTM_H
+#define _GTM_H
+
+#include <setjmp.h>
+
+#include "gtm/gtm_c.h"
+#include "gtm/palloc.h"
+#include "gtm/gtm_lock.h"
+#include "gtm/gtm_conn.h"
+#include "gtm/elog.h"
+#include "gtm/gtm_list.h"
+
+extern char *GTMLogFile;
+
+typedef enum GTM_ThreadStatus
+{
+	GTM_THREAD_STARTING,
+	GTM_THREAD_RUNNING,
+	GTM_THREAD_EXITING,
+	/* Must be the last */
+	GTM_THREAD_INVALID
+} GTM_ThreadStatus;
+
+struct GTM_ConnectionInfo;
+
+#define ERRORDATA_STACK_SIZE  5
+
+typedef struct GTM_ThreadInfo
+{
+	/*
+	 * Thread specific information such as connection(s) served by it
+	 */
+	GTM_ThreadID		thr_id;
+	uint32				thr_localid;
+	void * (* thr_startroutine)(void *);
+	
+	MemoryContext	thr_thread_context;
+	MemoryContext	thr_message_context;	
+	MemoryContext	thr_current_context;
+	MemoryContext	thr_error_context;
+	MemoryContext	thr_parent_context;
+
+	sigjmp_buf		*thr_sigjmp_buf;
+
+	ErrorData		thr_error_data[ERRORDATA_STACK_SIZE];
+	int				thr_error_stack_depth;
+	int				thr_error_recursion_depth;
+	int				thr_criticalsec_count;
+
+	GTM_ThreadStatus	thr_status;
+	GTM_ConnectionInfo	*thr_conn;
+
+	GTM_RWLock			thr_lock;
+	List				*thr_cached_txninfo;
+
+} GTM_ThreadInfo;
+
+typedef struct GTM_Threads
+{
+	uint32				gt_thread_count;
+	uint32				gt_array_size;
+	GTM_ThreadInfo		**gt_threads;
+	GTM_RWLock			gt_lock;
+} GTM_Threads;
+
+extern GTM_Threads *GTMThreads;
+
+int GTM_ThreadAdd(GTM_ThreadInfo *thrinfo);
+int GTM_ThreadRemove(GTM_ThreadInfo *thrinfo);
+int GTM_ThreadJoin(GTM_ThreadInfo *thrinfo);
+void GTM_ThreadExit(void);
+void ConnFree(Port *port);
+
+GTM_ThreadInfo *GTM_ThreadCreate(GTM_ConnectionInfo *conninfo,
+				  void *(* startroutine)(void *));
+GTM_ThreadInfo * GTM_GetThreadInfo(GTM_ThreadID thrid);
+
+/*
+ * pthread keys to get thread specific information
+ */
+extern pthread_key_t					threadinfo_key;
+extern MemoryContext					TopMostMemoryContext;
+
+#define SetMyThreadInfo(thrinfo)		pthread_setspecific(threadinfo_key, (thrinfo))
+#define GetMyThreadInfo					((GTM_ThreadInfo *)pthread_getspecific(threadinfo_key))
+
+#define TopMemoryContext		(GetMyThreadInfo->thr_thread_context)
+#define ThreadTopContext		(GetMyThreadInfo->thr_thread_context)
+#define MessageContext			(GetMyThreadInfo->thr_message_context)
+#define CurrentMemoryContext	(GetMyThreadInfo->thr_current_context)
+#define ErrorContext			(GetMyThreadInfo->thr_error_context)
+#define errordata				(GetMyThreadInfo->thr_error_data)
+#define recursion_depth			(GetMyThreadInfo->thr_error_recursion_depth)
+#define errordata_stack_depth	(GetMyThreadInfo->thr_error_stack_depth)
+#define CritSectionCount		(GetMyThreadInfo->thr_criticalsec_count)
+
+#define PG_exception_stack		(GetMyThreadInfo->thr_sigjmp_buf)
+#define MyConnection			(GetMyThreadInfo->thr_conn)
+#define MyPort					((GetMyThreadInfo->thr_conn != NULL) ?	\
+									GetMyThreadInfo->thr_conn->con_port :	\
+									NULL)
+#define MyThreadID				(GetMyThreadInfo->thr_id)
+
+#define GTM_CachedTransInfo				(GetMyThreadInfo->thr_cached_txninfo)
+#define GTM_HaveFreeCachedTransInfo()	(list_length(GTM_CachedTransInfo))
+
+#define GTM_MAX_CACHED_TRANSINFO		0
+#define GTM_HaveEnoughCachedTransInfo()	(list_length(GTM_CachedTransInfo) >= GTM_MAX_CACHED_TRANSINFO)
+
+#define START_CRIT_SECTION()  (CritSectionCount++)
+
+#define END_CRIT_SECTION() \
+	do { \
+		    Assert(CritSectionCount > 0); \
+		    CritSectionCount--; \
+	} while(0)
+
+
+#if 0
+
+/* Coordinator registration */
+int GTM_RegisterCoordinator(GTM_CoordInfo *cinfo);
+int GTM_UnregisterCoordinator(GTM_CoordinatorId cid);
+
+#endif
+
+#endif
diff --git a/src/include/gtm/gtm_c.h b/src/include/gtm/gtm_c.h
new file mode 100644
index 0000000000..1a04064b6d
--- /dev/null
+++ b/src/include/gtm/gtm_c.h
@@ -0,0 +1,101 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_c.h
+ *	  Fundamental C definitions for GTM: transaction id, sequence and
+ *	  snapshot types (adapted from PostgreSQL's c.h, which it includes).
+ *
+ *	  Note that the definitions here are not intended to be exposed to clients
+ *	  of the frontend interface libraries --- so we don't worry much about
+ *	  polluting the namespace with lots of stuff...
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/c.h,v 1.234 2009/01/01 17:23:55 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GTM_C_H
+#define GTM_C_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stddef.h>
+#include <stdarg.h>
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#include <sys/types.h>
+
+#include <errno.h>
+#include <pthread.h>
+#include "c.h"
+
+typedef uint32	GlobalTransactionId;		/* 32-bit global transaction ids */
+typedef uint32	PGXC_NodeId;
+typedef uint32	GTM_CoordinatorId;
+typedef int16	GTMProxy_ConnID;
+
+#define InvalidGTMProxyConnID	-1
+
+typedef pthread_t	GTM_ThreadID;
+
+/*
+ * A unique handle to identify transaction at the GTM. It could just be
+ * an index in an array or a pointer to the structure
+ *
+ * Note: If we get rid of BEGIN transaction at the GTM, we can use GXID
+ * as a handle because we would never have a transaction state at the
+ * GTM without assigned GXID.
+ */
+typedef int32	GTM_TransactionHandle; 
+
+#define InvalidTransactionHandle	-1
+
+typedef int64	GTM_Sequence;	/* a 64-bit sequence */
+typedef  struct GTM_SequenceKeyData
+{
+	uint32		gsk_keylen;
+	char		*gsk_key;
+} GTM_SequenceKeyData; 	/* Counter key, set by the client */
+
+typedef GTM_SequenceKeyData *GTM_SequenceKey;
+#define GTM_MAX_SEQKEY_LENGTH		1024
+
+#define InvalidSequenceValue				0x7fffffffffffffffLL
+#define SEQVAL_IS_VALID(v)					((v) != InvalidSequenceValue)
+
+#define GTM_MAX_GLOBAL_TRANSACTIONS	4096
+
+typedef enum GTM_IsolationLevel
+{
+	GTM_ISOLATION_SERIALIZABLE, /* serializable txn */
+	GTM_ISOLATION_RC		/* read-committed txn */
+} GTM_IsolationLevel;
+
+typedef struct GTM_SnapshotData
+{
+	GlobalTransactionId		sn_xmin;
+	GlobalTransactionId		sn_xmax;
+	GlobalTransactionId		sn_recent_global_xmin;
+	uint32					sn_xcnt;
+	GlobalTransactionId		*sn_xip;
+} GTM_SnapshotData;
+
+typedef GTM_SnapshotData *GTM_Snapshot;
+
+typedef struct GTM_StartupPacket {
+	GTM_CoordinatorId	sp_cid;
+	bool				sp_isproxy;
+} GTM_StartupPacket;
+
+#define InvalidGlobalTransactionId		((GlobalTransactionId) 0)
+/* True iff gxid is a valid id; parenthesized so it is safe in any expression */
+#define GlobalTransactionIdIsValid(gxid) (((GlobalTransactionId) (gxid)) != InvalidGlobalTransactionId)
+
+#define _(x) gettext(x)
+
+#endif   /* GTM_C_H */
diff --git a/src/include/gtm/gtm_client.h b/src/include/gtm/gtm_client.h
new file mode 100644
index 0000000000..29eeaf95f9
--- /dev/null
+++ b/src/include/gtm/gtm_client.h
@@ -0,0 +1,129 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_client.h
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GTM_CLIENT_H
+#define GTM_CLIENT_H
+
+#include "gtm/gtm_c.h"
+#include "gtm/gtm_msg.h"
+#include "gtm/libpq-fe.h"
+
+typedef union GTM_ResultData
+{
+	GTM_TransactionHandle	grd_txnhandle;	/* TXN_BEGIN */
+	GlobalTransactionId		grd_gxid;		/* TXN_BEGIN_GETGXID
+											 * TXN_PREPARE
+											 * TXN_COMMIT
+											 * TXN_ROLLBACK
+											 */
+
+	struct
+	{
+		GTM_TransactionHandle	txnhandle;
+		GlobalTransactionId		gxid;
+	} grd_txn;								/* TXN_GET_GXID
+											 * SNAPSHOT_GET
+											 * SNAPSHOT_GXID_GET */
+
+	GTM_SequenceKeyData		grd_seqkey;		/* SEQUENCE_INIT
+											 * SEQUENCE_RESET
+											 * SEQUENCE_CLOSE */
+	struct
+	{
+		GTM_SequenceKeyData	seqkey;
+		GTM_Sequence		seqval;
+	} grd_seq;								/* SEQUENCE_GET_CURRENT
+											 * SEQUENCE_GET_NEXT */
+
+	struct
+	{
+		int						txn_count; /* TXN_BEGIN_GETGXID_MULTI */
+		GlobalTransactionId		start_gxid;
+	} grd_txn_get_multi;
+
+	struct
+	{
+		int			txn_count;								/* TXN_COMMIT_MULTI; NOTE(review): presumably TXN_ROLLBACK_MULTI too */
+		int			status[GTM_MAX_GLOBAL_TRANSACTIONS];	/* fixed-size array, so the union is large */
+	} grd_txn_rc_multi;
+
+	struct
+	{
+		int			txn_count;								/* SNAPSHOT_GET_MULTI */
+		int			status[GTM_MAX_GLOBAL_TRANSACTIONS];
+	} grd_txn_snap_multi;
+
+	/*
+	 * TODO
+	 * 	TXN_GET_STATUS
+	 * 	TXN_GET_ALL_PREPARED
+	 */
+} GTM_ResultData;
+
+typedef struct GTM_Result
+{
+	GTM_ResultType		gr_type;		/* presumably selects which gr_resdata member is valid -- confirm */
+	int					gr_msglen;
+	int					gr_status;
+	GTM_ProxyMsgHeader	gr_proxyhdr;	/* holds ph_conid, the proxy routing identifier */
+	GTM_ResultData		gr_resdata;
+	/*
+	 * We keep these two items outside the union to avoid repeated malloc/free
+	 * of the xip array. If these items are pushed inside the union, they may
+	 * get overwritten by other members in the union.
+	 */
+	int					gr_xip_size;	/* presumably the allocated capacity of gr_snapshot.sn_xip -- TODO confirm */
+	GTM_SnapshotData	gr_snapshot;
+
+	/*
+	 * Similarly, keep the buffer for proxying data outside the union.
+	 */
+	char				*gr_proxy_data;
+	int					gr_proxy_datalen;
+} GTM_Result;
+
+/*
+ * Connection Management API
+ */
+GTM_Conn *connect_gtm(const char *connect_string);
+void disconnect_gtm(GTM_Conn *conn);
+
+/*
+ * Transaction Management API
+ */
+GlobalTransactionId begin_transaction(GTM_Conn *conn, GTM_IsolationLevel isolevel);
+GlobalTransactionId begin_transaction_autovacuum(GTM_Conn *conn, GTM_IsolationLevel isolevel);
+int commit_transaction(GTM_Conn *conn, GlobalTransactionId gxid);
+int abort_transaction(GTM_Conn *conn, GlobalTransactionId gxid);
+int prepare_transaction(GTM_Conn *conn, GlobalTransactionId gxid,
+						int nodecnt, PGXC_NodeId nodes[]);
+
+/*
+ * Snapshot Management API
+ */
+GTM_SnapshotData *get_snapshot(GTM_Conn *conn, GlobalTransactionId gxid,
+		bool canbe_grouped);
+
+/*
+ * Sequence Management API
+ */
+int open_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence increment,
+				  GTM_Sequence minval, GTM_Sequence maxval,
+				  GTM_Sequence startval, bool cycle);
+int close_sequence(GTM_Conn *conn, GTM_SequenceKey key);
+GTM_Sequence get_current(GTM_Conn *conn, GTM_SequenceKey key);
+GTM_Sequence get_next(GTM_Conn *conn, GTM_SequenceKey key);
+int reset_sequence(GTM_Conn *conn, GTM_SequenceKey key);
+
+
+#endif
diff --git a/src/include/gtm/gtm_conn.h b/src/include/gtm/gtm_conn.h
new file mode 100644
index 0000000000..911a345c4f
--- /dev/null
+++ b/src/include/gtm/gtm_conn.h
@@ -0,0 +1,38 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_conn.h
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GTM_CONN_H
+#define GTM_CONN_H
+
+#include "gtm/libpq-be.h"
+
+struct GTM_ThreadInfo;	/* forward declaration: only pointers to it are used below */
+
+typedef struct GTM_ConnectionInfo
+{
+	/* Port contains all the vital information about this connection */
+	Port					*con_port;
+	struct GTM_ThreadInfo	*con_thrinfo;
+	bool					con_authenticated;
+} GTM_ConnectionInfo;
+
+typedef struct GTM_Connections
+{
+	uint32				gc_conn_count;
+	uint32				gc_array_size;
+	GTM_ConnectionInfo	*gc_connections;
+	GTM_RWLock			gc_lock;		/* NOTE(review): GTM_RWLock is from gtm/gtm_lock.h, which this header does not include itself */
+} GTM_Connections;
+
+
+#endif
diff --git a/src/include/gtm/gtm_ext.h b/src/include/gtm/gtm_ext.h
new file mode 100644
index 0000000000..b492941779
--- /dev/null
+++ b/src/include/gtm/gtm_ext.h
@@ -0,0 +1,31 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_ext.h
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GTM_EXT_H
+#define GTM_EXT_H
+
+/*
+ * Identifiers of error message fields.  Kept here so that they are
+ * shared between frontend and backend, and also exported to libpq
+ * applications.
+ */
+#define PG_DIAG_SEVERITY		'S'
+#define PG_DIAG_MESSAGE_PRIMARY 'M'
+#define PG_DIAG_MESSAGE_DETAIL	'D'
+#define PG_DIAG_MESSAGE_HINT	'H'
+#define PG_DIAG_SOURCE_FILE		'F'
+#define PG_DIAG_SOURCE_LINE		'L'
+#define PG_DIAG_SOURCE_FUNCTION 'R'
+
+
+#endif
diff --git a/src/include/gtm/gtm_ip.h b/src/include/gtm/gtm_ip.h
new file mode 100644
index 0000000000..30da3081d3
--- /dev/null
+++ b/src/include/gtm/gtm_ip.h
@@ -0,0 +1,50 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_ip.h
+ *	  Definitions for IPv6-aware network access.
+ *
+ * These definitions are used by both frontend and backend code.  Be careful
+ * what you include here!
+ *
+ * Copyright (c) 2003-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/libpq/ip.h,v 1.20 2008/01/01 19:45:58 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GTM_IP_H
+#define GTM_IP_H
+/* GTM-specific guard: libpq/ip.h, from which this header was derived, already uses IP_H. */
+#include "gtm/pqcomm.h"
+
+
+extern int gtm_getaddrinfo_all(const char *hostname, const char *servname,
+				   const struct addrinfo * hintp,
+				   struct addrinfo ** result);
+extern void gtm_freeaddrinfo_all(int hint_ai_family, struct addrinfo * ai);
+
+extern int gtm_getnameinfo_all(const struct sockaddr_storage * addr, int salen,
+				   char *node, int nodelen,
+				   char *service, int servicelen,
+				   int flags);
+
+extern int gtm_range_sockaddr(const struct sockaddr_storage * addr,
+				  const struct sockaddr_storage * netaddr,
+				  const struct sockaddr_storage * netmask);
+
+extern int gtm_sockaddr_cidr_mask(struct sockaddr_storage * mask,
+					  char *numbits, int family);
+
+#ifdef HAVE_IPV6
+extern void gtm_promote_v4_to_v6_addr(struct sockaddr_storage * addr);
+extern void gtm_promote_v4_to_v6_mask(struct sockaddr_storage * addr);
+#endif
+
+#ifdef	HAVE_UNIX_SOCKETS
+#define IS_AF_UNIX(fam) ((fam) == AF_UNIX)
+#else
+#define IS_AF_UNIX(fam) (0)
+#endif
+
+#endif   /* GTM_IP_H */
diff --git a/src/include/gtm/gtm_list.h b/src/include/gtm/gtm_list.h
new file mode 100644
index 0000000000..6a5727f36a
--- /dev/null
+++ b/src/include/gtm/gtm_list.h
@@ -0,0 +1,280 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_list.h
+ *	  interface for PostgreSQL generic linked list package
+ *
+ * This package implements singly-linked homogeneous lists.
+ *
+ * It is important to have constant-time length, append, and prepend
+ * operations. To achieve this, we deal with two distinct data
+ * structures:
+ *
+ *		1. A set of "list cells": each cell contains a data field and
+ *		   a link to the next cell in the list or NULL.
+ *		2. A single structure containing metadata about the list:
+ *		   pointers to the head and tail cells, and the length of
+ *		   the list.
+ *
+ * We support two kinds of lists:
+ *
+ *	lists of pointers
+ *		(declared as "void *" to minimize casting annoyances)
+ *	lists of integers
+ *
+ * Unlike PostgreSQL's pg_list.h, from which this file is derived, the
+ * List header here carries no type tag and there is no Oid list
+ * variant, so callers must keep track of which kind of list they
+ * are holding.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/nodes/pg_list.h,v 1.59 2008/08/14 18:48:00 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GTM_LIST_H
+#define GTM_LIST_H
+
+
+typedef struct ListCell ListCell;
+
+typedef struct List
+{
+	int			length;			/* number of cells; list_length() reports 0 for NIL */
+	ListCell   *head;			/* first cell, returned by list_head() */
+	ListCell   *tail;			/* last cell, returned by list_tail() */
+} List;
+
+struct ListCell
+{
+	union
+	{
+		void	   *ptr_value;	/* read through lfirst() */
+		int			int_value;	/* read through lfirst_int() */
+	}			data;
+	ListCell   *next;			/* following cell, read through lnext() */
+};
+
+/*
+ * The *only* valid representation of an empty list is NIL; in other
+ * words, a non-NIL list is guaranteed to have length >= 1 and
+ * head/tail != NULL
+ */
+#define NIL						((List *) NULL)
+
+/*
+ * These routines are used frequently. However, we can't implement
+ * them as macros, since we want to avoid double-evaluation of macro
+ * arguments. Therefore, we implement them using GCC inline functions,
+ * and as regular functions with non-GCC compilers.
+ */
+#ifdef __GNUC__
+
+static __inline__ ListCell *
+list_head(List *l)
+{
+	return (l == NIL) ? NULL : l->head;	/* NIL has no head cell */
+}
+
+static __inline__ ListCell *
+list_tail(List *l)
+{
+	return (l == NIL) ? NULL : l->tail;	/* NIL has no tail cell */
+}
+
+static __inline__ int
+list_length(List *l)
+{
+	return (l == NIL) ? 0 : l->length;	/* NIL is the empty list */
+}
+#else
+
+extern ListCell *list_head(List *l);
+extern ListCell *list_tail(List *l);
+extern int	list_length(List *l);
+#endif   /* __GNUC__ */
+
+/*
+ * NB: There is an unfortunate legacy from a previous incarnation of
+ * the List API: the macro lfirst() was used to mean "the data in this
+ * cons cell". To avoid changing every usage of lfirst(), that meaning
+ * has been kept. As a result, lfirst() takes a ListCell and returns
+ * the data it contains; to get the data in the first cell of a
+ * List, use linitial(). Worse, lsecond() is more closely related to
+ * linitial() than lfirst(): given a List, lsecond() returns the data
+ * in the second cons cell.
+ */
+
+#define lnext(lc)				((lc)->next)
+#define lfirst(lc)				((lc)->data.ptr_value)
+#define lfirst_int(lc)			((lc)->data.int_value)
+/* The list-level accessors below dereference cells unchecked: NIL or a too-short list is a NULL dereference. */
+#define linitial(l)				lfirst(list_head(l))
+#define linitial_int(l)			lfirst_int(list_head(l))
+
+#define lsecond(l)				lfirst(lnext(list_head(l)))
+#define lsecond_int(l)			lfirst_int(lnext(list_head(l)))
+
+#define lthird(l)				lfirst(lnext(lnext(list_head(l))))
+#define lthird_int(l)			lfirst_int(lnext(lnext(list_head(l))))
+
+#define lfourth(l)				lfirst(lnext(lnext(lnext(list_head(l)))))
+#define lfourth_int(l)			lfirst_int(lnext(lnext(lnext(list_head(l)))))
+
+#define llast(l)				lfirst(list_tail(l))
+#define llast_int(l)			lfirst_int(list_tail(l))
+
+/*
+ * Convenience macros for building fixed-length lists
+ */
+#define list_make1(x1)				lcons(x1, NIL)
+#define list_make2(x1,x2)			lcons(x1, list_make1(x2))
+#define list_make3(x1,x2,x3)		lcons(x1, list_make2(x2, x3))
+#define list_make4(x1,x2,x3,x4)		lcons(x1, list_make3(x2, x3, x4))
+
+#define list_make1_int(x1)			lcons_int(x1, NIL)
+#define list_make2_int(x1,x2)		lcons_int(x1, list_make1_int(x2))
+#define list_make3_int(x1,x2,x3)	lcons_int(x1, list_make2_int(x2, x3))
+#define list_make4_int(x1,x2,x3,x4) lcons_int(x1, list_make3_int(x2, x3, x4))
+
+/*
+ * foreach -
+ *	  a convenience macro which loops through the list
+ */
+#define foreach(cell, l)	\
+	for ((cell) = list_head(l); (cell) != NULL; (cell) = lnext(cell))
+
+/*
+ * for_each_cell -
+ *	  a convenience macro which loops through a list starting from a
+ *	  specified cell
+ */
+#define for_each_cell(cell, initcell)	\
+	for ((cell) = (initcell); (cell) != NULL; (cell) = lnext(cell))
+
+/*
+ * forboth -
+ *	  a convenience macro for advancing through two linked lists
+ *	  simultaneously. This macro loops through both lists at the same
+ *	  time, stopping when either list runs out of elements. Depending
+ *	  on the requirements of the call site, it may also be wise to
+ *	  assert that the lengths of the two lists are equal.
+ */
+#define forboth(cell1, list1, cell2, list2)							\
+	for ((cell1) = list_head(list1), (cell2) = list_head(list2);	\
+		 (cell1) != NULL && (cell2) != NULL;						\
+		 (cell1) = lnext(cell1), (cell2) = lnext(cell2))
+
+extern List *lappend(List *list, void *datum);
+extern List *lappend_int(List *list, int datum);
+
+extern ListCell *lappend_cell(List *list, ListCell *prev, void *datum);
+extern ListCell *lappend_cell_int(List *list, ListCell *prev, int datum);
+
+extern List *lcons(void *datum, List *list);
+extern List *lcons_int(int datum, List *list);
+
+extern List *list_concat(List *list1, List *list2);
+extern List *list_truncate(List *list, int new_size);
+
+extern void *list_nth(List *list, int n);
+extern int	list_nth_int(List *list, int n);
+
+extern bool list_member(List *list, void *datum);
+extern bool list_member_ptr(List *list, void *datum);
+extern bool list_member_int(List *list, int datum);
+
+extern List *list_delete(List *list, void *datum);
+extern List *list_delete_ptr(List *list, void *datum);
+extern List *list_delete_int(List *list, int datum);
+extern List *list_delete_first(List *list);
+extern List *list_delete_cell(List *list, ListCell *cell, ListCell *prev);
+
+extern List *list_union(List *list1, List *list2);
+extern List *list_union_ptr(List *list1, List *list2);
+extern List *list_union_int(List *list1, List *list2);
+
+extern List *list_intersection(List *list1, List *list2);
+/* currently, there's no need for list_intersection_int etc */
+
+extern List *list_difference(List *list1, List *list2);
+extern List *list_difference_ptr(List *list1, List *list2);
+extern List *list_difference_int(List *list1, List *list2);
+
+extern List *list_append_unique(List *list, void *datum);
+extern List *list_append_unique_ptr(List *list, void *datum);
+extern List *list_append_unique_int(List *list, int datum);
+
+extern List *list_concat_unique(List *list1, List *list2);
+extern List *list_concat_unique_ptr(List *list1, List *list2);
+extern List *list_concat_unique_int(List *list1, List *list2);
+
+extern void list_free(List *list);
+extern void list_free_deep(List *list);
+
+extern List *list_copy(List *list);
+extern List *list_copy_tail(List *list, int nskip);
+
+/*
+ * To ease migration to the new list API, a set of compatibility
+ * macros are provided that reduce the impact of the list API changes
+ * as far as possible. Until client code has been rewritten to use the
+ * new list API, the ENABLE_LIST_COMPAT symbol can be defined before
+ * including this header (gtm_list.h, derived from pg_list.h).
+ */
+#ifdef ENABLE_LIST_COMPAT
+
+#define lfirsti(lc)					lfirst_int(lc)
+
+#define makeList1(x1)				list_make1(x1)
+#define makeList2(x1, x2)			list_make2(x1, x2)
+#define makeList3(x1, x2, x3)		list_make3(x1, x2, x3)
+#define makeList4(x1, x2, x3, x4)	list_make4(x1, x2, x3, x4)
+
+#define makeListi1(x1)				list_make1_int(x1)
+#define makeListi2(x1, x2)			list_make2_int(x1, x2)
+
+#define lconsi(datum, list)			lcons_int(datum, list)
+
+#define lappendi(list, datum)		lappend_int(list, datum)
+
+#define nconc(l1, l2)				list_concat(l1, l2)
+
+#define nth(n, list)				list_nth(list, n)
+
+#define member(datum, list)			list_member(list, datum)
+#define ptrMember(datum, list)		list_member_ptr(list, datum)
+#define intMember(datum, list)		list_member_int(list, datum)
+
+/*
+ * Note that the old lremove() determined equality via pointer
+ * comparison, whereas the new list_delete() uses equal(); in order to
+ * keep the same behavior, we therefore need to map lremove() calls to
+ * list_delete_ptr() rather than list_delete()
+ */
+#define lremove(elem, list)			list_delete_ptr(list, elem)
+#define LispRemove(elem, list)		list_delete(list, elem)
+#define lremovei(elem, list)		list_delete_int(list, elem)
+
+#define ltruncate(n, list)			list_truncate(list, n)
+
+#define set_union(l1, l2)			list_union(l1, l2)
+#define set_ptrUnion(l1, l2)		list_union_ptr(l1, l2)
+
+#define set_difference(l1, l2)		list_difference(l1, l2)
+#define set_ptrDifference(l1, l2)	list_difference_ptr(l1, l2)
+
+#define equali(l1, l2)				equal(l1, l2)
+#define equalo(l1, l2)				equal(l1, l2)
+
+#define freeList(list)				list_free(list)
+
+#define listCopy(list)				list_copy(list)
+
+extern int	length(List *list);
+#endif   /* ENABLE_LIST_COMPAT */
+
+#endif   /* GTM_LIST_H */
diff --git a/src/include/gtm/gtm_lock.h b/src/include/gtm/gtm_lock.h
new file mode 100644
index 0000000000..f4a5e025ba
--- /dev/null
+++ b/src/include/gtm/gtm_lock.h
@@ -0,0 +1,59 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_lock.h
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef GTM_LOCK_H
+#define GTM_LOCK_H
+
+#include <pthread.h>
+
+typedef struct GTM_RWLock
+{
+	pthread_rwlock_t lk_lock;	/* wrapped POSIX read-write lock */
+} GTM_RWLock;
+
+typedef struct GTM_MutexLock
+{
+	pthread_mutex_t lk_lock;	/* wrapped POSIX mutex */
+} GTM_MutexLock;
+
+typedef enum GTM_LockMode
+{
+	GTM_LOCKMODE_WRITE,
+	GTM_LOCKMODE_READ
+} GTM_LockMode;					/* mode argument of the GTM_RWLock acquire calls */
+
+typedef struct GTM_CV
+{
+	pthread_cond_t	cv_condvar;	/* wrapped POSIX condition variable */
+} GTM_CV;
+
+extern bool GTM_RWLockAcquire(GTM_RWLock *lock, GTM_LockMode mode);
+extern bool GTM_RWLockRelease(GTM_RWLock *lock);
+extern int GTM_RWLockInit(GTM_RWLock *lock);
+extern int GTM_RWLockDestroy(GTM_RWLock *lock);
+extern bool GTM_RWLockConditionalAcquire(GTM_RWLock *lock, GTM_LockMode mode);
+
+extern bool GTM_MutexLockAcquire(GTM_MutexLock *lock);
+extern bool GTM_MutexLockRelease(GTM_MutexLock *lock);
+extern int GTM_MutexLockInit(GTM_MutexLock *lock);
+extern int GTM_MutexLockDestroy(GTM_MutexLock *lock);
+extern bool GTM_MutexLockConditionalAcquire(GTM_MutexLock *lock);
+
+extern int GTM_CVInit(GTM_CV *cv);
+extern int GTM_CVDestroy(GTM_CV *cv);
+extern int GTM_CVSignal(GTM_CV *cv);
+extern int GTM_CVBcast(GTM_CV *cv);
+extern int GTM_CVWait(GTM_CV *cv, GTM_MutexLock *lock);
+
+#endif
diff --git a/src/include/gtm/gtm_msg.h b/src/include/gtm/gtm_msg.h
new file mode 100644
index 0000000000..cae061437d
--- /dev/null
+++ b/src/include/gtm/gtm_msg.h
@@ -0,0 +1,88 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_msg.h
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GTM_MSG_H
+#define GTM_MSG_H
+
+typedef enum GTM_MessageType
+{
+	MSG_TYPE_INVALID,
+	MSG_REGISTER_COORD,		/* Register a Coordinator with GTM */
+	MSG_UNREGISTER_COORD,	/* Unregister a Coordinator with GTM */
+	MSG_TXN_BEGIN,			/* Start a new transaction */
+	MSG_TXN_BEGIN_GETGXID,	/* Start a new transaction and get GXID */
+	MSG_TXN_BEGIN_GETGXID_MULTI,	/* Start multiple new transactions and get GXIDs */
+	MSG_TXN_PREPARE,		/* Prepare a transaction for commit */
+	MSG_TXN_COMMIT,			/* Commit a running or prepared transaction */
+	MSG_TXN_COMMIT_MULTI,	/* Commit multiple running or prepared transactions */
+	MSG_TXN_ROLLBACK,		/* Rollback a transaction */
+	MSG_TXN_ROLLBACK_MULTI,	/* Rollback multiple transactions */
+	MSG_TXN_GET_GXID,		/* Get a GXID for a transaction */
+	MSG_SNAPSHOT_GET,		/* Get a global snapshot */
+	MSG_SNAPSHOT_GET_MULTI,	/* Get multiple global snapshots */
+	MSG_SNAPSHOT_GXID_GET,	/* Get GXID and snapshot together */
+	MSG_SEQUENCE_INIT,		/* Initialize a new global sequence */
+	MSG_SEQUENCE_GET_CURRENT,/* Get the current value of sequence */
+	MSG_SEQUENCE_GET_NEXT,	/* Get the next sequence value of sequence */
+	MSG_SEQUENCE_RESET,		/* Reset the sequence */
+	MSG_SEQUENCE_CLOSE,		/* Close a previously inited sequence */
+	MSG_TXN_GET_STATUS,		/* Get status of a given transaction */
+	MSG_TXN_GET_ALL_PREPARED,	/* Get information about all outstanding
+								 * prepared transactions */
+	MSG_TXN_BEGIN_GETGXID_AUTOVACUUM,	/* Start a new transaction and get GXID for autovacuum */
+	MSG_DATA_FLUSH,					/* flush pending data */
+	MSG_BACKEND_DISCONNECT,			/* tell GTM that the backend disconnected from the proxy */
+
+	/*
+	 * Must be at the end
+	 */
+	MSG_TYPE_COUNT			/* A dummy entry just to count the message types */
+} GTM_MessageType;
+
+typedef enum GTM_ResultType
+{
+	TXN_BEGIN_RESULT,
+	TXN_BEGIN_GETGXID_RESULT,
+	TXN_BEGIN_GETGXID_MULTI_RESULT,
+	TXN_PREPARE_RESULT,
+	TXN_COMMIT_RESULT,
+	TXN_COMMIT_MULTI_RESULT,
+	TXN_ROLLBACK_RESULT,
+	TXN_ROLLBACK_MULTI_RESULT,
+	TXN_GET_GXID_RESULT,
+	SNAPSHOT_GET_RESULT,
+	SNAPSHOT_GET_MULTI_RESULT,
+	SNAPSHOT_GXID_GET_RESULT,
+	SEQUENCE_INIT_RESULT,
+	SEQUENCE_GET_CURRENT_RESULT,
+	SEQUENCE_GET_NEXT_RESULT,
+	SEQUENCE_RESET_RESULT,
+	SEQUENCE_CLOSE_RESULT,
+	TXN_GET_STATUS_RESULT,
+	TXN_GET_ALL_PREPARED_RESULT,
+	TXN_BEGIN_GETGXID_AUTOVACUUM_RESULT		/* last entry: no trailing comma, which C89 does not allow */
+} GTM_ResultType;
+
+/*
+ * Special message header for the messages exchanged between the GTM server
+ * and the proxy.
+ *
+ * ph_conid: connection identifier which is used to route
+ * the messages to the right backend.
+ */
+typedef struct GTM_ProxyMsgHeader
+{
+	GTMProxy_ConnID	ph_conid;
+} GTM_ProxyMsgHeader;
+
+#endif
diff --git a/src/include/gtm/gtm_proxy.h b/src/include/gtm/gtm_proxy.h
new file mode 100644
index 0000000000..8dc16bca0e
--- /dev/null
+++ b/src/include/gtm/gtm_proxy.h
@@ -0,0 +1,221 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_proxy.h
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef _GTM_PROXY_H
+#define _GTM_PROXY_H
+
+#include <setjmp.h>
+#include <poll.h>
+
+#include "gtm/gtm_c.h"
+#include "gtm/palloc.h"
+#include "gtm/gtm_lock.h"
+#include "gtm/gtm_conn.h"
+#include "gtm/elog.h"
+#include "gtm/gtm_list.h"
+#include "gtm/gtm_msg.h"
+#include "gtm/libpq-fe.h"
+
+extern char *GTMProxyLogFile;
+
+typedef enum GTMProxy_ThreadStatus
+{
+	GTM_PROXY_THREAD_STARTING,
+	GTM_PROXY_THREAD_RUNNING,
+	GTM_PROXY_THREAD_EXITING,
+	/* Must be the last */
+	GTM_PROXY_THREAD_INVALID
+} GTMProxy_ThreadStatus;
+
+typedef struct GTMProxy_ConnectionInfo
+{
+	/* Port contains all the vital information about this connection */
+	Port						*con_port;
+	struct GTMProxy_ThreadInfo	*con_thrinfo;
+	bool						con_authenticated;
+	bool						con_disconnected;
+	GTMProxy_ConnID				con_id;
+
+	GTM_MessageType				con_pending_msg;
+	GlobalTransactionId 		con_txid;
+	GTM_TransactionHandle		con_handle;
+} GTMProxy_ConnectionInfo;
+
+typedef struct GTMProxy_Connections
+{
+	uint32					gc_conn_count;
+	uint32					gc_array_size;
+	GTMProxy_ConnectionInfo	*gc_connections;
+	GTM_RWLock				gc_lock;
+} GTMProxy_Connections;
+
+#define ERRORDATA_STACK_SIZE  5
+#define GTM_PROXY_MAX_CONNECTIONS	1024
+
+typedef struct GTMProxy_ThreadInfo
+{
+	/*
+	 * Thread specific information such as connection(s) served by it
+	 */
+	GTM_ThreadID			thr_id;			/* read through MyThreadID */
+	uint32					thr_localid;
+	void * (* thr_startroutine)(void *);
+
+	MemoryContext	thr_thread_context;		/* TopMemoryContext / ThreadTopContext */
+	MemoryContext	thr_message_context;	/* MessageContext */
+	MemoryContext	thr_current_context;	/* CurrentMemoryContext */
+	MemoryContext	thr_error_context;		/* ErrorContext */
+	MemoryContext	thr_parent_context;
+
+	sigjmp_buf		*thr_sigjmp_buf;		/* PG_exception_stack */
+
+	ErrorData		thr_error_data[ERRORDATA_STACK_SIZE];	/* errordata */
+	int				thr_error_stack_depth;		/* errordata_stack_depth */
+	int				thr_error_recursion_depth;	/* recursion_depth */
+	int				thr_criticalsec_count;		/* CritSectionCount */
+
+	GTMProxy_ThreadStatus	thr_status;
+	GTMProxy_ConnectionInfo	*thr_conn;	/* Current active connection (MyConnection) */
+
+	/*
+	 * The structure member type/sequence up to this point must match the
+	 * GTM_ThreadInfo structure in gtm.h since they are shared in some common
+	 * library routines such as elog.c. Keeping them in sync helps us use the
+	 * same library for the proxy as well as the server.
+	 */
+	GTM_MutexLock			thr_lock;
+	GTM_CV					thr_cv;
+
+	/*
+	 * We use a sequence number to track the state of connection/fd array.
+	 * Whenever a new connection is added or an existing connection is deleted
+	 * from the connection array, the sequence number is incremented. The
+	 * thread main routine can then reconstruct the fd array again.
+	 */
+	int32					thr_seqno;
+
+	/* number of connections served by this thread */
+	uint32					thr_conn_count;
+
+	/* connection array */
+	GTMProxy_ConnectionInfo	*thr_all_conns[GTM_PROXY_MAX_CONNECTIONS];
+	struct pollfd			thr_poll_fds[GTM_PROXY_MAX_CONNECTIONS];
+	List 					*thr_processed_commands;
+	List 					*thr_pending_commands[MSG_TYPE_COUNT];	/* one list per GTM_MessageType value */
+
+	GTM_Conn				*thr_gtm_conn;
+
+} GTMProxy_ThreadInfo;
+
+typedef struct GTMProxy_Threads
+{
+	uint32					gt_thread_count;
+	uint32					gt_array_size;
+	uint32					gt_next_worker;
+	GTMProxy_ThreadInfo		**gt_threads;
+	GTM_RWLock				gt_lock;
+} GTMProxy_Threads;
+
+extern GTMProxy_Threads *GTMProxyThreads;
+
+int GTMProxy_ThreadAdd(GTMProxy_ThreadInfo *thrinfo);
+int GTMProxy_ThreadRemove(GTMProxy_ThreadInfo *thrinfo);
+int GTMProxy_ThreadJoin(GTMProxy_ThreadInfo *thrinfo);
+void GTMProxy_ThreadExit(void);
+
+extern GTMProxy_ThreadInfo *GTMProxy_ThreadCreate(void *(* startroutine)(void *));
+extern GTMProxy_ThreadInfo * GTMProxy_GetThreadInfo(GTM_ThreadID thrid);
+extern GTMProxy_ThreadInfo *GTMProxy_ThreadAddConnection(GTMProxy_ConnectionInfo *conninfo);
+extern int GTMProxy_ThreadRemoveConnection(GTMProxy_ThreadInfo *thrinfo,
+		GTMProxy_ConnectionInfo *conninfo);
+
+/*
+ * Command data - the only relevant information right now is the XID
+ */
+typedef union GTMProxy_CommandData
+{
+	struct
+	{
+		bool					rdonly;
+		GTM_IsolationLevel		iso_level;
+	} cd_beg;
+
+	struct
+	{
+		bool					isgxid;
+		GlobalTransactionId		gxid;
+		GTM_TransactionHandle	handle;
+	} cd_rc;
+
+	struct
+	{
+		bool					isgxid;
+		GlobalTransactionId		gxid;
+		GTM_TransactionHandle	handle;
+	} cd_snap;
+} GTMProxy_CommandData;
+
+/*
+ * Structures to be used for message proxying. There will be one such entry for
+ * each pending command from a backend. To keep it simple, we have a separate
+ * entry even if the commands are grouped together.
+ *
+ * An array of these entries is maintained which is sorted by the order in
+ * which the commands are sent to the GTM server. We expect the GTM server to
+ * respond back in the same order and the sorted array helps us in
+ * matching/confirming the responses.
+ */
+typedef struct GTMProxy_CommandInfo
+{
+	GTM_MessageType			ci_mtype;
+	int						ci_res_index;	/* NOTE(review): looks like this command's slot in a grouped response -- confirm */
+	GTMProxy_CommandData	ci_data;
+	GTMProxy_ConnectionInfo	*ci_conn;		/* presumably the connection the command arrived on -- verify */
+} GTMProxy_CommandInfo;
+
+/*
+ * pthread keys to get thread specific information
+ */
+extern pthread_key_t					threadinfo_key;
+extern MemoryContext					TopMostMemoryContext;
+extern char								*GTMLogFile;
+
+#define SetMyThreadInfo(thrinfo)		pthread_setspecific(threadinfo_key, (thrinfo))
+#define GetMyThreadInfo					((GTMProxy_ThreadInfo *)pthread_getspecific(threadinfo_key))
+/* Every macro below dereferences GetMyThreadInfo, so SetMyThreadInfo() must already have run on the calling thread. */
+#define TopMemoryContext		(GetMyThreadInfo->thr_thread_context)
+#define ThreadTopContext		(GetMyThreadInfo->thr_thread_context)
+#define MessageContext			(GetMyThreadInfo->thr_message_context)
+#define CurrentMemoryContext	(GetMyThreadInfo->thr_current_context)
+#define ErrorContext			(GetMyThreadInfo->thr_error_context)
+#define errordata				(GetMyThreadInfo->thr_error_data)
+#define recursion_depth			(GetMyThreadInfo->thr_error_recursion_depth)
+#define errordata_stack_depth	(GetMyThreadInfo->thr_error_stack_depth)
+#define CritSectionCount		(GetMyThreadInfo->thr_criticalsec_count)
+
+#define PG_exception_stack		(GetMyThreadInfo->thr_sigjmp_buf)
+#define MyConnection			(GetMyThreadInfo->thr_conn)
+#define MyPort					((GetMyThreadInfo->thr_conn != NULL) ?	\
+									GetMyThreadInfo->thr_conn->con_port :	\
+									NULL)
+#define MyThreadID				(GetMyThreadInfo->thr_id)
+
+#define START_CRIT_SECTION()  (CritSectionCount++)
+/* Leaving a critical section asserts that one was entered, then decrements the per-thread counter. */
+#define END_CRIT_SECTION() \
+	do { \
+		    Assert(CritSectionCount > 0); \
+		    CritSectionCount--; \
+	} while(0)
+
+#endif
diff --git a/src/include/gtm/gtm_seq.h b/src/include/gtm/gtm_seq.h
new file mode 100644
index 0000000000..6cb8cb3027
--- /dev/null
+++ b/src/include/gtm/gtm_seq.h
@@ -0,0 +1,75 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_seq.h
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GTM_SEQ_H
+#define GTM_SEQ_H
+
+#include "gtm/stringinfo.h"
+
+/* Global sequence  related structures */
+
+typedef struct GTM_SeqInfo
+{
+	GTM_SequenceKey	gs_key;
+	GTM_Sequence	gs_value;
+	GTM_Sequence	gs_init_value;
+	GTM_Sequence	gs_increment_by;	/* sign is tested by SEQ_IS_ASCENDING() */
+	GTM_Sequence	gs_min_value;
+	GTM_Sequence	gs_max_value;
+	bool			gs_cycle;			/* tested by SEQ_IS_CYCLE() */
+	bool			gs_called;			/* tested by SEQ_IS_CALLED() */
+
+	int32			gs_ref_count;		/* NOTE(review): presumably bounded by SEQ_MAX_REFCOUNT -- confirm */
+	int32			gs_state;			/* presumably SEQ_STATE_ACTIVE or SEQ_STATE_DELETED -- confirm */
+	GTM_RWLock		gs_lock;
+} GTM_SeqInfo;
+
+#define SEQ_STATE_ACTIVE	1
+#define SEQ_STATE_DELETED	2
+
+#define SEQ_IS_ASCENDING(s)		((s)->gs_increment_by > 0)
+#define SEQ_IS_CYCLE(s)		((s)->gs_cycle)
+#define SEQ_IS_CALLED(s)	((s)->gs_called)
+/* Default bounds. Negative values are parenthesized so they are safe inside any expression. */
+#define SEQ_DEF_MAX_SEQVAL_ASCEND			0x7ffffffffffffffeLL
+#define SEQ_DEF_MIN_SEQVAL_ASCEND			0x1
+
+#define SEQ_DEF_MAX_SEQVAL_DESCEND			(-0x1)
+#define SEQ_DEF_MIN_SEQVAL_DESCEND			(-0x7ffffffffffffffeLL)
+
+#define SEQ_MAX_REFCOUNT		1024
+
+/* SEQUENCE Management */
+void GTM_InitSeqManager(void);
+int GTM_SeqOpen(GTM_SequenceKey seqkey,
+			GTM_Sequence increment_by,
+			GTM_Sequence minval,
+			GTM_Sequence maxval,
+			GTM_Sequence startval,
+			bool cycle);
+int GTM_SeqClose(GTM_SequenceKey sqkey);
+GTM_Sequence GTM_SeqGetNext(GTM_SequenceKey seqkey);
+GTM_Sequence GTM_SeqGetCurrent(GTM_SequenceKey seqkey);
+int GTM_SeqReset(GTM_SequenceKey seqkey);
+
+
+void ProcessSequenceInitCommand(Port *myport, StringInfo message);
+void ProcessSequenceGetCurrentCommand(Port *myport, StringInfo message);
+void ProcessSequenceGetNextCommand(Port *myport, StringInfo message);
+void ProcessSequenceResetCommand(Port *myport, StringInfo message);
+void ProcessSequenceCloseCommand(Port *myport, StringInfo message);
+
+void GTM_SaveSeqInfo(int ctlfd);
+void GTM_RestoreSeqInfo(int ctlfd);
+
+#endif
diff --git a/src/include/gtm/gtm_txn.h b/src/include/gtm/gtm_txn.h
new file mode 100644
index 0000000000..2d789463f7
--- /dev/null
+++ b/src/include/gtm/gtm_txn.h
@@ -0,0 +1,235 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_txn.h
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef _GTM_TXN_H
+#define _GTM_TXN_H
+
+#include "gtm/gtm_c.h"
+#include "gtm/gtm_lock.h"
+#include "gtm/gtm_list.h"
+#include "gtm/stringinfo.h"
+
+/* ----------------
+ *		Special transaction ID values
+ *
+ * BootstrapGlobalTransactionId is the XID for "bootstrap" operations, and
+ * FrozenGlobalTransactionId is used for very old tuples.  Both should
+ * always be considered valid.
+ *
+ * FirstNormalGlobalTransactionId is the first "normal" transaction id.
+ * Note: if you need to change it, you must change pg_class.h as well.
+ * ----------------
+ */
+#define BootstrapGlobalTransactionId		((GlobalTransactionId) 1)
+#define FrozenGlobalTransactionId			((GlobalTransactionId) 2)
+#define FirstNormalGlobalTransactionId	((GlobalTransactionId) 3)
+#define MaxGlobalTransactionId			((GlobalTransactionId) 0xFFFFFFFF)
+
+/* ----------------
+ *		transaction ID manipulation macros
+ * ----------------
+ */
+#define GlobalTransactionIdIsNormal(xid)		((xid) >= FirstNormalGlobalTransactionId)
+#define GlobalTransactionIdEquals(id1, id2)	((id1) == (id2))
+#define GlobalTransactionIdStore(xid, dest)	(*(dest) = (xid))
+#define StoreInvalidGlobalTransactionId(dest) (*(dest) = InvalidGlobalTransactionId)
+
+/* advance a transaction ID variable, handling wraparound correctly */
+#define GlobalTransactionIdAdvance(dest)	\
+	do { \
+		(dest)++; \
+		if ((dest) < FirstNormalGlobalTransactionId) \
+			(dest) = FirstNormalGlobalTransactionId; \
+	} while(0)
+
+/* back up a transaction ID variable; repeats the decrement while the result is below FirstNormalGlobalTransactionId */
+#define GlobalTransactionIdRetreat(dest)	\
+	do { \
+		(dest)--; \
+	} while ((dest) < FirstNormalGlobalTransactionId)
+
+typedef int XidStatus;
+
+#define TRANSACTION_STATUS_IN_PROGRESS      0x00
+#define TRANSACTION_STATUS_COMMITTED        0x01
+#define TRANSACTION_STATUS_ABORTED          0x02
+
+/*
+ * prototypes for functions in transam/transam.c
+ */
+extern bool GlobalTransactionIdDidCommit(GlobalTransactionId transactionId);
+extern bool GlobalTransactionIdDidAbort(GlobalTransactionId transactionId);
+extern void GlobalTransactionIdAbort(GlobalTransactionId transactionId);
+extern bool GlobalTransactionIdPrecedes(GlobalTransactionId id1, GlobalTransactionId id2);
+extern bool GlobalTransactionIdPrecedesOrEquals(GlobalTransactionId id1, GlobalTransactionId id2);
+extern bool GlobalTransactionIdFollows(GlobalTransactionId id1, GlobalTransactionId id2);
+extern bool GlobalTransactionIdFollowsOrEquals(GlobalTransactionId id1, GlobalTransactionId id2);
+
+/* in transam/varsup.c */
+extern GlobalTransactionId GTM_GetGlobalTransactionId(GTM_TransactionHandle handle);
+extern GlobalTransactionId GTM_GetGlobalTransactionIdMulti(GTM_TransactionHandle handle[], int txn_count);
+extern GlobalTransactionId ReadNewGlobalTransactionId(void);
+extern void SetGlobalTransactionIdLimit(GlobalTransactionId oldest_datfrozenxid);
+extern void SetNextGlobalTransactionId(GlobalTransactionId gxid);
+extern void GTM_SetShuttingDown(void);
+
+typedef enum GTM_States
+{
+	GTM_STARTING,
+	GTM_RUNNING,
+	GTM_SHUTTING_DOWN
+} GTM_States;
+
+/* Global transaction states at the GTM */
+typedef enum GTM_TransactionStates
+{
+	GTM_TXN_STARTING,
+	GTM_TXN_IN_PROGRESS,
+	GTM_TXN_PREPARE_IN_PROGRESS,
+	GTM_TXN_PREPARED,
+	GTM_TXN_COMMIT_IN_PROGRESS,
+	GTM_TXN_COMMITTED,
+	GTM_TXN_ABORT_IN_PROGRESS,
+	GTM_TXN_ABORTED
+} GTM_TransactionStates;
+
+typedef struct GTM_TransactionInfo
+{
+	GTM_TransactionHandle		gti_handle;
+	GTM_ThreadID				gti_thread_id;
+
+	bool						gti_in_use;
+	GlobalTransactionId			gti_gxid;
+	GTM_TransactionStates		gti_state;
+	PGXC_NodeId					gti_coordid;
+	GlobalTransactionId			gti_xmin;
+	GTM_IsolationLevel			gti_isolevel;
+	bool						gti_readonly;
+	GTMProxy_ConnID				gti_backend_id;
+	uint32						gti_nodecount;
+	PGXC_NodeId					*gti_nodes;
+
+	GTM_SnapshotData			gti_current_snapshot;
+	bool						gti_snapshot_set;
+
+	GTM_RWLock					gti_lock;
+	bool						gti_vacuum;
+} GTM_TransactionInfo;
+
+#define GTM_MAX_2PC_NODES				16
+#define GTM_CheckTransactionHandle(x)	((x) >= 0 && (x) < GTM_MAX_GLOBAL_TRANSACTIONS)
+#define GTM_IsTransSerializable(x)		((x)->gti_isolevel == GTM_ISOLATION_SERIALIZABLE)
+
+typedef struct GTM_Transactions
+{
+	uint32				gt_txn_count;
+	GTM_States			gt_gtm_state;
+
+	GTM_RWLock			gt_XidGenLock;
+
+	/*
+	 * These fields are protected by gt_XidGenLock
+	 */
+	GlobalTransactionId gt_nextXid;		/* next XID to assign */
+
+	GlobalTransactionId gt_oldestXid;	/* cluster-wide minimum datfrozenxid */
+	GlobalTransactionId gt_xidVacLimit;	/* start forcing autovacuums here */
+	GlobalTransactionId gt_xidWarnLimit; /* start complaining here */
+	GlobalTransactionId gt_xidStopLimit; /* refuse to advance nextXid beyond here */
+	GlobalTransactionId gt_xidWrapLimit; /* where the world ends */
+
+	/*
+	 * These fields are protected by gt_TransArrayLock.
+	 */
+	GlobalTransactionId gt_latestCompletedXid;	/* newest XID that has committed or
+										 		 * aborted */
+
+	GlobalTransactionId	gt_recent_global_xmin;
+
+	int32				gt_lastslot;
+	GTM_TransactionInfo	gt_transactions_array[GTM_MAX_GLOBAL_TRANSACTIONS];
+	List				*gt_open_transactions;
+	
+	GTM_RWLock			gt_TransArrayLock;
+} GTM_Transactions;
+
+extern GTM_Transactions	GTMTransactions;
+
+#define GTM_CountOpenTransactions()		(list_length(GTMTransactions.gt_open_transactions))
+
+/*
+ * Two hash tables will be maintained to quickly find the
+ * GTM_TransactionInfo block given either the GXID or the GTM_TransactionHandle.
+ */
+
+GTM_TransactionInfo *GTM_HandleToTransactionInfo(GTM_TransactionHandle handle);
+GTM_TransactionHandle GTM_GXIDToHandle(GlobalTransactionId gxid);
+
+/* Transaction Control */
+void GTM_InitTxnManager(void);
+GTM_TransactionHandle GTM_BeginTransaction(GTM_CoordinatorId coord_id,
+										   GTM_IsolationLevel isolevel,
+										   bool readonly);
+int GTM_BeginTransactionMulti(GTM_CoordinatorId coord_id,
+										   GTM_IsolationLevel isolevel[],
+										   bool readonly[],
+										   GTMProxy_ConnID connid[],
+										   int txn_count,
+										   GTM_TransactionHandle txns[]);
+int GTM_RollbackTransaction(GTM_TransactionHandle txn);
+int GTM_RollbackTransactionMulti(GTM_TransactionHandle txn[], int txn_count, int status[]);
+int GTM_RollbackTransactionGXID(GlobalTransactionId gxid);
+int GTM_CommitTransaction(GTM_TransactionHandle txn);
+int GTM_CommitTransactionMulti(GTM_TransactionHandle txn[], int txn_count, int status[]);
+int GTM_CommitTransactionGXID(GlobalTransactionId gxid);
+int GTM_PrepareTransaction(GTM_TransactionHandle txn,
+						   uint32 nodecnt,
+						   PGXC_NodeId nodes[]);
+int GTM_PrepareTransactionGXID(GlobalTransactionId gxid,
+						   uint32 nodecnt,
+						   PGXC_NodeId nodes[]);
+uint32 GTM_GetAllPrepared(GlobalTransactionId gxids[], uint32 gxidcnt);
+GTM_TransactionStates GTM_GetStatus(GTM_TransactionHandle txn);
+GTM_TransactionStates GTM_GetStatusGXID(GlobalTransactionId gxid);
+int GTM_GetAllTransactions(GTM_TransactionInfo txninfo[], uint32 txncnt);
+void GTM_RemoveAllTransInfos(int backend_id);
+
+GTM_Snapshot GTM_GetSnapshotData(GTM_TransactionInfo *my_txninfo,
+								 GTM_Snapshot snapshot);
+GTM_Snapshot GTM_GetTransactionSnapshot(GTM_TransactionHandle handle[],
+		int txn_count, int *status);
+void GTM_FreeCachedTransInfo(void);
+
+void ProcessBeginTransactionCommand(Port *myport, StringInfo message);
+void ProcessBeginTransactionCommandMulti(Port *myport, StringInfo message);
+void ProcessBeginTransactionGetGXIDCommand(Port *myport, StringInfo message);
+void ProcessCommitTransactionCommand(Port *myport, StringInfo message);
+void ProcessRollbackTransactionCommand(Port *myport, StringInfo message);
+void ProcessPrepareTransactionCommand(Port *myport, StringInfo message);
+void ProcessGetGXIDTransactionCommand(Port *myport, StringInfo message);
+
+void ProcessBeginTransactionGetGXIDAutovacuumCommand(Port *myport, StringInfo message);
+void ProcessBeginTransactionGetGXIDCommandMulti(Port *myport, StringInfo message);
+void ProcessCommitTransactionCommandMulti(Port *myport, StringInfo message);
+void ProcessRollbackTransactionCommandMulti(Port *myport, StringInfo message) ;
+
+void GTM_SaveTxnInfo(int ctlfd);
+void GTM_RestoreTxnInfo(int ctlfd, GlobalTransactionId next_gxid);
+
+/*
+ * In gtm_snap.c
+ */
+void ProcessGetSnapshotCommand(Port *myport, StringInfo message, bool get_gxid);
+void ProcessGetSnapshotCommandMulti(Port *myport, StringInfo message);
+void GTM_FreeSnapshotData(GTM_Snapshot snapshot);
+#endif
diff --git a/src/include/gtm/ip.h b/src/include/gtm/ip.h
new file mode 100644
index 0000000000..c5d975298b
--- /dev/null
+++ b/src/include/gtm/ip.h
@@ -0,0 +1,50 @@
+/*-------------------------------------------------------------------------
+ *
+ * ip.h
+ *	  Definitions for IPv6-aware network access.
+ *
+ * These definitions are used by both frontend and backend code.  Be careful
+ * what you include here!
+ *
+ * Copyright (c) 2003-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/libpq/ip.h,v 1.20 2008/01/01 19:45:58 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef IP_H
+#define IP_H
+
+#include "gtm/pqcomm.h"
+
+
+extern int pg_getaddrinfo_all(const char *hostname, const char *servname,
+				   const struct addrinfo * hintp,
+				   struct addrinfo ** result);
+extern void pg_freeaddrinfo_all(int hint_ai_family, struct addrinfo * ai);
+
+extern int pg_getnameinfo_all(const struct sockaddr_storage * addr, int salen,
+				   char *node, int nodelen,
+				   char *service, int servicelen,
+				   int flags);
+
+extern int pg_range_sockaddr(const struct sockaddr_storage * addr,
+				  const struct sockaddr_storage * netaddr,
+				  const struct sockaddr_storage * netmask);
+
+extern int pg_sockaddr_cidr_mask(struct sockaddr_storage * mask,
+					  char *numbits, int family);
+
+#ifdef HAVE_IPV6
+extern void pg_promote_v4_to_v6_addr(struct sockaddr_storage * addr);
+extern void pg_promote_v4_to_v6_mask(struct sockaddr_storage * addr);
+#endif
+
+#ifdef	HAVE_UNIX_SOCKETS
+#define IS_AF_UNIX(fam) ((fam) == AF_UNIX)
+#else
+#define IS_AF_UNIX(fam) (0)
+#endif
+
+#endif   /* IP_H */
diff --git a/src/include/gtm/libpq-be.h b/src/include/gtm/libpq-be.h
new file mode 100644
index 0000000000..0a795def67
--- /dev/null
+++ b/src/include/gtm/libpq-be.h
@@ -0,0 +1,86 @@
+/*-------------------------------------------------------------------------
+ *
+ * libpq-be.h
+ *	  This file contains definitions for structures and externs used
+ *	  by the postmaster during client authentication.
+ *
+ *	  Note that this is backend-internal and is NOT exported to clients.
+ *	  Structs that need to be client-visible are in pqcomm.h.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/libpq/libpq-be.h,v 1.69 2009/01/01 17:23:59 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LIBPQ_BE_H
+#define LIBPQ_BE_H
+
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+#ifdef HAVE_NETINET_TCP_H
+#include <netinet/tcp.h>
+#endif
+
+#include "gtm/pqcomm.h"
+
+/*
+ * This is used by the postmaster in its communication with frontends.	It
+ * contains all state information needed during this communication before the
+ * backend is run.	The Port structure is kept in malloc'd memory and is
+ * still available when a backend is running (see MyProcPort).	The data
+ * it points to must also be malloc'd, or else palloc'd in TopMostMemoryContext,
+ * so that it survives into GTM_ThreadMain execution!
+ */
+
+typedef struct Port
+{
+	int			sock;			/* File descriptor */
+	SockAddr	laddr;			/* local addr (postmaster) */
+	SockAddr	raddr;			/* remote addr (client) */
+	char	   *remote_host;	/* name (or ip addr) of remote host */
+	char	   *remote_port;	/* text rep of remote port */
+
+	GTMProxy_ConnID	conn_id;	/* RequestID of this command */
+
+	GTM_CoordinatorId	coordinator_id; /* Coordinator ID */
+	bool				is_proxy;		/* Is this a connection from GTM proxy ? */
+#define PQ_BUFFER_SIZE 8192
+
+	char 	PqSendBuffer[PQ_BUFFER_SIZE];
+	int		PqSendPointer;		/* Next index to store a byte in PqSendBuffer */
+
+	char 	PqRecvBuffer[PQ_BUFFER_SIZE];
+	int		PqRecvPointer;		/* Next index to read a byte from PqRecvBuffer */
+	int		PqRecvLength;		/* End of data available in PqRecvBuffer */
+
+	/*
+	 * TCP keepalive settings.
+	 *
+	 * default values are 0 if AF_UNIX or not yet known; current values are 0
+	 * if AF_UNIX or using the default. Also, -1 in a default value means we
+	 * were unable to find out the default (getsockopt failed).
+	 */
+	int			default_keepalives_idle;
+	int			default_keepalives_interval;
+	int			default_keepalives_count;
+	int			keepalives_idle;
+	int			keepalives_interval;
+	int			keepalives_count;
+} Port;
+
+/* TCP keepalives configuration. These are no-ops on an AF_UNIX socket. */
+
+extern int	pq_getkeepalivesidle(Port *port);
+extern int	pq_getkeepalivesinterval(Port *port);
+extern int	pq_getkeepalivescount(Port *port);
+
+extern int	pq_setkeepalivesidle(int idle, Port *port);
+extern int	pq_setkeepalivesinterval(int interval, Port *port);
+extern int	pq_setkeepalivescount(int count, Port *port);
+
+#endif   /* LIBPQ_BE_H */
diff --git a/src/include/gtm/libpq-fe.h b/src/include/gtm/libpq-fe.h
new file mode 100644
index 0000000000..2c5c2c4e04
--- /dev/null
+++ b/src/include/gtm/libpq-fe.h
@@ -0,0 +1,138 @@
+/*-------------------------------------------------------------------------
+ *
+ * libpq-fe.h
+ *	  This file contains definitions for structures and
+ *	  externs for functions used by frontend postgres applications.
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/interfaces/libpq/libpq-fe.h,v 1.145 2009/01/01 17:24:03 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef LIBPQ_FE_H
+#define LIBPQ_FE_H
+
+#ifdef __cplusplus
+extern		"C"
+{
+#endif
+
+#include <stdio.h>
+
+/*
+ * gtm_ext.h is included in place of postgres_ext.h, which defines the
+ * backend's externally visible types, such as Oid.
+ */
+#include "gtm/gtm_ext.h"
+
+/*
+ * Option flags for PQcopyResult
+ */
+#define PG_COPYRES_ATTRS          0x01
+#define PG_COPYRES_TUPLES         0x02		/* Implies PG_COPYRES_ATTRS */
+#define PG_COPYRES_EVENTS         0x04
+#define PG_COPYRES_NOTICEHOOKS    0x08
+
+/* Application-visible enum types */
+
+typedef enum
+{
+	/*
+	 * Although it is okay to add to this list, values which become unused
+	 * should never be removed, nor should constants be redefined - that would
+	 * break compatibility with existing code.
+	 */
+	CONNECTION_OK,
+	CONNECTION_BAD,
+	/* Non-blocking mode only below here */
+
+	/*
+	 * The existence of these should never be relied upon - they should only
+	 * be used for user feedback or similar purposes.
+	 */
+	CONNECTION_STARTED,			/* Waiting for connection to be made.  */
+	CONNECTION_MADE,			/* Connection OK; waiting to send.	   */
+	CONNECTION_AWAITING_RESPONSE,		/* Waiting for a response from the
+										 * postmaster.		  */
+	CONNECTION_AUTH_OK,			/* Received authentication; waiting for
+								 * backend startup. */
+	CONNECTION_SETENV,			/* Negotiating environment. */
+	CONNECTION_SSL_STARTUP,		/* Negotiating SSL. */
+	CONNECTION_NEEDED			/* Internal state: connect() needed */
+} ConnStatusType;
+
+typedef enum
+{
+	PGRES_POLLING_FAILED = 0,
+	PGRES_POLLING_READING,		/* These two indicate that one may	  */
+	PGRES_POLLING_WRITING,		/* use select before polling again.   */
+	PGRES_POLLING_OK,
+	PGRES_POLLING_ACTIVE		/* unused; keep for awhile for backwards
+								 * compatibility */
+} GTMClientPollingStatusType;
+
+/* ----------------
+ * Structure for the conninfo parameter definitions returned by PQconndefaults
+ * or GTMPQconninfoParse.
+ *
+ * All fields except "val" point at static strings which must not be altered.
+ * "val" is either NULL or a malloc'd current-value string.  GTMPQconninfoFree()
+ * will release both the val strings and the GTMPQconninfoOption array itself.
+ * ----------------
+ */
+typedef struct _GTMPQconninfoOption
+{
+	char	   *keyword;		/* The keyword of the option			*/
+	char	   *val;			/* Option's current value, or NULL		 */
+} GTMPQconninfoOption;
+
+typedef struct gtm_conn GTM_Conn;
+
+/* ----------------
+ * Exported functions of libpq
+ * ----------------
+ */
+
+/* ===	in fe-connect.c === */
+
+/* make a new client connection to the backend */
+/* Asynchronous (non-blocking) */
+extern GTM_Conn *PQconnectGTMStart(const char *conninfo);
+extern GTMClientPollingStatusType GTMPQconnectPoll(GTM_Conn *conn);
+
+/* Synchronous (blocking) */
+extern GTM_Conn *PQconnectGTM(const char *conninfo);
+
+/* close the current connection and free the GTM_Conn data structure */
+extern void GTMPQfinish(GTM_Conn *conn);
+
+/* parse connection options in same way as PQconnectGTM */
+extern GTMPQconninfoOption *GTMPQconninfoParse(const char *conninfo, char **errmsg);
+
+/* free the data structure returned by PQconndefaults() or GTMPQconninfoParse() */
+extern void GTMPQconninfoFree(GTMPQconninfoOption *connOptions);
+
+extern char *GTMPQhost(const GTM_Conn *conn);
+extern char *GTMPQport(const GTM_Conn *conn);
+extern ConnStatusType GTMPQstatus(const GTM_Conn *conn);
+extern char *GTMPQerrorMessage(const GTM_Conn *conn);
+extern int	GTMPQsocket(const GTM_Conn *conn);
+
+/* Enable/disable tracing */
+extern void GTMPQtrace(GTM_Conn *conn, FILE *debug_port);
+extern void GTMPQuntrace(GTM_Conn *conn);
+
+/* Force the write buffer to be written (or at least try) */
+extern int	PQflush(GTM_Conn *conn);
+
+#define libpq_gettext(x)	(x)	/* no message translation: expands to its argument, parenthesized for safe use in expressions */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif   /* LIBPQ_FE_H */
diff --git a/src/include/gtm/libpq-int.h b/src/include/gtm/libpq-int.h
new file mode 100644
index 0000000000..5956de8ff2
--- /dev/null
+++ b/src/include/gtm/libpq-int.h
@@ -0,0 +1,129 @@
+/*-------------------------------------------------------------------------
+ *
+ * libpq-int.h
+ *	  This file contains internal definitions meant to be used only by
+ *	  the frontend libpq library, not by applications that call it.
+ *
+ *	  An application can include this file if it wants to bypass the
+ *	  official API defined by libpq-fe.h, but code that does so is much
+ *	  more likely to break across PostgreSQL releases than code that uses
+ *	  only the official API.
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/interfaces/libpq/libpq-int.h,v 1.139 2009/01/01 17:24:03 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef LIBPQ_INT_H
+#define LIBPQ_INT_H
+
+#include <time.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include "gtm/pqcomm.h"
+#include "gtm/pqexpbuffer.h"
+#include "gtm/gtm_client.h"
+
+/*
+ * GTM_Conn stores all the state data associated with a single connection
+ * to a backend.
+ */
+struct gtm_conn
+{
+	/* Saved values of connection options */
+	char	   *pghost;			/* the machine on which the server is running */
+	char	   *pghostaddr;		/* the IPv4 address of the machine on which
+								 * the server is running, in IPv4
+								 * numbers-and-dots notation. Takes precedence
+								 * over above. */
+	char	   *pgport;			/* the server's communication port */
+	char	   *connect_timeout;	/* connection timeout (numeric string) */
+	char	   *coordinator_id;	/* coordinator id */
+	int			is_proxy;		/* is this a connection to/from a proxy ? */
+
+	/* Optional file to write trace info to */
+	FILE	   *Pfdebug;
+
+	/* Status indicators */
+	ConnStatusType status;
+
+	/* Connection data */
+	int			sock;			/* Unix FD for socket, -1 if not connected */
+	SockAddr	laddr;			/* Local address */
+	SockAddr	raddr;			/* Remote address */
+
+	/* Transient state needed while establishing connection */
+	struct addrinfo *addrlist;	/* list of possible backend addresses */
+	struct addrinfo *addr_cur;	/* the one currently being tried */
+	int			addrlist_family;	/* needed to know how to free addrlist */
+
+	/* Buffer for data received from backend and not yet processed */
+	char	   *inBuffer;		/* currently allocated buffer */
+	int			inBufSize;		/* allocated size of buffer */
+	int			inStart;		/* offset to first unconsumed data in buffer */
+	int			inCursor;		/* next byte to tentatively consume */
+	int			inEnd;			/* offset to first position after avail data */
+
+	/* Buffer for data not yet sent to backend */
+	char	   *outBuffer;		/* currently allocated buffer */
+	int			outBufSize;		/* allocated size of buffer */
+	int			outCount;		/* number of chars waiting in buffer */
+
+	/* State for constructing messages in outBuffer */
+	int			outMsgStart;	/* offset to msg start (length word); if -1,
+								 * msg has no length word */
+	int			outMsgEnd;		/* offset to msg end (so far) */
+
+	/* Buffer for current error message */
+	PQExpBufferData errorMessage;		/* expansible string */
+
+	/* Buffer for receiving various parts of messages */
+	PQExpBufferData workBuffer; /* expansible string */
+
+	/* Pointer to the result of last operation */
+	GTM_Result	*result;
+};
+
+/* === in fe-misc.c === */
+
+ /*
+  * "Get" and "Put" routines return 0 if successful, EOF if not. Note that for
+  * Get, EOF merely means the buffer is exhausted, not that there is
+  * necessarily any error.
+  */
+extern int	gtmpqCheckOutBufferSpace(size_t bytes_needed, GTM_Conn *conn);
+extern int	gtmpqCheckInBufferSpace(size_t bytes_needed, GTM_Conn *conn);
+extern int	gtmpqGetc(char *result, GTM_Conn *conn);
+extern int	gtmpqPutc(char c, GTM_Conn *conn);
+extern int	gtmpqGets(PQExpBuffer buf, GTM_Conn *conn);
+extern int	gtmpqGets_append(PQExpBuffer buf, GTM_Conn *conn);
+extern int	gtmpqPuts(const char *s, GTM_Conn *conn);
+extern int	gtmpqGetnchar(char *s, size_t len, GTM_Conn *conn);
+extern int	gtmpqPutnchar(const char *s, size_t len, GTM_Conn *conn);
+extern int	gtmpqGetInt(int *result, size_t bytes, GTM_Conn *conn);
+extern int	gtmpqPutInt(int value, size_t bytes, GTM_Conn *conn);
+extern int	gtmpqPutMsgStart(char msg_type, bool force_len, GTM_Conn *conn);
+extern int	gtmpqPutMsgEnd(GTM_Conn *conn);
+extern int	gtmpqReadData(GTM_Conn *conn);
+extern int	gtmpqFlush(GTM_Conn *conn);
+extern int	gtmpqWait(int forRead, int forWrite, GTM_Conn *conn);
+extern int	gtmpqWaitTimed(int forRead, int forWrite, GTM_Conn *conn,
+			time_t finish_time);
+extern int	gtmpqReadReady(GTM_Conn *conn);
+extern int	gtmpqWriteReady(GTM_Conn *conn);
+
+/*
+ * In fe-protocol.c
+ */
+GTM_Result * GTMPQgetResult(GTM_Conn *conn);
+extern int gtmpqGetError(GTM_Conn *conn, GTM_Result *result);
+void gtmpqFreeResultData(GTM_Result *result, bool is_proxy);
+
+#define SOCK_ERRNO errno
+#define SOCK_ERRNO_SET(e) (errno = (e))
+
+#endif   /* LIBPQ_INT_H */
diff --git a/src/include/gtm/libpq.h b/src/include/gtm/libpq.h
new file mode 100644
index 0000000000..29621a43c4
--- /dev/null
+++ b/src/include/gtm/libpq.h
@@ -0,0 +1,47 @@
+/*-------------------------------------------------------------------------
+ *
+ * libpq.h
+ *	  POSTGRES LIBPQ buffer structure definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/libpq/libpq.h,v 1.70 2008/11/20 09:29:36 mha Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LIBPQ_H
+#define LIBPQ_H
+
+#include <sys/types.h>
+#include <netinet/in.h>
+
+#include "gtm/stringinfo.h"
+#include "gtm/libpq-be.h"
+
+/*
+ * External functions.
+ */
+
+/*
+ * prototypes for functions in pqcomm.c
+ */
+extern int StreamServerPort(int family, char *hostName,
+		 unsigned short portNumber, int ListenSocket[],
+				 int MaxListen);
+extern int	StreamConnection(int server_fd, Port *port);
+extern void StreamClose(int sock);
+extern void TouchSocketFile(void);
+extern void pq_comm_reset(void);
+extern int	pq_getbytes(Port *myport, char *s, size_t len);
+extern int	pq_getstring(Port *myport, StringInfo s);
+extern int	pq_getmessage(Port *myport, StringInfo s, int maxlen);
+extern int	pq_getbyte(Port *myport);
+extern int	pq_peekbyte(Port *myport);
+extern int	pq_putbytes(Port *myport, const char *s, size_t len);
+extern int	pq_flush(Port *myport);
+extern int	pq_putmessage(Port *myport, char msgtype, const char *s, size_t len);
+
+#endif   /* LIBPQ_H */
diff --git a/src/include/gtm/memnodes.h b/src/include/gtm/memnodes.h
new file mode 100644
index 0000000000..dea51b2bbd
--- /dev/null
+++ b/src/include/gtm/memnodes.h
@@ -0,0 +1,79 @@
+/*-------------------------------------------------------------------------
+ *
+ * memnodes.h
+ *	  POSTGRES memory context node definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/nodes/memnodes.h,v 1.34 2008/01/01 19:45:58 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef MEMNODES_H
+#define MEMNODES_H
+
+#include "gtm/gtm_lock.h"
+
+/*
+ * MemoryContext
+ *		A logical context in which memory allocations occur.
+ *
+ * MemoryContext itself is an abstract type that can have multiple
+ * implementations, though for now we have only AllocSetContext.
+ * The function pointers in MemoryContextMethods define one specific
+ * implementation of MemoryContext --- they are a virtual function table
+ * in C++ terms.
+ *
+ * Note: for largely historical reasons, typedef MemoryContext is a pointer
+ * to the context struct rather than the struct type itself.
+ */
+
+typedef struct MemoryContextMethods
+{
+	void	   *(*alloc) (MemoryContext context, Size size);
+	/* call this free_p in case someone #define's free() */
+	void		(*free_p) (MemoryContext context, void *pointer);
+	void	   *(*realloc) (MemoryContext context, void *pointer, Size size);
+	void		(*init) (MemoryContext context);
+	void		(*reset) (MemoryContext context);
+	void		(*delete) (MemoryContext context);
+	Size		(*get_chunk_space) (MemoryContext context, void *pointer);
+	bool		(*is_empty) (MemoryContext context);
+	void		(*stats) (MemoryContext context, int level);
+#ifdef MEMORY_CONTEXT_CHECKING
+	void		(*check) (MemoryContext context);
+#endif
+} MemoryContextMethods;
+
+
+typedef struct MemoryContextData
+{
+	MemoryContextMethods *methods;		/* virtual function table */
+	MemoryContext parent;		/* NULL if no parent (toplevel context) */
+	MemoryContext firstchild;	/* head of linked list of children */
+	MemoryContext nextchild;	/* next child of same parent */
+	char	   *name;			/* context name (just for debugging) */
+	bool		is_shared;		/* context is shared by threads */
+	GTM_RWLock	lock;			/* lock to protect members if the context is shared */
+} MemoryContextData;
+
+#define MemoryContextIsShared(context) \
+	(((MemoryContextData *)(context))->is_shared)
+
+#define MemoryContextLock(context) \
+	(GTM_RWLockAcquire(&((MemoryContextData *)(context))->lock, GTM_LOCKMODE_WRITE))
+#define MemoryContextUnlock(context) \
+	(GTM_RWLockRelease(&((MemoryContextData *)(context))->lock))
+/*
+ * MemoryContextIsValid
+ *		True iff memory context is valid (currently just a non-NULL check).
+ *
+ * Add new context types to the set accepted by this macro.
+ */
+#define MemoryContextIsValid(context) \
+	((context) != NULL)
+
+#endif   /* MEMNODES_H */
diff --git a/src/include/gtm/memutils.h b/src/include/gtm/memutils.h
new file mode 100644
index 0000000000..5d89995d4d
--- /dev/null
+++ b/src/include/gtm/memutils.h
@@ -0,0 +1,123 @@
+/*-------------------------------------------------------------------------
+ *
+ * memutils.h
+ *	  This file contains declarations for memory allocation utility
+ *	  functions.  These are functions that are not quite widely used
+ *	  enough to justify going in gtm/palloc.h, but are still part
+ *	  of the API of the memory management subsystem.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/utils/memutils.h,v 1.64 2008/01/01 19:45:59 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef MEMUTILS_H
+#define MEMUTILS_H
+
+#include "gtm/gtm_c.h"
+#include "gtm/palloc.h"
+#include "gtm/memnodes.h"
+
+/*
+ * MaxAllocSize
+ *		Quasi-arbitrary limit on size of allocations.
+ *
+ * Note:
+ *		There is no guarantee that allocations smaller than MaxAllocSize
+ *		will succeed.  Allocation requests larger than MaxAllocSize will
+ *		be summarily denied.
+ *
+ * XXX This is deliberately chosen to correspond to the limiting size
+ * of varlena objects under TOAST.	See VARATT_MASK_SIZE in postgres.h.
+ *
+ * XXX Also, various places in aset.c assume they can compute twice an
+ * allocation's size without overflow, so beware of raising this.
+ */
+#define MaxAllocSize	((Size) 0x3fffffff)		/* 1 gigabyte - 1 */
+
+#define AllocSizeIsValid(size)	((Size) (size) <= MaxAllocSize)
+
+/*
+ * All chunks allocated by any memory context manager are required to be
+ * preceded by a StandardChunkHeader at a spacing of STANDARDCHUNKHEADERSIZE.
+ * A currently-allocated chunk must contain a backpointer to its owning
+ * context as well as the allocated size of the chunk.	The backpointer is
+ * used by pfree() and repalloc() to find the context to call.	The allocated
+ * size is not absolutely essential, but it's expected to be needed by any
+ * reasonable implementation.
+ */
+typedef struct StandardChunkHeader
+{
+	MemoryContext context;		/* owning context */
+	Size		size;			/* size of data space allocated in chunk */
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* when debugging memory usage, also store actual requested size */
+	Size		requested_size;
+#endif
+} StandardChunkHeader;
+
+#define STANDARDCHUNKHEADERSIZE  MAXALIGN(sizeof(StandardChunkHeader))
+
+/*
+ * Memory-context-type-independent functions in mcxt.c
+ */
+extern void MemoryContextInit(void);
+extern void MemoryContextReset(MemoryContext context);
+extern void MemoryContextDelete(MemoryContext context);
+extern void MemoryContextResetChildren(MemoryContext context);
+extern void MemoryContextDeleteChildren(MemoryContext context);
+extern void MemoryContextResetAndDeleteChildren(MemoryContext context);
+extern Size GetMemoryChunkSpace(void *pointer);
+extern MemoryContext GetMemoryChunkContext(void *pointer);
+extern bool MemoryContextIsEmpty(MemoryContext context);
+extern void MemoryContextStats(MemoryContext context);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+extern void MemoryContextCheck(MemoryContext context);
+#endif
+extern bool MemoryContextContains(MemoryContext context, void *pointer);
+
+/*
+ * This routine handles the context-type-independent part of memory
+ * context creation.  It's intended to be called from context-type-
+ * specific creation routines, and noplace else.
+ */
+extern MemoryContext MemoryContextCreate(Size size,
+					MemoryContextMethods *methods,
+					MemoryContext parent,
+					const char *name);
+
+
+/*
+ * Memory-context-type-specific functions
+ */
+
+/* aset.c */
+extern MemoryContext AllocSetContextCreate(MemoryContext parent,
+					  const char *name,
+					  Size minContextSize,
+					  Size initBlockSize,
+					  Size maxBlockSize,
+					  bool isShared);
+
+/*
+ * Recommended default alloc parameters, suitable for "ordinary" contexts
+ * that might hold quite a lot of data.
+ */
+#define ALLOCSET_DEFAULT_MINSIZE   0
+#define ALLOCSET_DEFAULT_INITSIZE  (8 * 1024)
+#define ALLOCSET_DEFAULT_MAXSIZE   (8 * 1024 * 1024)
+
+/*
+ * Recommended alloc parameters for "small" contexts that are not expected
+ * to contain much data (for example, a context to contain a query plan).
+ */
+#define ALLOCSET_SMALL_MINSIZE	 0
+#define ALLOCSET_SMALL_INITSIZE  (1 * 1024)
+#define ALLOCSET_SMALL_MAXSIZE	 (8 * 1024)
+
+#endif   /* MEMUTILS_H */
diff --git a/src/include/gtm/palloc.h b/src/include/gtm/palloc.h
new file mode 100644
index 0000000000..380e280694
--- /dev/null
+++ b/src/include/gtm/palloc.h
@@ -0,0 +1,90 @@
+/*-------------------------------------------------------------------------
+ *
+ * palloc.h
+ *	  POSTGRES memory allocator definitions.
+ *
+ * This file contains the basic memory allocation interface that is
+ * needed by almost every backend module.  It is included directly by
+ * postgres.h, so the definitions here are automatically available
+ * everywhere.	Keep it lean!
+ *
+ * Memory allocation occurs within "contexts".	Every chunk obtained from
+ * palloc()/MemoryContextAlloc() is allocated within a specific context.
+ * The entire contents of a context can be freed easily and quickly by
+ * resetting or deleting the context --- this is both faster and less
+ * prone to memory-leakage bugs than releasing chunks individually.
+ * We organize contexts into context trees to allow fine-grain control
+ * over chunk lifetime while preserving the certainty that we will free
+ * everything that should be freed.  See utils/mmgr/README for more info.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/utils/palloc.h,v 1.40 2008/06/28 16:45:22 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PALLOC_H
+#define PALLOC_H
+
+/*
+ * Type MemoryContextData is declared in nodes/memnodes.h.	Most users
+ * of memory allocation should just treat it as an abstract type, so we
+ * do not provide the struct contents here.
+ */
+typedef struct MemoryContextData *MemoryContext;
+
+/*
+ * Fundamental memory-allocation operations (more are in utils/memutils.h)
+ */
+extern void *MemoryContextAlloc(MemoryContext context, Size size);
+extern void *MemoryContextAllocZero(MemoryContext context, Size size);
+extern void *MemoryContextAllocZeroAligned(MemoryContext context, Size size);
+
+#define palloc(sz)	MemoryContextAlloc(CurrentMemoryContext, (sz))
+
+#define palloc0(sz) MemoryContextAllocZero(CurrentMemoryContext, (sz))
+
+/*
+ * palloc0fast: zeroing allocation that selects the MemSet variant up front.
+ * The result of palloc() is always word-aligned, so we can skip testing
+ * alignment of the pointer; only the size is checked, via MemSetTest().
+ * This variant does not offer any advantage, and should not be used, unless
+ * its "sz" argument is a compile-time constant; "sz" is expanded more than
+ * once below, which is harmless for a constant.
+ */
+#define palloc0fast(sz) \
+	( MemSetTest(0, (sz)) ? \
+		MemoryContextAllocZeroAligned(CurrentMemoryContext, (sz)) : \
+		MemoryContextAllocZero(CurrentMemoryContext, (sz)) )
+
+extern void pfree(void *pointer);
+
+extern void *repalloc(void *pointer, Size size);
+
+/*
+ * MemoryContextSwitchTo can't be a macro in standard C compilers.
+ * This header declares it only as an ordinary function (no inline variant).
+ */
+
+extern MemoryContext MemoryContextSwitchTo(MemoryContext context);
+
+/*
+ * These are like standard strdup() except the copied string is
+ * allocated in a context, not with malloc().
+ */
+extern char *MemoryContextStrdup(MemoryContext context, const char *string);
+
+#define pstrdup(str)  MemoryContextStrdup(CurrentMemoryContext, (str))
+
+extern char *pnstrdup(const char *in, Size len);
+
+#if defined(WIN32) || defined(__CYGWIN__)
+extern void *pgport_palloc(Size sz);
+extern char *pgport_pstrdup(const char *str);
+extern void pgport_pfree(void *pointer);
+#endif
+
+#endif   /* PALLOC_H */
diff --git a/src/include/gtm/path.h b/src/include/gtm/path.h
new file mode 100644
index 0000000000..624fd183c9
--- /dev/null
+++ b/src/include/gtm/path.h
@@ -0,0 +1,21 @@
+/*-------------------------------------------------------------------------
+ *
+ * path.h
+ *	  Declarations for path-handling helper routines.
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GTM_PATH_H
+#define GTM_PATH_H
+
+#include "gtm/gtm_c.h"
+
+extern void canonicalize_path(char *path);
+
+#endif   /* GTM_PATH_H */
diff --git a/src/include/gtm/pqcomm.h b/src/include/gtm/pqcomm.h
new file mode 100644
index 0000000000..cdae6ca284
--- /dev/null
+++ b/src/include/gtm/pqcomm.h
@@ -0,0 +1,57 @@
+/*-------------------------------------------------------------------------
+ *
+ * pqcomm.h
+ *		Definitions common to frontends and backends.
+ *
+ * NOTE: for historical reasons, this does not correspond to pqcomm.c.
+ * pqcomm.c's routines are declared in libpq.h.
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/libpq/pqcomm.h,v 1.109 2008/10/28 12:10:44 mha Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PQCOMM_H
+#define PQCOMM_H
+
+#include <sys/socket.h>
+#include <netdb.h>
+#ifdef HAVE_SYS_UN_H
+#include <sys/un.h>
+#endif
+#include <netinet/in.h>
+
+typedef struct
+{
+	struct sockaddr_storage addr;	/* socket address; sockaddr_storage can hold any address family */
+	size_t	salen;	/* NOTE(review): presumably the number of bytes of addr in use -- confirm */
+} SockAddr;
+
+/* Format "<dir>/.s.PGSQL.<port>" into path; <dir> is sockdir, or DEFAULT_PGSOCKET_DIR
+ * when sockdir is NULL or empty.  path must be an array: sizeof(path) bounds the write. */
+#define UNIXSOCK_PATH(path, port, sockdir) \
+		snprintf(path, sizeof(path), "%s/.s.PGSQL.%d", \
+				((sockdir) && *(sockdir) != '\0') ? (sockdir) : \
+				DEFAULT_PGSOCKET_DIR, \
+				(port))
+
+/*
+ * Packet lengths are 4 bytes in network byte order.
+ *
+ * The initial length is omitted from the packet layouts appearing below.
+ */
+
+typedef uint32 PacketLen;
+
+/*
+ * In protocol 3.0 and later, the startup packet length is not fixed, but
+ * we set an arbitrary limit on it anyway.	This is just to prevent simple
+ * denial-of-service attacks via sending enough data to run the server
+ * out of memory.
+ */
+#define MAX_STARTUP_PACKET_LENGTH 10000
+
+#endif   /* PQCOMM_H */
diff --git a/src/include/gtm/pqexpbuffer.h b/src/include/gtm/pqexpbuffer.h
new file mode 100644
index 0000000000..7ae0411423
--- /dev/null
+++ b/src/include/gtm/pqexpbuffer.h
@@ -0,0 +1,181 @@
+/*-------------------------------------------------------------------------
+ *
+ * pqexpbuffer.h
+ *	  Declarations/definitions for "PQExpBuffer" functions.
+ *
+ * PQExpBuffer provides an indefinitely-extensible string data type.
+ * It can be used to buffer either ordinary C strings (null-terminated text)
+ * or arbitrary binary data.  All storage is allocated with malloc().
+ *
+ * This module is essentially the same as the backend's StringInfo data type,
+ * but it is intended for use in frontend libpq and client applications.
+ * Thus, it does not rely on palloc() nor elog().
+ *
+ * It does rely on vsnprintf(); if configure finds that libc doesn't provide
+ * a usable vsnprintf(), then a copy of our own implementation of it will
+ * be linked into libpq.
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/interfaces/libpq/pqexpbuffer.h,v 1.21 2008/11/26 16:23:11 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PQEXPBUFFER_H
+#define PQEXPBUFFER_H
+
+/*-------------------------
+ * PQExpBufferData holds information about an extensible string.
+ *		data	is the current buffer for the string (allocated with malloc).
+ *		len		is the current string length.  There is guaranteed to be
+ *				a terminating '\0' at data[len], although this is not very
+ *				useful when the string holds binary data rather than text.
+ *		maxlen	is the allocated size in bytes of 'data', i.e. the maximum
+ *				string size (including the terminating '\0' char) that we can
+ *				currently store in 'data' without having to reallocate
+ *				more space.  We must always have maxlen > len.
+ *
+ * An exception occurs if we failed to allocate enough memory for the string
+ * buffer.  In that case data points to a statically allocated empty string,
+ * and len = maxlen = 0.
+ *-------------------------
+ */
+typedef struct PQExpBufferData
+{
+	char	   *data;
+	size_t		len;
+	size_t		maxlen;
+} PQExpBufferData;
+
+typedef PQExpBufferData *PQExpBuffer;
+
+/*------------------------
+ * Test for a broken (out of memory) PQExpBuffer.
+ * When a buffer is "broken", all operations except resetting or deleting it
+ * are no-ops.
+ *------------------------
+ */
+#define PQExpBufferBroken(str)  \
+	((str) == NULL || (str)->maxlen == 0)
+
+/*------------------------
+ * Initial size of the data buffer in a PQExpBuffer.
+ * NB: this must be large enough to hold error messages that might
+ * be returned by PQrequestCancel().
+ *------------------------
+ */
+#define INITIAL_EXPBUFFER_SIZE	256
+
+/*------------------------
+ * There are two ways to create a PQExpBuffer object initially:
+ *
+ * PQExpBuffer stringptr = createGTMPQExpBuffer();
+ *		Both the PQExpBufferData and the data buffer are malloc'd.
+ *
+ * PQExpBufferData string;
+ * initGTMPQExpBuffer(&string);
+ *		The data buffer is malloc'd but the PQExpBufferData is presupplied.
+ *		This is appropriate if the PQExpBufferData is a field of another
+ *		struct.
+ *-------------------------
+ */
+
+/*------------------------
+ * createGTMPQExpBuffer
+ * Create an empty 'PQExpBufferData' & return a pointer to it.
+ */
+extern PQExpBuffer createGTMPQExpBuffer(void);
+
+/*------------------------
+ * initGTMPQExpBuffer
+ * Initialize a PQExpBufferData struct (with previously undefined contents)
+ * to describe an empty string.
+ */
+extern void initGTMPQExpBuffer(PQExpBuffer str);
+
+/*------------------------
+ * To destroy a PQExpBuffer, use either:
+ *
+ * destroyGTMPQExpBuffer(str);
+ *		free()s both the data buffer and the PQExpBufferData.
+ *		This is the inverse of createGTMPQExpBuffer().
+ *
+ * termGTMPQExpBuffer(str)
+ *		free()s the data buffer but not the PQExpBufferData itself.
+ *		This is the inverse of initGTMPQExpBuffer().
+ *
+ * NOTE: some routines build up a string using PQExpBuffer, and then
+ * release the PQExpBufferData but return the data string itself to their
+ * caller.	At that point the data string looks like a plain malloc'd
+ * string.
+ */
+extern void destroyGTMPQExpBuffer(PQExpBuffer str);
+extern void termGTMPQExpBuffer(PQExpBuffer str);
+
+/*------------------------
+ * resetGTMPQExpBuffer
+ *		Reset a PQExpBuffer to empty
+ *
+ * Note: if possible, a "broken" PQExpBuffer is returned to normal.
+ */
+extern void resetGTMPQExpBuffer(PQExpBuffer str);
+
+/*------------------------
+ * enlargeGTMPQExpBuffer
+ * Make sure there is enough space for 'needed' more bytes in the buffer
+ * ('needed' does not include the terminating null).
+ *
+ * Returns 1 if OK, 0 if failed to enlarge buffer.  (In the latter case
+ * the buffer is left in "broken" state.)
+ */
+extern int	enlargeGTMPQExpBuffer(PQExpBuffer str, size_t needed);
+
+/*------------------------
+ * printfGTMPQExpBuffer
+ * Format text data under the control of fmt (an sprintf-like format string)
+ * and insert it into str.	More space is allocated to str if necessary.
+ * This is a convenience routine that does the same thing as
+ * resetGTMPQExpBuffer() followed by appendGTMPQExpBuffer().
+ */
+extern void
+printfGTMPQExpBuffer(PQExpBuffer str, const char *fmt,...)
+/* This extension allows gcc to check the format string */
+__attribute__((format(printf, 2, 3)));
+
+/*------------------------
+ * appendGTMPQExpBuffer
+ * Format text data under the control of fmt (an sprintf-like format string)
+ * and append it to whatever is already in str.  More space is allocated
+ * to str if necessary.  This is sort of like a combination of sprintf and
+ * strcat.
+ */
+extern void
+appendGTMPQExpBuffer(PQExpBuffer str, const char *fmt,...)
+/* This extension allows gcc to check the format string */
+__attribute__((format(printf, 2, 3)));
+
+/*------------------------
+ * appendGTMPQExpBufferStr
+ * Append the given string to a PQExpBuffer, allocating more space
+ * if necessary.
+ */
+extern void appendGTMPQExpBufferStr(PQExpBuffer str, const char *data);
+
+/*------------------------
+ * appendGTMPQExpBufferChar
+ * Append a single byte to str.
+ * Like appendGTMPQExpBuffer(str, "%c", ch) but much faster.
+ */
+extern void appendGTMPQExpBufferChar(PQExpBuffer str, char ch);
+
+/*------------------------
+ * appendBinaryGTMPQExpBuffer
+ * Append arbitrary binary data to a PQExpBuffer, allocating more space
+ * if necessary.
+ */
+extern void appendBinaryGTMPQExpBuffer(PQExpBuffer str,
+						const char *data, size_t datalen);
+
+#endif   /* PQEXPBUFFER_H */
diff --git a/src/include/gtm/pqformat.h b/src/include/gtm/pqformat.h
new file mode 100644
index 0000000000..3febf2cf2e
--- /dev/null
+++ b/src/include/gtm/pqformat.h
@@ -0,0 +1,48 @@
+/*-------------------------------------------------------------------------
+ *
+ * pqformat.h
+ *		Definitions for formatting and parsing frontend/backend messages
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/libpq/pqformat.h,v 1.27 2009/01/01 17:23:59 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PQFORMAT_H
+#define PQFORMAT_H
+
+#include "gtm/stringinfo.h"
+
+extern void pq_beginmessage(StringInfo buf, char msgtype);
+extern void pq_sendbyte(StringInfo buf, int byt);
+extern void pq_sendbytes(StringInfo buf, const char *data, int datalen);
+extern void pq_sendcountedtext(StringInfo buf, const char *str, int slen,
+				   bool countincludesself);
+extern void pq_sendtext(StringInfo buf, const char *str, int slen);
+extern void pq_sendstring(StringInfo buf, const char *str);
+extern void pq_send_ascii_string(StringInfo buf, const char *str);
+extern void pq_sendint(StringInfo buf, int i, int b);
+extern void pq_sendint64(StringInfo buf, int64 i);
+extern void pq_sendfloat4(StringInfo buf, float4 f);
+extern void pq_sendfloat8(StringInfo buf, float8 f);
+extern void pq_endmessage(Port *myport, StringInfo buf);
+
+extern void pq_puttextmessage(Port *myport, char msgtype, const char *str);
+extern void pq_putemptymessage(Port *myport, char msgtype);
+
+extern int	pq_getmsgbyte(StringInfo msg);
+extern unsigned int pq_getmsgint(StringInfo msg, int b);
+extern int64 pq_getmsgint64(StringInfo msg);
+extern float4 pq_getmsgfloat4(StringInfo msg);
+extern float8 pq_getmsgfloat8(StringInfo msg);
+extern const char *pq_getmsgbytes(StringInfo msg, int datalen);
+extern void pq_copymsgbytes(StringInfo msg, char *buf, int datalen);
+extern char *pq_getmsgtext(StringInfo msg, int rawbytes, int *nbytes);
+extern const char *pq_getmsgstring(StringInfo msg);
+extern void pq_getmsgend(StringInfo msg);
+extern int pq_getmsgunreadlen(StringInfo msg);
+
+#endif   /* PQFORMAT_H */
diff --git a/src/include/gtm/pqsignal.h b/src/include/gtm/pqsignal.h
new file mode 100644
index 0000000000..e3a53dc3ed
--- /dev/null
+++ b/src/include/gtm/pqsignal.h
@@ -0,0 +1,49 @@
+/*-------------------------------------------------------------------------
+ *
+ * pqsignal.h
+ *	  prototypes for the reliable BSD-style signal(2) routine.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/libpq/pqsignal.h,v 1.32 2008/01/01 19:45:58 momjian Exp $
+ *
+ * NOTES
+ *	  This shouldn't be in libpq, but the monitor and some other
+ *	  things need it...
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PQSIGNAL_H
+#define PQSIGNAL_H
+
+#include <signal.h>
+
+#ifdef HAVE_SIGPROCMASK
+extern sigset_t UnBlockSig,
+			BlockSig,
+			AuthBlockSig;
+
+#define PG_SETMASK(mask)	sigprocmask(SIG_SETMASK, mask, NULL)
+#else
+extern int	UnBlockSig,
+			BlockSig,
+			AuthBlockSig;
+
+#ifndef WIN32
+#define PG_SETMASK(mask)	sigsetmask(*((int*)(mask)))
+#else
+#define PG_SETMASK(mask)		pqsigsetmask(*((int*)(mask)))
+int			pqsigsetmask(int mask);
+#endif
+#endif
+
+typedef void (*pqsigfunc) (int);
+
+extern void pqinitmask(void);
+
+extern pqsigfunc pqsignal(int signo, pqsigfunc func);
+
+#endif   /* PQSIGNAL_H */
diff --git a/src/include/gtm/stringinfo.h b/src/include/gtm/stringinfo.h
new file mode 100644
index 0000000000..197aa877a1
--- /dev/null
+++ b/src/include/gtm/stringinfo.h
@@ -0,0 +1,149 @@
+/*-------------------------------------------------------------------------
+ *
+ * stringinfo.h
+ *	  Declarations/definitions for "StringInfo" functions.
+ *
+ * StringInfo provides an indefinitely-extensible string data type.
+ * It can be used to buffer either ordinary C strings (null-terminated text)
+ * or arbitrary binary data.  All storage is allocated with palloc().
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL: pgsql/src/include/lib/stringinfo.h,v 1.35 2008/01/01 19:45:57 momjian Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef STRINGINFO_H
+#define STRINGINFO_H
+
+/*-------------------------
+ * StringInfoData holds information about an extensible string.
+ *		data	is the current buffer for the string (allocated with palloc).
+ *		len		is the current string length.  There is guaranteed to be
+ *				a terminating '\0' at data[len], although this is not very
+ *				useful when the string holds binary data rather than text.
+ *		maxlen	is the allocated size in bytes of 'data', i.e. the maximum
+ *				string size (including the terminating '\0' char) that we can
+ *				currently store in 'data' without having to reallocate
+ *				more space.  We must always have maxlen > len.
+ *		cursor	is initialized to zero by makeStringInfo or initStringInfo,
+ *				but is not otherwise touched by the stringinfo.c routines.
+ *				Some routines use it to scan through a StringInfo.
+ *-------------------------
+ */
+typedef struct StringInfoData
+{
+	char	   *data;
+	int			len;
+	int			maxlen;
+	int			cursor;
+} StringInfoData;
+
+typedef StringInfoData *StringInfo;
+
+
+/*------------------------
+ * There are two ways to create a StringInfo object initially:
+ *
+ * StringInfo stringptr = makeStringInfo();
+ *		Both the StringInfoData and the data buffer are palloc'd.
+ *
+ * StringInfoData string;
+ * initStringInfo(&string);
+ *		The data buffer is palloc'd but the StringInfoData is just local.
+ *		This is the easiest approach for a StringInfo object that will
+ *		only live as long as the current routine.
+ *
+ * To destroy a StringInfo, pfree() the data buffer, and then pfree() the
+ * StringInfoData if it was palloc'd.  There's no special support for this.
+ *
+ * NOTE: some routines build up a string using StringInfo, and then
+ * release the StringInfoData but return the data string itself to their
+ * caller.	At that point the data string looks like a plain palloc'd
+ * string.
+ *-------------------------
+ */
+
+/*------------------------
+ * makeStringInfo
+ * Create an empty 'StringInfoData' & return a pointer to it.
+ */
+extern StringInfo makeStringInfo(void);
+
+/*------------------------
+ * initStringInfo
+ * Initialize a StringInfoData struct (with previously undefined contents)
+ * to describe an empty string.
+ */
+extern void initStringInfo(StringInfo str);
+
+/*------------------------
+ * resetStringInfo
+ * Clears the current content of the StringInfo, if any. The
+ * StringInfo remains valid.
+ */
+extern void resetStringInfo(StringInfo str);
+
+/*------------------------
+ * appendStringInfo
+ * Format text data under the control of fmt (an sprintf-style format string)
+ * and append it to whatever is already in str.  More space is allocated
+ * to str if necessary.  This is sort of like a combination of sprintf and
+ * strcat.
+ */
+extern void
+appendStringInfo(StringInfo str, const char *fmt,...)
+/* This extension allows gcc to check the format string */
+__attribute__((format(printf, 2, 3)));
+
+/*------------------------
+ * appendStringInfoVA
+ * Attempt to format text data under the control of fmt (an sprintf-style
+ * format string) and append it to whatever is already in str.	If successful
+ * return true; if not (because there's not enough space), return false
+ * without modifying str.  Typically the caller would enlarge str and retry
+ * on false return --- see appendStringInfo for standard usage pattern.
+ */
+extern bool appendStringInfoVA(StringInfo str, const char *fmt, va_list args);
+
+/*------------------------
+ * appendStringInfoString
+ * Append a null-terminated string to str.
+ * Like appendStringInfo(str, "%s", s) but faster.
+ */
+extern void appendStringInfoString(StringInfo str, const char *s);
+
+/*------------------------
+ * appendStringInfoChar
+ * Append a single byte to str.
+ * Like appendStringInfo(str, "%c", ch) but much faster.
+ */
+extern void appendStringInfoChar(StringInfo str, char ch);
+
+/*------------------------
+ * appendStringInfoCharMacro
+ * As above, but a macro for even more speed where it matters.
+ * Caution: str argument will be evaluated multiple times.
+ */
+#define appendStringInfoCharMacro(str,ch) \
+	(((str)->len + 1 >= (str)->maxlen) ? \
+	 appendStringInfoChar(str, ch) : \
+	 (void)((str)->data[(str)->len] = (ch), (str)->data[++(str)->len] = '\0'))
+
+/*------------------------
+ * appendBinaryStringInfo
+ * Append arbitrary binary data to a StringInfo, allocating more space
+ * if necessary.
+ */
+extern void appendBinaryStringInfo(StringInfo str,
+					   const char *data, int datalen);
+
+/*------------------------
+ * enlargeStringInfo
+ * Make sure a StringInfo's buffer can hold at least 'needed' more bytes.
+ */
+extern void enlargeStringInfo(StringInfo str, int needed);
+
+#endif   /* STRINGINFO_H */
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index f255c44d1c..078b6733e7 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -6,6 +6,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * $PostgreSQL: pgsql/src/include/nodes/nodes.h,v 1.223 2009/06/11 14:49:11 momjian Exp $
  *
@@ -157,6 +158,9 @@ typedef enum NodeTag
 	T_JoinExpr,
 	T_FromExpr,
 	T_IntoClause,
+#ifdef PGXC
+	T_DistributeBy,
+#endif
 
 	/*
 	 * TAGS FOR EXPRESSION STATE NODES (execnodes.h)
@@ -337,6 +341,7 @@ typedef enum NodeTag
 	T_CreateUserMappingStmt,
 	T_AlterUserMappingStmt,
 	T_DropUserMappingStmt,
+	T_ExecDirectStmt,
 
 	/*
 	 * TAGS FOR PARSE TREE NODES (parsenodes.h)
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index 7793f66f20..e0515ba95d 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -12,6 +12,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * $PostgreSQL: pgsql/src/include/nodes/parsenodes.h,v 1.395 2009/06/18 01:27:02 tgl Exp $
  *
@@ -1335,6 +1336,9 @@ typedef struct CreateStmt
 	List	   *options;		/* options from WITH clause */
 	OnCommitAction oncommit;	/* what do we do at COMMIT? */
 	char	   *tablespacename; /* table space to use, or NULL */
+#ifdef PGXC
+	DistributeBy *distributeby; 	/* distribution to use, or NULL */
+#endif
 } CreateStmt;
 
 /* ----------
@@ -2389,4 +2393,17 @@ typedef struct AlterTSConfigurationStmt
 	bool		missing_ok;		/* for DROP - skip error if missing? */
 } AlterTSConfigurationStmt;
 
+/* PGXC_BEGIN */
+/*
+ * EXECUTE DIRECT statement
+ */
+typedef struct ExecDirectStmt
+{
+	NodeTag		type;
+	bool		coordinator;	/* NOTE(review): presumably true when the coordinator is the target -- confirm in gram.y */
+	List	   *nodes;	/* NOTE(review): presumably the list of target nodes -- confirm */
+	char	   *query;	/* NOTE(review): presumably the query text to execute directly -- confirm */
+} ExecDirectStmt;
+/* PGXC_END */
+
 #endif   /* PARSENODES_H */
diff --git a/src/include/nodes/primnodes.h b/src/include/nodes/primnodes.h
index a41b0e2f7d..36c5e6e633 100644
--- a/src/include/nodes/primnodes.h
+++ b/src/include/nodes/primnodes.h
@@ -9,6 +9,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * $PostgreSQL: pgsql/src/include/nodes/primnodes.h,v 1.149 2009/06/11 14:49:11 momjian Exp $
  *
@@ -1174,4 +1175,30 @@ typedef struct FromExpr
 	Node	   *quals;			/* qualifiers on join, if any */
 } FromExpr;
 
+#ifdef PGXC
+/*----------
+ * DistributionType - how to distribute the data
+ *
+ *----------
+ */
+typedef enum DistributionType
+{
+	DISTTYPE_REPLICATION,			/* Replicated */
+	DISTTYPE_HASH,				/* Hash partitioned */
+	DISTTYPE_ROUNDROBIN			/* Round Robin */
+} DistributionType;
+
+/*----------
+ * DistributeBy - represents a DISTRIBUTE BY clause in a CREATE TABLE statement
+ *
+ *----------
+ */
+typedef struct DistributeBy
+{
+	NodeTag		type;
+	DistributionType disttype;		/* Distribution type */
+	char	   	*colname;		/* Distribution column name */
+} DistributeBy;
+#endif
+
 #endif   /* PRIMNODES_H */
diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h
index 23f5d87a7a..aec7b6b3d9 100644
--- a/src/include/parser/kwlist.h
+++ b/src/include/parser/kwlist.h
@@ -9,6 +9,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * IDENTIFICATION
  *	  $PostgreSQL: pgsql/src/include/parser/kwlist.h,v 1.2 2009/04/06 08:42:53 heikki Exp $
@@ -90,6 +91,7 @@ PG_KEYWORD("constraints", CONSTRAINTS, UNRESERVED_KEYWORD)
 PG_KEYWORD("content", CONTENT_P, UNRESERVED_KEYWORD)
 PG_KEYWORD("continue", CONTINUE_P, UNRESERVED_KEYWORD)
 PG_KEYWORD("conversion", CONVERSION_P, UNRESERVED_KEYWORD)
+PG_KEYWORD("coordinator", COORDINATOR, UNRESERVED_KEYWORD)
 PG_KEYWORD("copy", COPY, UNRESERVED_KEYWORD)
 PG_KEYWORD("cost", COST, UNRESERVED_KEYWORD)
 PG_KEYWORD("create", CREATE, RESERVED_KEYWORD)
@@ -125,9 +127,13 @@ PG_KEYWORD("delimiter", DELIMITER, UNRESERVED_KEYWORD)
 PG_KEYWORD("delimiters", DELIMITERS, UNRESERVED_KEYWORD)
 PG_KEYWORD("desc", DESC, RESERVED_KEYWORD)
 PG_KEYWORD("dictionary", DICTIONARY, UNRESERVED_KEYWORD)
+PG_KEYWORD("direct", DIRECT, UNRESERVED_KEYWORD)
 PG_KEYWORD("disable", DISABLE_P, UNRESERVED_KEYWORD)
 PG_KEYWORD("discard", DISCARD, UNRESERVED_KEYWORD)
 PG_KEYWORD("distinct", DISTINCT, RESERVED_KEYWORD)
+#ifdef PGXC
+PG_KEYWORD("distribute", DISTRIBUTE, UNRESERVED_KEYWORD)
+#endif
 PG_KEYWORD("do", DO, RESERVED_KEYWORD)
 PG_KEYWORD("document", DOCUMENT_P, UNRESERVED_KEYWORD)
 PG_KEYWORD("domain", DOMAIN_P, UNRESERVED_KEYWORD)
@@ -169,6 +175,9 @@ PG_KEYWORD("granted", GRANTED, UNRESERVED_KEYWORD)
 PG_KEYWORD("greatest", GREATEST, COL_NAME_KEYWORD)
 PG_KEYWORD("group", GROUP_P, RESERVED_KEYWORD)
 PG_KEYWORD("handler", HANDLER, UNRESERVED_KEYWORD)
+#ifdef PGXC
+PG_KEYWORD("hash", HASH, UNRESERVED_KEYWORD)
+#endif
 PG_KEYWORD("having", HAVING, RESERVED_KEYWORD)
 PG_KEYWORD("header", HEADER_P, UNRESERVED_KEYWORD)
 PG_KEYWORD("hold", HOLD, UNRESERVED_KEYWORD)
@@ -243,6 +252,7 @@ PG_KEYWORD("no", NO, UNRESERVED_KEYWORD)
 PG_KEYWORD("nocreatedb", NOCREATEDB, UNRESERVED_KEYWORD)
 PG_KEYWORD("nocreaterole", NOCREATEROLE, UNRESERVED_KEYWORD)
 PG_KEYWORD("nocreateuser", NOCREATEUSER, UNRESERVED_KEYWORD)
+PG_KEYWORD("node", NODE, UNRESERVED_KEYWORD)
 PG_KEYWORD("noinherit", NOINHERIT, UNRESERVED_KEYWORD)
 PG_KEYWORD("nologin", NOLOGIN_P, UNRESERVED_KEYWORD)
 PG_KEYWORD("none", NONE, COL_NAME_KEYWORD)
@@ -308,6 +318,9 @@ PG_KEYWORD("rename", RENAME, UNRESERVED_KEYWORD)
 PG_KEYWORD("repeatable", REPEATABLE, UNRESERVED_KEYWORD)
 PG_KEYWORD("replace", REPLACE, UNRESERVED_KEYWORD)
 PG_KEYWORD("replica", REPLICA, UNRESERVED_KEYWORD)
+#ifdef PGXC
+PG_KEYWORD("replication", REPLICATION, UNRESERVED_KEYWORD)
+#endif
 PG_KEYWORD("reset", RESET, UNRESERVED_KEYWORD)
 PG_KEYWORD("restart", RESTART, UNRESERVED_KEYWORD)
 PG_KEYWORD("restrict", RESTRICT, UNRESERVED_KEYWORD)
@@ -315,8 +328,14 @@ PG_KEYWORD("returning", RETURNING, RESERVED_KEYWORD)
 PG_KEYWORD("returns", RETURNS, UNRESERVED_KEYWORD)
 PG_KEYWORD("revoke", REVOKE, UNRESERVED_KEYWORD)
 PG_KEYWORD("right", RIGHT, TYPE_FUNC_NAME_KEYWORD)
+#ifdef PGXC
+PG_KEYWORD("robin", ROBIN, UNRESERVED_KEYWORD)
+#endif
 PG_KEYWORD("role", ROLE, UNRESERVED_KEYWORD)
 PG_KEYWORD("rollback", ROLLBACK, UNRESERVED_KEYWORD)
+#ifdef PGXC
+PG_KEYWORD("round", ROUND, UNRESERVED_KEYWORD)
+#endif
 PG_KEYWORD("row", ROW, COL_NAME_KEYWORD)
 PG_KEYWORD("rows", ROWS, UNRESERVED_KEYWORD)
 PG_KEYWORD("rule", RULE, UNRESERVED_KEYWORD)
diff --git a/src/include/parser/parse_utilcmd.h b/src/include/parser/parse_utilcmd.h
index 089c907c0e..319699381d 100644
--- a/src/include/parser/parse_utilcmd.h
+++ b/src/include/parser/parse_utilcmd.h
@@ -6,6 +6,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * $PostgreSQL: pgsql/src/include/parser/parse_utilcmd.h,v 1.4 2009/01/01 17:24:00 momjian Exp $
  *
@@ -24,5 +25,8 @@ extern IndexStmt *transformIndexStmt(IndexStmt *stmt, const char *queryString);
 extern void transformRuleStmt(RuleStmt *stmt, const char *queryString,
 				  List **actions, Node **whereClause);
 extern List *transformCreateSchemaStmt(CreateSchemaStmt *stmt);
+#ifdef PGXC
+extern bool CheckLocalIndexColumn (char loctype, char *partcolname, char *indexcolname);
+#endif
 
 #endif   /* PARSE_UTILCMD_H */
diff --git a/src/include/pgxc/combiner.h b/src/include/pgxc/combiner.h
new file mode 100644
index 0000000000..8c02627b57
--- /dev/null
+++ b/src/include/pgxc/combiner.h
@@ -0,0 +1,63 @@
+/*-------------------------------------------------------------------------
+ *
+ * combiner.h
+ *
+ *	  Combine responses from multiple Data Nodes
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group ?
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * IDENTIFICATION
+ *	  $$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef COMBINER_H
+#define COMBINER_H
+
+#include "postgres.h"
+#include "tcop/dest.h"
+
+typedef enum
+{
+	COMBINE_TYPE_NONE,			/* it is known there is no row count; do not parse for one */
+	COMBINE_TYPE_SUM,			/* sum row counts (partitioned, round robin) */
+	COMBINE_TYPE_AVG			/* calculate average (replicated) */
+}	CombineType;
+
+typedef enum
+{
+	REQUEST_TYPE_NOT_DEFINED,	/* not determined yet */
+	REQUEST_TYPE_COMMAND,		/* OK or row count response */
+	REQUEST_TYPE_QUERY,			/* Row description response */
+	REQUEST_TYPE_COPY_IN,		/* Copy In response */
+	REQUEST_TYPE_COPY_OUT		/* Copy Out response */
+}	RequestType;
+
+/* Bookkeeping used while combining the responses from multiple Data Nodes */
+typedef struct
+{
+	int			node_count;	/* NOTE(review): presumably the number of nodes expected to respond -- confirm */
+	CombineType combine_type;	/* how row counts are combined; see CombineType */
+	CommandDest dest;
+	int			command_complete_count;
+	int			row_count;
+	RequestType request_type;	/* kind of response being handled; see RequestType */
+	int			description_count;
+	List	   *simple_aggregates;	/* NOTE(review): presumably set through AssignCombinerAggregates() -- confirm */
+}	ResponseCombinerData;
+
+
+typedef ResponseCombinerData *ResponseCombiner;
+
+extern ResponseCombiner CreateResponseCombiner(int node_count,
+					   CombineType combine_type, CommandDest dest);
+extern int CombineResponse(ResponseCombiner combiner, char msg_type,
+				char *msg_body, size_t len);
+extern bool ValidateAndCloseCombiner(ResponseCombiner combiner);
+extern bool ValidateAndResetCombiner(ResponseCombiner combiner);
+extern void AssignCombinerAggregates(ResponseCombiner combiner, List *simple_aggregates);
+
+#endif   /* COMBINER_H */
diff --git a/src/include/pgxc/datanode.h b/src/include/pgxc/datanode.h
new file mode 100644
index 0000000000..e140445a28
--- /dev/null
+++ b/src/include/pgxc/datanode.h
@@ -0,0 +1,76 @@
+/*-------------------------------------------------------------------------
+ *
+ * datanode.h
+ *
+ *	  Utility functions to communicate to Data Node
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group ?
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * IDENTIFICATION
+ *	  $$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef DATANODE_H
+#define DATANODE_H
+#include "combiner.h"
+#include "nodes/pg_list.h"
+#include "utils/snapshot.h"
+#include <unistd.h>
+
+/* Connection to data node maintained by Pool Manager */
+typedef struct PGconn NODE_CONNECTION;
+
+/* Helper structure to access data node from Session */
+typedef enum
+{
+	DN_CONNECTION_STATE_IDLE,
+	DN_CONNECTION_STATE_BUSY,
+	DN_CONNECTION_STATE_COMPLETED,
+	DN_CONNECTION_STATE_ERROR
+
+}	DNConnectionState;
+
+struct data_node_handle
+{
+	/* fd of the connection */
+	int			sock;
+	/* Connection state */
+	char		transaction_status;
+	DNConnectionState state;
+	char	   *error;
+	/* Output buffer */
+	char	   *outBuffer;
+	size_t		outSize;
+	size_t		outEnd;
+	/* Input buffer */
+	char	   *inBuffer;
+	size_t		inSize;
+	size_t		inStart;
+	size_t		inEnd;
+	size_t		inCursor;
+};
+typedef struct data_node_handle DataNodeHandle;
+
+extern void InitMultinodeExecutor(void);
+
+/* Open/close connection routines (invoked from Pool Manager) */
+extern char *DataNodeConnStr(char *host, char *port, char *dbname, char *user,
+				char *password);
+extern NODE_CONNECTION *DataNodeConnect(char *connstr);
+extern void DataNodeClose(NODE_CONNECTION * conn);
+extern int	DataNodeConnected(NODE_CONNECTION * conn);
+extern int	DataNodeConnClean(NODE_CONNECTION * conn);
+extern void DataNodeCleanAndRelease(int code, Datum arg);
+
+/* Multinode Executor */
+extern void DataNodeBegin(void);
+extern int	DataNodeCommit(CommandDest dest);
+extern int	DataNodeRollback(CommandDest dest);
+
+extern int	DataNodeExec(const char *query, List *nodelist, CommandDest dest, Snapshot snapshot, bool force_autocommit, List *simple_aggregates, bool is_read_only);
+
+#endif
diff --git a/src/include/pgxc/locator.h b/src/include/pgxc/locator.h
new file mode 100644
index 0000000000..1320b3c6f6
--- /dev/null
+++ b/src/include/pgxc/locator.h
@@ -0,0 +1,66 @@
+/*-------------------------------------------------------------------------
+ *
+ * locator.h
+ *		Externally declared locator functions
+ *
+ *
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LOCATOR_H
+#define LOCATOR_H
+
+#define LOCATOR_TYPE_REPLICATED 'R'
+#define LOCATOR_TYPE_HASH 'H'
+#define LOCATOR_TYPE_RANGE 'G'
+#define LOCATOR_TYPE_SINGLE 'S'
+#define LOCATOR_TYPE_RROBIN 'N'
+#define LOCATOR_TYPE_CUSTOM 'C'
+
+#define HASH_SIZE 4096
+#define HASH_MASK 0x00000FFF	/* HASH_SIZE - 1; no trailing ';' so it works inside expressions */
+
+#include "utils/relcache.h"
+
+
+typedef int PartAttrNumber;
+
+typedef struct
+{
+	Oid			relid;
+	char		locatorType;
+	PartAttrNumber partAttrNum; /* if partitioned */
+	char	   *partAttrName;	/* if partitioned */
+	int			nodeCount;
+	List	   *nodeList;
+	ListCell   *roundRobinNode; /* points to next one to use */
+}	RelationLocInfo;
+
+
+extern char *PreferredDataNodes;
+
+extern void InitRelationLocInfo(void);
+extern char GetLocatorType(Oid relid);
+extern char ConvertToLocatorType(int disttype);
+
+extern char *GetRelationHashColumn(RelationLocInfo * rel_loc_info);
+extern RelationLocInfo *GetRelationLocInfo(Oid relid);
+extern RelationLocInfo *CopyRelationLocInfo(RelationLocInfo * src_info);
+extern List *GetRelationNodes(RelationLocInfo * rel_loc_info, long *partValue,
+				 int isRead);
+extern bool IsHashColumn(RelationLocInfo * rel_loc_info, char *part_col_name);
+extern bool IsHashColumnForRelId(Oid relid, char *part_col_name);
+extern int	GetRoundRobinNode(Oid relid);
+
+extern bool IsHashDistributable(Oid col_type);
+extern List *GetAllNodes(void);
+extern int	GetAnyDataNode(void);
+extern void RelationBuildLocator(Relation rel);
+extern void FreeRelationLocInfo(RelationLocInfo * relationLocInfo);
+
+#endif   /* LOCATOR_H */
diff --git a/src/include/pgxc/pgxc.h b/src/include/pgxc/pgxc.h
new file mode 100644
index 0000000000..09ff2c0ada
--- /dev/null
+++ b/src/include/pgxc/pgxc.h
@@ -0,0 +1,23 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgxc.h
+ *		PG-XC
+ *
+ *
+ * Portions Copyright (c) 1996-2010  PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * IDENTIFICATION
+ *	  $$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifdef PGXC
+
+extern bool isPGXCCoordinator;
+extern bool isPGXCDataNode;
+
+#define IS_PGXC_COORDINATOR isPGXCCoordinator
+#define IS_PGXC_DATANODE isPGXCDataNode
+
+#endif   /* PGXC */
diff --git a/src/include/pgxc/planner.h b/src/include/pgxc/planner.h
new file mode 100644
index 0000000000..eda25a72bb
--- /dev/null
+++ b/src/include/pgxc/planner.h
@@ -0,0 +1,86 @@
+/*-------------------------------------------------------------------------
+ *
+ * planner.h
+ *		Externally declared PGXC planner functions and structures
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group ?
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * IDENTIFICATION
+ *	  $$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PGXCPLANNER_H
+#define PGXCPLANNER_H
+
+/* Query_Plan.exec_loc_type can have these OR'ed */
+#define EXEC_ON_COORD 0x1
+#define EXEC_ON_DATA_NODES 0x2
+
+/* Contains instructions on processing a step of a query.
+ * In the prototype this will be simple, but it will eventually
+ * evolve into a GridSQL-style QueryStep.
+ */
+typedef struct
+{
+	char	   *sql_statement;
+	List	   *nodelist;
+	List	   *simple_aggregates;		/* simple aggregate to combine on this
+										 * step */
+}	Query_Step;
+
+
+/*
+ * The PGXC plan to execute.
+ * In the prototype this will be simple, and query_step_list will
+ * contain just one step.
+ */
+typedef struct
+{
+	int			exec_loc_type;		/* EXEC_ON_* flags, can be OR'ed */
+	bool		force_autocommit;		/* For CREATE DATABASE */
+	List	   *query_step_list;	/* List of Query_Step */
+}	Query_Plan;
+
+
+/* For handling simple aggregates (no group by present)
+ * For now, only MAX will be supported.
+ */
+typedef enum
+{
+	AGG_TYPE_MAX,
+	AGG_TYPE_MIN,
+	AGG_TYPE_COUNT,
+	AGG_TYPE_SUM,
+	AGG_TYPE_AVG
+}	SimpleAggType;
+
+
+/* For handling simple aggregates */
+/* For now, only support int/long types */
+typedef struct
+{
+	int			agg_type;		/* SimpleAggType enum */
+	int			column_pos;		/* Only use 1 for now */
+	unsigned long ulong_value;
+	/* Datum agg_value;  PGXCTODO - use Datum, support more types */
+	int			data_len;
+	int			agg_data_type;
+	int			response_count;
+}	SimpleAgg;
+
+/* forbid SQL if unsafe, useful to turn off for development */
+extern bool StrictStatementChecking;
+
+/* forbid SELECT even multi-node ORDER BY */
+extern bool StrictSelectChecking;
+
+extern Query_Plan *GetQueryPlan(Node *parsetree,
+			 const char *sql_statement,
+			 List *querytree_list);
+extern void FreeQueryPlan(Query_Plan * query_plan);
+extern bool IsHashDistributable(Oid col_type);
+
+#endif   /* PGXCPLANNER_H */
diff --git a/src/include/pgxc/poolcomm.h b/src/include/pgxc/poolcomm.h
new file mode 100644
index 0000000000..3c62f0662e
--- /dev/null
+++ b/src/include/pgxc/poolcomm.h
@@ -0,0 +1,49 @@
+/*-------------------------------------------------------------------------
+ *
+ * poolcomm.h
+ *
+ *	  Definitions for the Pooler-Session communications.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * IDENTIFICATION
+ *	  $$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef POOLCOMM_H
+#define POOLCOMM_H
+
+#include "lib/stringinfo.h"
+
+#define POOL_BUFFER_SIZE 1024
+#define Socket(port) (port).fdsock
+
+typedef struct
+{
+	/* file descriptors */
+	int			fdsock;
+	/* receive buffer */
+	int			RecvLength;
+	int			RecvPointer;
+	char		RecvBuffer[POOL_BUFFER_SIZE];
+	/* send buffer */
+	int			SendPointer;
+	char		SendBuffer[POOL_BUFFER_SIZE];
+}	PoolPort;
+
+extern int	pool_listen(unsigned short port, const char *unixSocketName);
+extern int	pool_connect(unsigned short port, const char *unixSocketName);
+extern int	pool_getbyte(PoolPort * port);
+extern int	pool_pollbyte(PoolPort * port);
+extern int	pool_getmessage(PoolPort * port, StringInfo s, int maxlen);
+extern int	pool_getbytes(PoolPort * port, char *s, size_t len);
+extern int	pool_putmessage(PoolPort * port, char msgtype, const char *s, size_t len);
+extern int	pool_putbytes(PoolPort * port, const char *s, size_t len);
+extern int	pool_flush(PoolPort * port);
+extern int	pool_sendfds(PoolPort * port, int *fds, int count);
+extern int	pool_recvfds(PoolPort * port, int *fds, int count);
+
+#endif   /* POOLCOMM_H */
diff --git a/src/include/pgxc/poolmgr.h b/src/include/pgxc/poolmgr.h
new file mode 100644
index 0000000000..6e88fca3bc
--- /dev/null
+++ b/src/include/pgxc/poolmgr.h
@@ -0,0 +1,130 @@
+/*-------------------------------------------------------------------------
+ *
+ * poolmgr.h
+ *
+ *	  Definitions for the data nodes connection pool.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * IDENTIFICATION
+ *	  $$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef POOLMGR_H
+#define POOLMGR_H
+#include <sys/time.h>
+#include "datanode.h"
+#include "poolcomm.h"
+#include "storage/pmsignal.h"
+
+#define MAX_IDLE_TIME 60
+
+/* TODO move? */
+typedef struct
+{
+	char	   *host;
+	char	   *port;
+	char	   *uname;
+	char	   *password;
+}	DataNodeConnectionInfo;
+
+/* Connection pool entry */
+typedef struct
+{
+	struct timeval released;
+	NODE_CONNECTION *conn;
+}	DataNodePoolSlot;
+
+/* Pool of connections to specified data nodes */
+typedef struct
+{
+	char	   *connstr;
+	int			freeSize;	/* available connections */
+	int			size;  		/* total pool size */
+	DataNodePoolSlot **slot; 
+}	DataNodePool;
+
+/* All pools for specified database */
+typedef struct databasepool
+{
+	Oid			databaseId;
+	char	   *database;
+	DataNodePool **nodePools; /* one for each data node */
+	struct databasepool *next;
+}	DatabasePool;
+
+/* Agent of client session (Pool Manager side) 
+ * Acts as a session manager, grouping connections together
+ */
+typedef struct
+{
+	/* communication channel */
+	PoolPort	port;
+	DatabasePool *pool;
+	DataNodePoolSlot **connections; /* one for each data node */
+}	PoolAgent;
+
+/* Handle to the pool manager (Session's side) */
+typedef struct
+{
+	/* communication channel */
+	PoolPort	port;
+}	PoolHandle;
+
+extern int	NumDataNodes;
+extern int	MinPoolSize;
+extern int	MaxPoolSize;
+extern int	PoolerPort;
+
+extern bool PersistentConnections;
+
+extern char *DataNodeHosts;
+extern char *DataNodePorts;
+extern char *DataNodeUsers;
+extern char *DataNodePwds;
+
+/* Initialize internal structures */
+extern int	PoolManagerInit(void);
+
+/* Destroy internal structures */
+extern int	PoolManagerDestroy(void);
+
+/*
+ * Get handle to pool manager. This function should be called just before
+ * forking off new session. It creates PoolHandle, PoolAgent and a pipe between
+ * them. PoolAgent is stored within Postmaster's memory context and Session
+ * closes it later. PoolHandle is returned and should be stored in a local
+ * variable. After forking off it can be stored in global memory, so it will
+ * only be accessible by the process running the session.
+ */
+extern PoolHandle *GetPoolManagerHandle(void);
+
+/*
+ * Called from Postmaster(Coordinator) after fork. Close one end of the pipe and
+ * free memory occupied by PoolHandle
+ */
+extern void PoolManagerCloseHandle(PoolHandle * handle);
+
+/*
+ * Gracefully close connection to the PoolManager
+ */
+extern void PoolManagerDisconnect(PoolHandle * handle);
+
+/*
+ * Called from Session process after fork(). Associate handle with session
+ * for subsequent calls. Associate session with specified database and
+ * initialize respective connection pool
+ */
+extern void PoolManagerConnect(PoolHandle * handle, const char *database, List *nodes);
+
+/* Get pooled connections */
+extern int *PoolManagerGetConnections(List *nodelist);
+
+/* Return connections back to the pool */
+extern void PoolManagerReleaseConnections(void);
+
+#endif
diff --git a/src/include/postgres.h b/src/include/postgres.h
index c1e4f77386..e8bfd5a391 100644
--- a/src/include/postgres.h
+++ b/src/include/postgres.h
@@ -9,6 +9,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1995, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * $PostgreSQL: pgsql/src/include/postgres.h,v 1.92 2009/01/01 17:23:55 momjian Exp $
  *
@@ -693,4 +694,7 @@ extern int ExceptionalCondition(const char *conditionName,
 					 const char *errorType,
 					 const char *fileName, int lineNumber);
 
+/* #define PGXC_COORD */		/* for PGXC coordinator compiling */
+/* #define PGXC_DATANODE */	/* for PGXC data node compiling */
+
 #endif   /* POSTGRES_H */
diff --git a/src/include/postmaster/autovacuum.h b/src/include/postmaster/autovacuum.h
index 3175487af3..952291bcb0 100644
--- a/src/include/postmaster/autovacuum.h
+++ b/src/include/postmaster/autovacuum.h
@@ -6,6 +6,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * $PostgreSQL: pgsql/src/include/postmaster/autovacuum.h,v 1.15 2009/01/01 17:24:01 momjian Exp $
  *
@@ -60,4 +61,8 @@ extern void AutovacuumLauncherIAm(void);
 extern Size AutoVacuumShmemSize(void);
 extern void AutoVacuumShmemInit(void);
 
+#ifdef PGXC  /* PGXC_DATANODE */
+extern bool IsAutoVacuumWorkerProcess(void);
+#endif
+
 #endif   /* AUTOVACUUM_H */
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index b250d3f0f2..66a920ded0 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -143,8 +143,9 @@ typedef struct PROC_HDR
  * normal operation. Startup process also consumes one slot, but WAL
  * writer and autovacuum launcher are launched only after it has
  * exited.
+ * In Postgres-XC the pool manager process takes one more slot.
  */
-#define NUM_AUXILIARY_PROCS		3
+#define NUM_AUXILIARY_PROCS		4
 
 
 /* configurable options */
diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h
index fab84ee1a0..4431e1bc54 100644
--- a/src/include/storage/procarray.h
+++ b/src/include/storage/procarray.h
@@ -6,6 +6,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * $PostgreSQL: pgsql/src/include/storage/procarray.h,v 1.26 2009/06/11 14:49:12 momjian Exp $
  *
@@ -26,6 +27,10 @@ extern void ProcArrayRemove(PGPROC *proc, TransactionId latestXid);
 extern void ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid);
 extern void ProcArrayClearTransaction(PGPROC *proc);
 
+#ifdef PGXC  /* PGXC_DATANODE */
+extern void SetGlobalSnapshotData(int xmin, int xmax, int xcnt, int *xip);
+extern void UnsetGlobalSnapshotData(void);
+#endif /* PGXC */
 extern Snapshot GetSnapshotData(Snapshot snapshot);
 
 extern bool TransactionIdIsInProgress(TransactionId xid);
diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h
index b50944a547..9c87386288 100644
--- a/src/include/utils/guc_tables.h
+++ b/src/include/utils/guc_tables.h
@@ -76,7 +76,9 @@ enum config_group
 	COMPAT_OPTIONS_CLIENT,
 	PRESET_OPTIONS,
 	CUSTOM_OPTIONS,
-	DEVELOPER_OPTIONS
+	DEVELOPER_OPTIONS,
+	DATA_NODES,
+	GTM
 };
 
 /*
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index ca9913bda3..5f3a482877 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -6,6 +6,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.114 2009/06/11 14:49:13 momjian Exp $
  *
@@ -20,6 +21,9 @@
 #include "catalog/pg_index.h"
 #include "fmgr.h"
 #include "nodes/bitmapset.h"
+#ifdef PGXC
+#include "pgxc/locator.h"
+#endif
 #include "rewrite/prs2lock.h"
 #include "storage/block.h"
 #include "storage/relfilenode.h"
@@ -205,6 +209,9 @@ typedef struct RelationData
 
 	/* use "struct" here to avoid needing to include pgstat.h: */
 	struct PgStat_TableStatus *pgstat_info;		/* statistics collection area */
+#ifdef PGXC
+	RelationLocInfo *rd_locator_info;
+#endif
 } RelationData;
 
 /*
diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h
index e5003b669a..835ba95291 100644
--- a/src/include/utils/snapshot.h
+++ b/src/include/utils/snapshot.h
@@ -5,6 +5,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * $PostgreSQL: pgsql/src/include/utils/snapshot.h,v 1.5 2009/06/11 14:49:13 momjian Exp $
  *
@@ -46,7 +47,11 @@ typedef struct SnapshotData
 	 */
 	TransactionId xmin;			/* all XID < xmin are visible to me */
 	TransactionId xmax;			/* all XID >= xmax are invisible to me */
+	TransactionId recent_global_xmin;
 	uint32		xcnt;			/* # of xact ids in xip[] */
+#ifdef PGXC  /* PGXC_COORD */
+	uint32		max_xcnt;		/* Max # of xact in xip[] */
+#endif
 	TransactionId *xip;			/* array of xact IDs in progress */
 	/* note: all ids in xip[] satisfy xmin <= xip[i] < xmax */
 	int32		subxcnt;		/* # of xact ids in subxip[], -1 if overflow */
diff --git a/src/include/utils/syscache.h b/src/include/utils/syscache.h
index 1428b28d15..e038041519 100644
--- a/src/include/utils/syscache.h
+++ b/src/include/utils/syscache.h
@@ -8,6 +8,7 @@
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
  *
  * $PostgreSQL: pgsql/src/include/utils/syscache.h,v 1.74 2009/01/01 17:24:02 momjian Exp $
  *
@@ -64,6 +65,9 @@ enum SysCacheIdentifier
 	OPEROID,
 	OPFAMILYAMNAMENSP,
 	OPFAMILYOID,
+#ifdef PGXC
+	PGXCCLASSRELID,
+#endif
 	PROCNAMEARGSNSP,
 	PROCOID,
 	RELNAMENSP,
author	Michael P	2010-04-01 01:09:52 +0000
committer	Pavan Deolasee	2011-05-19 16:38:44 +0000
commit	9b1cd1ef2e746b9d68085ecd37eabaa38e2a82f1 (patch)
tree	f220dc274f1d69eb685e822b9079e829525f5d4a
parent	4d53a2f9699547bdc12831d2860c9d44c465e805 (diff)