diff options
author | Michael P | 2010-04-01 01:09:52 +0000 |
---|---|---|
committer | Pavan Deolasee | 2011-05-19 16:38:44 +0000 |
commit | 9b1cd1ef2e746b9d68085ecd37eabaa38e2a82f1 (patch) | |
tree | f220dc274f1d69eb685e822b9079e829525f5d4a | |
parent | 4d53a2f9699547bdc12831d2860c9d44c465e805 (diff) |
Postgres-XC version 0.9
Application of patch PGXC-PG_REL8_4_3.patch.gz
on PostgreSQL version 8.4.3
150 files changed, 33460 insertions, 55 deletions
diff --git a/contrib/Makefile b/contrib/Makefile index e840c8ce6a..f3777962c5 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -39,7 +39,7 @@ WANTED_DIRS = \ tablefunc \ test_parser \ tsearch2 \ - vacuumlo + vacuumlo ifeq ($(with_openssl),yes) WANTED_DIRS += sslinfo diff --git a/src/Makefile b/src/Makefile index 7b00776c4b..02ba3b3926 100644 --- a/src/Makefile +++ b/src/Makefile @@ -16,6 +16,8 @@ include Makefile.global all install installdirs uninstall distprep: $(MAKE) -C port $@ $(MAKE) -C timezone $@ + # GTM should be built before backend because of dependancy + $(MAKE) -C gtm $@ $(MAKE) -C backend $@ $(MAKE) -C backend/utils/mb/conversion_procs $@ $(MAKE) -C backend/snowball $@ @@ -47,6 +49,7 @@ uninstall-local: clean: $(MAKE) -C port $@ $(MAKE) -C timezone $@ + $(MAKE) -C gtm $@ $(MAKE) -C backend $@ $(MAKE) -C backend/snowball $@ $(MAKE) -C include $@ @@ -61,6 +64,7 @@ clean: distclean maintainer-clean: $(MAKE) -C port $@ $(MAKE) -C timezone $@ + $(MAKE) -C gtm $@ $(MAKE) -C backend $@ $(MAKE) -C backend/snowball $@ $(MAKE) -C include $@ diff --git a/src/backend/Makefile b/src/backend/Makefile index 86526d5f1a..4ae230dbd5 100644 --- a/src/backend/Makefile +++ b/src/backend/Makefile @@ -15,8 +15,8 @@ top_builddir = ../.. 
include $(top_builddir)/src/Makefile.global SUBDIRS = access bootstrap catalog parser commands executor foreign lib libpq \ - main nodes optimizer port postmaster regex rewrite \ - storage tcop tsearch utils $(top_builddir)/src/timezone + pgxc main nodes optimizer port postmaster regex rewrite \ + storage tcop tsearch utils $(top_builddir)/src/timezone $(top_builddir)/src/interfaces/libpq include $(srcdir)/common.mk @@ -26,7 +26,19 @@ LOCALOBJS += utils/probes.o endif endif -OBJS = $(SUBDIROBJS) $(LOCALOBJS) $(top_builddir)/src/port/libpgport_srv.a +OBJS = $(SUBDIROBJS) $(LOCALOBJS) \ + $(top_builddir)/src/interfaces/libpq/fe-connect.o \ + $(top_builddir)/src/interfaces/libpq/fe-secure.o \ + $(top_builddir)/src/interfaces/libpq/fe-misc.o \ + $(top_builddir)/src/interfaces/libpq/fe-protocol3.o \ + $(top_builddir)/src/interfaces/libpq/fe-protocol2.o \ + $(top_builddir)/src/interfaces/libpq/fe-exec.o \ + $(top_builddir)/src/interfaces/libpq/fe-auth.o \ + $(top_builddir)/src/interfaces/libpq/pqexpbuffer.o \ + $(top_builddir)/src/port/libpgport_srv.a \ + $(top_builddir)/src/gtm/client/libgtmclient.a \ + $(top_builddir)/src/gtm/common/libgtm.a \ + $(top_builddir)/src/gtm/libpq/libpqcomm.a # We put libpgport into OBJS, so remove it from LIBS; also add libldap LIBS := $(filter-out -lpgport, $(LIBS)) $(LDAP_LIBS_BE) @@ -34,6 +46,8 @@ LIBS := $(filter-out -lpgport, $(LIBS)) $(LDAP_LIBS_BE) # The backend doesn't need everything that's in LIBS, however LIBS := $(filter-out -lz -lreadline -ledit -ltermcap -lncurses -lcurses, $(LIBS)) +# LIBS := $(LIBS) -lpqcomm +# LDFLAGS += -L$(top_builddir)/src/gtm/libpg ########################################################################## all: submake-libpgport postgres $(POSTGRES_IMP) @@ -43,7 +57,7 @@ ifneq ($(PORTNAME), win32) ifneq ($(PORTNAME), aix) postgres: $(OBJS) - $(CC) $(CFLAGS) $(LDFLAGS) $(export_dynamic) $(call expand_subsys,$^) $(LIBS) -o $@ + $(CC) $(CFLAGS) $(LDFLAGS) -L$(top_builddir)/src/gtm/libpg $(export_dynamic) 
$(call expand_subsys,$^) $(LIBS) -o $@ endif endif diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile index 38cfe1a277..fe34e4eaaa 100644 --- a/src/backend/access/transam/Makefile +++ b/src/backend/access/transam/Makefile @@ -12,9 +12,12 @@ subdir = src/backend/access/transam top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = clog.o transam.o varsup.o xact.o xlog.o xlogutils.o rmgr.o slru.o subtrans.o multixact.o twophase.o twophase_rmgr.o +OBJS = clog.o transam.o varsup.o xact.o xlog.o xlogutils.o rmgr.o slru.o subtrans.o multixact.o twophase.o twophase_rmgr.o gtm.o include $(top_srcdir)/src/backend/common.mk # ensure that version checks in xlog.c get recompiled when catversion.h changes xlog.o: xlog.c $(top_srcdir)/src/include/catalog/catversion.h + +libpg-fe.h: + $(LN_S) $(top_builddir)/contrib/gtm/client/libpg-fe.h $(top_srcdir)/src/include/ diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 8544725abb..8dc23f7039 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -25,6 +25,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.53 2009/06/11 14:48:54 momjian Exp $ * @@ -67,6 +68,11 @@ #define GetLSNIndex(slotno, xid) ((slotno) * CLOG_LSNS_PER_PAGE + \ ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP) +#ifdef PGXC +/* Check if there is about a 1 billion XID difference for XID wraparound */ +#define CLOG_WRAP_CHECK_DELTA (2^30 / CLOG_XACTS_PER_PAGE) +#endif + /* * Link to shared-memory data structures for CLOG control @@ -150,6 +156,11 @@ TransactionIdSetTreeStatus(TransactionId xid, int nsubxids, Assert(status == TRANSACTION_STATUS_COMMITTED || status == 
TRANSACTION_STATUS_ABORTED); + if (status == TRANSACTION_STATUS_COMMITTED) + elog(DEBUG1, "Record transaction commit %u", xid); + else + elog(DEBUG1, "Record transaction abort %u", xid); + /* * See how many subxids, if any, are on the same page as the parent, if * any. @@ -565,11 +576,31 @@ ExtendCLOG(TransactionId newestXact) * No work except at first XID of a page. But beware: just after * wraparound, the first XID of page zero is FirstNormalTransactionId. */ +#ifdef PGXC /* PGXC_COORD || PGXC_DATANODE */ + /* + * In PGXC, it may be that a node is not involved in a transaction, + * and therefore will be skipped, so we need to detect this by using + * the latest_page_number instead of the pg index. + * + * Also, there is a special case of when transactions wrap-around that + * we need to detect. + */ + pageno = TransactionIdToPage(newestXact); + + /* + * The first condition makes sure we did not wrap around + * The second checks if we are still using the same page + */ + if (ClogCtl->shared->latest_page_number - pageno <= CLOG_WRAP_CHECK_DELTA + && pageno <= ClogCtl->shared->latest_page_number) + return; +#else if (TransactionIdToPgIndex(newestXact) != 0 && !TransactionIdEquals(newestXact, FirstNormalTransactionId)) return; pageno = TransactionIdToPage(newestXact); +#endif LWLockAcquire(CLogControlLock, LW_EXCLUSIVE); @@ -579,7 +610,6 @@ ExtendCLOG(TransactionId newestXact) LWLockRelease(CLogControlLock); } - /* * Remove all CLOG segments before the one holding the passed transaction ID * diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c new file mode 100644 index 0000000000..2ecc96a4ac --- /dev/null +++ b/src/backend/access/transam/gtm.c @@ -0,0 +1,226 @@ +/*------------------------------------------------------------------------- + * + * gtm.c + * + * Module interfacing with GTM + * + * + *------------------------------------------------------------------------- + */ + +#include <sys/types.h> +#include <unistd.h> + +#include 
"gtm/libpq-fe.h" +#include "gtm/gtm_client.h" +#include "access/gtm.h" +#include "access/transam.h" +#include "utils/elog.h" + +/* Configuration variables */ +char *GtmHost = "localhost"; +int GtmPort = 6666; +int GtmCoordinatorId = 1; + +extern bool FirstSnapshotSet; + +static GTM_Conn *conn; + +#define CheckConnection() \ + if (GTMPQstatus(conn) != CONNECTION_OK) InitGTM() + + +bool IsGTMConnected() +{ + return conn != NULL; +} + +void +InitGTM() +{ + /* 256 bytes should be enough */ + char conn_str[256]; + + sprintf(conn_str, "host=%s port=%d coordinator_id=%d", GtmHost, GtmPort, GtmCoordinatorId); + + conn = PQconnectGTM(conn_str); + if (GTMPQstatus(conn) != CONNECTION_OK) + { + int save_errno = errno; + + ereport(WARNING, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("can not connect to GTM: %m"))); + + errno = save_errno; + + CloseGTM(); + } +} + +void +CloseGTM() +{ + GTMPQfinish(conn); + conn = NULL; +} + +GlobalTransactionId +BeginTranGTM() +{ + GlobalTransactionId xid = InvalidGlobalTransactionId; + + CheckConnection(); + // TODO Isolation level + if (conn) + xid = begin_transaction(conn, GTM_ISOLATION_RC); + + /* If something went wrong (timeout), try and reset GTM connection + * and retry. This is safe at the beginning of a transaction. + */ + if (!TransactionIdIsValid(xid)) + { + CloseGTM(); + InitGTM(); + if (conn) + xid = begin_transaction(conn, GTM_ISOLATION_RC); + } + return xid; +} + +GlobalTransactionId +BeginTranAutovacuumGTM() +{ + GlobalTransactionId xid = InvalidGlobalTransactionId; + + CheckConnection(); + // TODO Isolation level + if (conn) + xid = begin_transaction_autovacuum(conn, GTM_ISOLATION_RC); + + /* If something went wrong (timeout), try and reset GTM connection and retry. + * This is safe at the beginning of a transaction. 
+ */ + if (!TransactionIdIsValid(xid)) + { + CloseGTM(); + InitGTM(); + if (conn) + xid = begin_transaction_autovacuum(conn, GTM_ISOLATION_RC); + } + return xid; +} + +int +CommitTranGTM(GlobalTransactionId gxid) +{ + int ret; + + if (!GlobalTransactionIdIsValid(gxid)) + return 0; + CheckConnection(); + ret = commit_transaction(conn, gxid); + + /* If something went wrong (timeout), try and reset GTM connection. + * We will close the transaction locally anyway, and closing GTM will force + * it to be closed on GTM. + */ + if (ret < 0) + { + CloseGTM(); + InitGTM(); + } + return ret; +} + +int +RollbackTranGTM(GlobalTransactionId gxid) +{ + int ret; + + if (!GlobalTransactionIdIsValid(gxid)) + return 0; + CheckConnection(); + ret = abort_transaction(conn, gxid); + + /* If something went wrong (timeout), try and reset GTM connection. + * We will abort the transaction locally anyway, and closing GTM will force + * it to end on GTM. + */ + if (ret < 0) + { + CloseGTM(); + InitGTM(); + } + return ret; +} + +GTM_Snapshot +GetSnapshotGTM(GlobalTransactionId gxid, bool canbe_grouped) +{ + GTM_Snapshot ret_snapshot = NULL; + CheckConnection(); + if (conn) + ret_snapshot = get_snapshot(conn, gxid, canbe_grouped); + if (ret_snapshot == NULL) + { + CloseGTM(); + InitGTM(); + } + return ret_snapshot; +} + + +/** + * Create a sequence on the GTM. + * + * + */ +int CreateSequenceGTM(char *seqname, GTM_Sequence increment, GTM_Sequence minval, + GTM_Sequence maxval, GTM_Sequence startval, bool cycle) +{ + GTM_SequenceKeyData seqkey; + CheckConnection(); + seqkey.gsk_keylen = strlen(seqname); + seqkey.gsk_key = seqname; + + return conn ? 
open_sequence(conn, &seqkey, increment, minval, maxval, startval, cycle) : 0; +} + +/** + * Get the next sequence value + */ +GTM_Sequence +GetNextValGTM(char *seqname) +{ + GTM_Sequence ret = -1; + GTM_SequenceKeyData seqkey; + CheckConnection(); + seqkey.gsk_keylen = strlen(seqname); + seqkey.gsk_key = seqname; + + if (conn) + ret = get_next(conn, &seqkey); + if (ret < 0) + { + CloseGTM(); + InitGTM(); + } + return ret; +} + +/** + * Drop the sequence + */ +int +DropSequenceGTM(char *seqname) +{ + GTM_SequenceKeyData seqkey; + CheckConnection(); + seqkey.gsk_keylen = strlen(seqname); + seqkey.gsk_key = seqname; + + return conn ? close_sequence(conn, &seqkey) : -1; +} + + diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index 9c74e995db..2695085be3 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -21,6 +21,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * $PostgreSQL: pgsql/src/backend/access/transam/subtrans.c,v 1.24 2009/01/01 17:23:36 momjian Exp $ * @@ -34,6 +35,10 @@ #include "pg_trace.h" #include "utils/snapmgr.h" +#ifdef PGXC +/* Check if there is about a 1 billion XID difference for XID wraparound */ +#define SUBTRANS_WRAP_CHECK_DELTA (2^30 / SUBTRANS_XACTS_PER_PAGE) +#endif /* * Defines for SubTrans page sizes. A page is the same BLCKSZ as is used @@ -307,11 +312,31 @@ ExtendSUBTRANS(TransactionId newestXact) * No work except at first XID of a page. But beware: just after * wraparound, the first XID of page zero is FirstNormalTransactionId. */ +#ifdef PGXC /* PGXC_COORD || PGXC_DATANODE */ + /* + * In PGXC, it may be that a node is not involved in a transaction, + * and therefore will be skipped, so we need to detect this by using + * the latest_page_number instead of the pg index. 
+ * + * Also, there is a special case of when transactions wrap-around that + * we need to detect. + */ + pageno = TransactionIdToPage(newestXact); + + /* + * The first condition makes sure we did not wrap around + * The second checks if we are still using the same page + */ + if (SubTransCtl->shared->latest_page_number - pageno <= SUBTRANS_WRAP_CHECK_DELTA + && pageno <= SubTransCtl->shared->latest_page_number) + return; +#else if (TransactionIdToEntry(newestXact) != 0 && !TransactionIdEquals(newestXact, FirstNormalTransactionId)) return; pageno = TransactionIdToPage(newestXact); +#endif LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE); diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 6de9c73f6e..4b9071f947 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -5,6 +5,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * IDENTIFICATION * $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.54 2009/06/25 19:05:52 heikki Exp $ @@ -68,7 +69,11 @@ #define TWOPHASE_DIR "pg_twophase" /* GUC variable, can't be changed after startup */ +#ifdef PGXC +int max_prepared_xacts = 10; /* We require 2PC */ +#else int max_prepared_xacts = 0; +#endif /* * This struct describes one global transaction that is in prepared state diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 029b2f2deb..4de1080544 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -4,6 +4,7 @@ * postgres OID & XID variables support routines * * Copyright (c) 2000-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * IDENTIFICATION * $PostgreSQL: pgsql/src/backend/access/transam/varsup.c,v 1.84 
2009/04/23 00:23:45 tgl Exp $ @@ -21,6 +22,10 @@ #include "storage/pmsignal.h" #include "storage/proc.h" #include "utils/builtins.h" +#ifdef PGXC +#include "pgxc/pgxc.h" +#include "access/gtm.h" +#endif /* Number of OIDs to prefetch (preallocate) per XLOG write */ @@ -29,6 +34,40 @@ /* pointer to "variable cache" in shared memory (set up by shmem.c) */ VariableCache ShmemVariableCache = NULL; +#ifdef PGXC /* PGXC_DATANODE */ +static TransactionId next_xid = InvalidTransactionId; +static bool force_get_xid_from_gtm = false; + +/* + * Set next transaction id to use + */ +void +SetNextTransactionId(TransactionId xid) +{ + elog (DEBUG1, "[re]setting xid = %d, old_value = %d", xid, next_xid); + next_xid = xid; +} + +/* + * Allow force of getting XID from GTM + * Useful for explicit VACUUM (autovacuum already handled) + */ +void +SetForceXidFromGTM(bool value) +{ + force_get_xid_from_gtm = value; +} + +/* + * See if we should force using GTM + * Useful for explicit VACUUM (autovacuum already handled) + */ +bool +GetForceXidFromGTM(void) +{ + return force_get_xid_from_gtm; +} +#endif /* PGXC */ /* * Allocate the next XID for my new transaction or subtransaction. @@ -39,6 +78,9 @@ TransactionId GetNewTransactionId(bool isSubXact) { TransactionId xid; +#ifdef PGXC + bool increment_xid = true; +#endif /* * During bootstrap initialization, we return the special bootstrap @@ -51,9 +93,100 @@ GetNewTransactionId(bool isSubXact) return BootstrapTransactionId; } +#ifdef PGXC + if (IS_PGXC_COORDINATOR) + { + /* Get XID from GTM before acquiring the lock. + * The rest of the code will handle it if after obtaining XIDs, + * the lock is acquired in a different order. + * This will help with GTM connection issues- we will not + * block all other processes. 
+ */ + xid = (TransactionId) BeginTranGTM(); + } +#endif + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); +#ifdef PGXC + if (IS_PGXC_COORDINATOR) + { + if (TransactionIdIsValid(xid)) + { + if (!TransactionIdFollowsOrEquals(xid, ShmemVariableCache->nextXid)) + { + increment_xid = false; + ereport(DEBUG1, + (errmsg("xid (%d) was less than ShmemVariableCache->nextXid (%d)", + xid, ShmemVariableCache->nextXid))); + } + else + ShmemVariableCache->nextXid = xid; + } + else + { + ereport(WARNING, + (errmsg("Xid is invalid."))); + + /* Problem is already reported, so just remove lock and return */ + LWLockRelease(XidGenLock); + return xid; + } + } else if(IS_PGXC_DATANODE) + { + if (IsAutoVacuumWorkerProcess()) + { + if (MyProc->vacuumFlags & PROC_IN_VACUUM) + { + elog (DEBUG1, "Getting XID for autovacuum"); + /* Try and get gxid directly from GTM. + * We use a different function so that GTM knows to + * exclude it from other snapshots. + */ + next_xid = (TransactionId) BeginTranAutovacuumGTM(); + } else { + elog (DEBUG1, "Getting XID for autovacuum worker (analyze)"); + /* try and get gxid directly from GTM */ + next_xid = (TransactionId) BeginTranGTM(); + } + } else if (GetForceXidFromGTM()) + { + elog (DEBUG1, "Force get XID from GTM"); + /* try and get gxid directly from GTM */ + next_xid = (TransactionId) BeginTranGTM(); + } + + if (TransactionIdIsValid(next_xid)) + { + xid = next_xid; + elog(DEBUG1, "TransactionId = %d", next_xid); + next_xid = InvalidTransactionId; /* reset */ + if (!TransactionIdFollowsOrEquals(xid, ShmemVariableCache->nextXid)) + { + /* This should be ok, due to concurrency from multiple coords + * passing down the xids. + * We later do not want to bother incrementing the value + * in shared memory though. 
+ */ + increment_xid = false; + elog(DEBUG1, "xid (%d) does not follow ShmemVariableCache->nextXid (%d)", + xid, ShmemVariableCache->nextXid); + } else + ShmemVariableCache->nextXid = xid; + } + else + { + /* Fallback to default */ + elog(LOG, "Falling back to local Xid. Was = %d, now is = %d", + next_xid, ShmemVariableCache->nextXid); + xid = ShmemVariableCache->nextXid; + + } + } +#else xid = ShmemVariableCache->nextXid; +#endif /* PGXC */ + /*---------- * Check to see if it's safe to assign another XID. This protects against @@ -98,7 +231,6 @@ GetNewTransactionId(bool isSubXact) "You might also need to commit or roll back old prepared transactions.", NameStr(ShmemVariableCache->limit_datname)))); } - /* * If we are allocating the first XID of a new page of the commit log, * zero out that commit-log page before returning. We must do this while @@ -117,7 +249,13 @@ GetNewTransactionId(bool isSubXact) * want the next incoming transaction to try it again. We cannot assign * more XIDs until there is CLOG space for them. */ - TransactionIdAdvance(ShmemVariableCache->nextXid); +#ifdef PGXC /* defined(PGXC_COORD) || defined(PGXC_DATANODE) */ + /* We may not be at the max, which is ok. Do not bother to increment. + * We get this externally anyway, so it should not be needed in theory... 
+ */ + if (increment_xid) +#endif + TransactionIdAdvance(ShmemVariableCache->nextXid); /* * We must store the new XID into the shared ProcArray before releasing @@ -177,7 +315,6 @@ GetNewTransactionId(bool isSubXact) } LWLockRelease(XidGenLock); - return xid; } diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 2b6a222477..9ab3c70430 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -7,6 +7,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * * IDENTIFICATION @@ -20,6 +21,15 @@ #include <time.h> #include <unistd.h> +#ifdef PGXC +#include "pgxc/pgxc.h" +#include "access/gtm.h" +/* PGXC_COORD */ +#include "gtm/gtm_c.h" +#include "pgxc/datanode.h" +/* PGXC_DATANODE */ +#include "postmaster/autovacuum.h" +#endif #include "access/multixact.h" #include "access/subtrans.h" #include "access/transam.h" @@ -51,7 +61,6 @@ #include "utils/snapmgr.h" #include "pg_trace.h" - /* * User-tweakable parameters */ @@ -125,6 +134,9 @@ typedef enum TBlockState typedef struct TransactionStateData { TransactionId transactionId; /* my XID, or Invalid if none */ +#ifdef PGXC /* PGXC_COORD */ + GlobalTransactionId globalTransactionId; /* my GXID, or Invalid if none */ +#endif SubTransactionId subTransactionId; /* my subxact ID */ char *name; /* savepoint name, if any */ int savepointLevel; /* savepoint level */ @@ -152,6 +164,9 @@ typedef TransactionStateData *TransactionState; */ static TransactionStateData TopTransactionStateData = { 0, /* transaction id */ +#ifdef PGXC + 0, /* global transaction id */ +#endif 0, /* subtransaction id */ NULL, /* savepoint name */ 0, /* savepoint level */ @@ -274,6 +289,43 @@ static void ShowTransactionStateRec(TransactionState state); static const char *BlockStateAsString(TBlockState blockState); static 
const char *TransStateAsString(TransState state); +#ifdef PGXC /* PGXC_COORD */ +static GlobalTransactionId GetGlobalTransactionId(TransactionState s); + +/* ---------------------------------------------------------------- + * PG-XC Functions + * ---------------------------------------------------------------- + */ + +/* + * GetCurrentGlobalTransactionId + * + * This will return the GXID of the current transaction, + * getting one from the GTM if it's not yet set. Be careful to call this + * only inside a valid xact. + */ +GlobalTransactionId +GetCurrentGlobalTransactionId(void) +{ + return GetGlobalTransactionId(CurrentTransactionState); +} + +/* + * GetGlobalTransactionId + * + * This will return the GXID of the specified transaction, + * getting one from the GTM if it's not yet set. + */ +static GlobalTransactionId +GetGlobalTransactionId(TransactionState s) +{ + if (!GlobalTransactionIdIsValid(s->globalTransactionId)) + s->globalTransactionId = (GlobalTransactionId) GetNewTransactionId(s->parent != NULL); + + return s->globalTransactionId; +} +#endif /* PGXC */ + /* ---------------------------------------------------------------- * transaction state accessors @@ -364,6 +416,7 @@ GetCurrentTransactionId(void) return s->transactionId; } + /* * GetCurrentTransactionIdIfAny * @@ -412,6 +465,15 @@ AssignTransactionId(TransactionState s) * PG_PROC, the subtrans entry is needed to ensure that other backends see * the Xid as "running". See GetNewTransactionId. */ +#ifdef PGXC /* PGXC_COORD */ + if (IS_PGXC_COORDINATOR) + { + s->transactionId = (TransactionId) GetGlobalTransactionId(s); + elog(DEBUG1, "New transaction id assigned = %d, isSubXact = %s", + s->transactionId, isSubXact ? 
"true" : "false"); + } + else +#endif s->transactionId = GetNewTransactionId(isSubXact); if (isSubXact) @@ -1458,8 +1520,11 @@ StartTransaction(void) * start processing */ s->state = TRANS_START; +#ifdef PGXC /* PGXC_COORD */ + if (IS_PGXC_COORDINATOR) + s->globalTransactionId = InvalidGlobalTransactionId; /* until assigned */ +#endif s->transactionId = InvalidTransactionId; /* until assigned */ - /* * Make sure we've reset xact state variables */ @@ -1629,7 +1694,24 @@ CommitTransaction(void) latestXid = RecordTransactionCommit(); TRACE_POSTGRESQL_TRANSACTION_COMMIT(MyProc->lxid); - +#ifdef PGXC + if (IS_PGXC_COORDINATOR) + { + /* Make sure this committed on the DataNodes, + * if so it will just return + */ + DataNodeCommit(DestNone); + CommitTranGTM(s->globalTransactionId); + } + else if (IS_PGXC_DATANODE) + { + /* If we are autovacuum, commit on GTM */ + if ((IsAutoVacuumWorkerProcess() || GetForceXidFromGTM()) + && IsGTMConnected()) + CommitTranGTM((GlobalTransactionId) latestXid); + } +#endif + /* * Let others know about no transaction in progress by me. 
Note that this * must be done _before_ releasing locks we hold and _after_ @@ -1725,6 +1807,13 @@ CommitTransaction(void) s->nChildXids = 0; s->maxChildXids = 0; +#ifdef PGXC + if (IS_PGXC_COORDINATOR) + s->globalTransactionId = InvalidGlobalTransactionId; + else if (IS_PGXC_DATANODE) + SetNextTransactionId(InvalidTransactionId); +#endif + /* * done with commit processing, set current transaction state back to * default @@ -1959,6 +2048,10 @@ PrepareTransaction(void) s->nChildXids = 0; s->maxChildXids = 0; +#ifdef PGXC /* PGXC_DATANODE */ + if (IS_PGXC_DATANODE) + SetNextTransactionId(InvalidTransactionId); +#endif /* * done with 1st phase commit processing, set current transaction state * back to default @@ -2045,7 +2138,23 @@ AbortTransaction(void) latestXid = RecordTransactionAbort(false); TRACE_POSTGRESQL_TRANSACTION_ABORT(MyProc->lxid); - +#ifdef PGXC + if (IS_PGXC_COORDINATOR) + { + /* Make sure this is rolled back on the DataNodes, + * if so it will just return + */ + DataNodeRollback(DestNone); + RollbackTranGTM(s->globalTransactionId); + } + else if (IS_PGXC_DATANODE) + { + /* If we are autovacuum, commit on GTM */ + if ((IsAutoVacuumWorkerProcess() || GetForceXidFromGTM()) + && IsGTMConnected()) + RollbackTranGTM((GlobalTransactionId) latestXid); + } +#endif /* * Let others know about no transaction in progress by me. Note that this * must be done _before_ releasing locks we hold and _after_ @@ -2130,6 +2239,13 @@ CleanupTransaction(void) s->nChildXids = 0; s->maxChildXids = 0; +#ifdef PGXC /* PGXC_DATANODE */ + if (IS_PGXC_COORDINATOR) + s->globalTransactionId = InvalidGlobalTransactionId; + else if (IS_PGXC_DATANODE) + SetNextTransactionId(InvalidTransactionId); +#endif + /* * done with abort processing, set current transaction state back to * default @@ -4004,6 +4120,10 @@ PushTransaction(void) * We can now stack a minimally valid subtransaction without fear of * failure. 
*/ +#ifdef PGXC /* PGXC_COORD */ + if (IS_PGXC_COORDINATOR) + s->globalTransactionId = InvalidGlobalTransactionId; +#endif s->transactionId = InvalidTransactionId; /* until assigned */ s->subTransactionId = currentSubTransactionId; s->parent = p; diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 5a0f852b6f..969d6f566c 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -6,6 +6,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * IDENTIFICATION * $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.250 2009/02/18 15:58:41 heikki Exp $ @@ -42,6 +43,10 @@ #include "utils/ps_status.h" #include "utils/tqual.h" +#ifdef PGXC +#include "pgxc/poolmgr.h" +#endif + extern int optind; extern char *optarg; @@ -329,6 +334,11 @@ AuxiliaryProcessMain(int argc, char *argv[]) switch (auxType) { +#ifdef PGXC /* PGXC_COORD */ + case PoolerProcess: + statmsg = "pooler process"; + break; +#endif case StartupProcess: statmsg = "startup process"; break; @@ -402,6 +412,13 @@ AuxiliaryProcessMain(int argc, char *argv[]) switch (auxType) { +#ifdef PGXC /* PGXC_COORD */ + case PoolerProcess: + /* don't set signals, pool manager has its own agenda */ + PoolManagerInit(); + proc_exit(1); /* should never return */ +#endif + case CheckerProcess: bootstrap_signals(); CheckerModeMain(); diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile index ed06048894..2693b426b1 100644 --- a/src/backend/catalog/Makefile +++ b/src/backend/catalog/Makefile @@ -12,8 +12,8 @@ include $(top_builddir)/src/Makefile.global OBJS = catalog.o dependency.o heap.o index.o indexing.o namespace.o aclchk.o \ pg_aggregate.o pg_constraint.o pg_conversion.o pg_depend.o pg_enum.o \ - pg_inherits.o pg_largeobject.o pg_namespace.o pg_operator.o pg_proc.o \ - 
pg_shdepend.o pg_type.o storage.o toasting.o + pg_inherits.o pg_largeobject.o pg_namespace.o pg_operator.o pg_proc.o pg_shdepend.o \ + pg_type.o pgxc_class.o storage.o toasting.o BKIFILES = postgres.bki postgres.description postgres.shdescription @@ -37,6 +37,7 @@ POSTGRES_BKI_SRCS = $(addprefix $(top_srcdir)/src/include/catalog/,\ pg_ts_config.h pg_ts_config_map.h pg_ts_dict.h \ pg_ts_parser.h pg_ts_template.h \ pg_foreign_data_wrapper.h pg_foreign_server.h pg_user_mapping.h \ + pgxc_class.h \ toasting.h indexing.h \ ) diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index 8181cae64a..2932bffd1d 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -6,6 +6,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * IDENTIFICATION * $PostgreSQL: pgsql/src/backend/catalog/dependency.c,v 1.89 2009/06/11 14:48:54 momjian Exp $ @@ -50,6 +51,9 @@ #include "catalog/pg_ts_template.h" #include "catalog/pg_type.h" #include "catalog/pg_user_mapping.h" +#ifdef PGXC +#include "catalog/pgxc_class.h" +#endif #include "commands/comment.h" #include "commands/dbcommands.h" #include "commands/defrem.h" @@ -144,6 +148,9 @@ static const Oid object_classes[MAX_OCLASS] = { AuthIdRelationId, /* OCLASS_ROLE */ DatabaseRelationId, /* OCLASS_DATABASE */ TableSpaceRelationId /* OCLASS_TBLSPACE */ +#ifdef PGXC + ,PgxcClassRelationId /* OCLASS_PGXCCLASS */ +#endif }; diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index f4cf829b46..4f14113c3b 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -5,6 +5,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone 
Corporation * * * IDENTIFICATION @@ -68,6 +69,11 @@ #include "utils/syscache.h" #include "utils/tqual.h" +#ifdef PGXC +#include "catalog/pgxc_class.h" +#include "pgxc/locator.h" +#endif + static void AddNewRelationTuple(Relation pg_class_desc, Relation new_rel_desc, @@ -775,6 +781,141 @@ AddNewRelationTuple(Relation pg_class_desc, InsertPgClassTuple(pg_class_desc, new_rel_desc, new_rel_oid, reloptions); } +#ifdef PGXC +/* -------------------------------- + * AddRelationDistribution + * + * Add to pgxc_class table + * -------------------------------- + */ +void +AddRelationDistribution (Oid relid, + DistributeBy *distributeby, + List *parentOids, + TupleDesc descriptor) +{ + char locatortype = '\0'; + int hashalgorithm = 0; + int hashbuckets = 0; + AttrNumber attnum = 0; + + + if (!distributeby) + { + /* + * No distribution specified. + * See if we are a child table, and get distribution information + * from there. + */ + if (list_length(parentOids) > 1) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Cannot currently distribute a table with more than one parent."))); + } + else if (list_length(parentOids) == 1) + { + /* + * Use parent's distribution + */ + int parentOid; + RelationLocInfo *rel_loc_info; + + parentOid = linitial_oid(parentOids); + rel_loc_info = GetRelationLocInfo(parentOid); + locatortype = rel_loc_info->locatorType; + + switch (locatortype) + { + case LOCATOR_TYPE_HASH: + attnum = rel_loc_info->partAttrNum; + break; + + case LOCATOR_TYPE_REPLICATED: + case LOCATOR_TYPE_RROBIN: + break; + + default: + ereport(ERROR, + (errcode(ERRCODE_INVALID_TABLE_DEFINITION), + errmsg("Invalid parent table distribution type"))); + break; + } + } else + { + /* + * If no distribution was specified, and we have not chosen + * one based on primary key or foreign key, use first column with + * a supported data type. 
+ */ + Form_pg_attribute attr; + int i; + + locatortype = LOCATOR_TYPE_HASH; + + for (i = 0; i < descriptor->natts; i++) + { + attr = descriptor->attrs[i]; + if (IsHashDistributable(attr->atttypid)) + { + /* distribute on this column */ + attnum = i + 1; + break; + } + } + + /* If we did not find a usable type, fall back to round robin */ + if (attnum == 0) + locatortype = LOCATOR_TYPE_RROBIN; + } + } else + { + /* + * User specified distribution type + */ + switch (distributeby->disttype) + { + case DISTTYPE_HASH: + /* User specified hash column, validate */ + attnum = get_attnum(relid, distributeby->colname); + + if (!IsHashDistributable(descriptor->attrs[attnum-1]->atttypid)) + { + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("Column %s is not a hash distributable data type", + distributeby->colname))); + } + locatortype = LOCATOR_TYPE_HASH; + break; + + case DISTTYPE_REPLICATION: + locatortype = LOCATOR_TYPE_REPLICATED; + break; + + case DISTTYPE_ROUNDROBIN: + locatortype = LOCATOR_TYPE_RROBIN; + break; + + default: + ereport(ERROR, + (errcode(ERRCODE_INVALID_TABLE_DEFINITION), + errmsg("Invalid distribution type"))); + } + } + + if (locatortype == LOCATOR_TYPE_HASH) + { + /* PGXCTODO */ + /* Use these for now until we make allowing different algorithms more flexible */ + hashalgorithm = 1; + hashbuckets = HASH_SIZE; + } + + PgxcClassCreate (relid, locatortype, attnum, hashalgorithm, hashbuckets); +} +#endif + /* -------------------------------- * AddNewRelationType - diff --git a/src/backend/catalog/pgxc_class.c b/src/backend/catalog/pgxc_class.c new file mode 100644 index 0000000000..a77f242357 --- /dev/null +++ b/src/backend/catalog/pgxc_class.c @@ -0,0 +1,105 @@ +/*------------------------------------------------------------------------- + * + * pgxc_class.c + * routines to support manipulation of the pgxc_class relation + * + * Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 2010 Nippon Telegraph 
and Telephone Corporation + * + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam.h" +#include "catalog/dependency.h" +#include "catalog/indexing.h" +#include "catalog/namespace.h" +#include "catalog/pgxc_class.h" +#include "utils/builtins.h" +#include "utils/rel.h" +#include "utils/syscache.h" +#include "pgxc/locator.h" + +void PgxcClassCreate(Oid pcrelid, + char pclocatortype, + int pcattnum, + int pchashalgorithm, + int pchashbuckets) +{ + Relation pgxcclassrel; + HeapTuple htup; + bool nulls[Natts_pgxc_class]; + Datum values[Natts_pgxc_class]; + int i; + + /* Iterate through edb_linkauth attributes initializing nulls and values */ + for (i = 0; i < Natts_pgxc_class; i++) + { + nulls[i] = false; + values[i] = (Datum) 0; + } + + /* should not happen */ + if(pcrelid == InvalidOid) + { + elog(ERROR,"pgxc class relid invalid."); + return; + } + + values[Anum_pgxc_class_pcrelid - 1] = ObjectIdGetDatum(pcrelid); + values[Anum_pgxc_class_pclocatortype - 1] = ObjectIdGetDatum(pclocatortype); + + if (pclocatortype == LOCATOR_TYPE_HASH) + { + values[Anum_pgxc_class_pcattnum - 1] = ObjectIdGetDatum(pcattnum); + values[Anum_pgxc_class_pchashalgorithm - 1] = ObjectIdGetDatum(pchashalgorithm); + values[Anum_pgxc_class_pchashbuckets - 1] = ObjectIdGetDatum(pchashbuckets); + } + + /* Open the edb_linkauth relation for insertion */ + pgxcclassrel = heap_open(PgxcClassRelationId, RowExclusiveLock); + + htup = heap_form_tuple(pgxcclassrel->rd_att, values, nulls); + + (void) simple_heap_insert(pgxcclassrel, htup); + + CatalogUpdateIndexes(pgxcclassrel, htup); + + heap_close(pgxcclassrel, RowExclusiveLock); +} + +#ifdef PGXC +/* + * RemovePGXCClass(): + * + * Remove extended PGXC information + * + * arg1: Oid of the relation. + * + */ +void RemovePgxcClass(Oid pcrelid) +{ + Relation relation; + HeapTuple tup; + + /* + * Delete the pgxc_class tuple. 
+ */ + relation = heap_open(PgxcClassRelationId, RowExclusiveLock); + tup = SearchSysCache(PGXCCLASSRELID, + ObjectIdGetDatum(pcrelid), + 0, 0, 0); + + if (!HeapTupleIsValid(tup)) /* should not happen */ + elog(ERROR, "cache lookup failed for pgxc_class %u", pcrelid); + + simple_heap_delete(relation, &tup->t_self); + + ReleaseSysCache(tup); + + heap_close(relation, RowExclusiveLock); +} +#endif /* PGXC */ + + diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index c6a01f5b75..e0005905ba 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -5,6 +5,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * * IDENTIFICATION @@ -37,6 +38,10 @@ #include "parser/parse_coerce.h" #include "parser/parse_func.h" #include "parser/parsetree.h" +#ifdef PGXC +#include "parser/parse_utilcmd.h" +#include "pgxc/pgxc.h" +#endif #include "storage/lmgr.h" #include "storage/proc.h" #include "storage/procarray.h" @@ -404,6 +409,30 @@ DefineIndex(RangeVar *heapRelation, (void) index_reloptions(amoptions, reloptions, true); +#ifdef PGXC + /* Make sure we can locally enforce the index */ + if (IS_PGXC_COORDINATOR && (primary || unique)) + { + ListCell *elem; + bool isSafe = false; + + foreach(elem, attributeList) + { + IndexElem *key = (IndexElem *) lfirst(elem); + + if (CheckLocalIndexColumn(rel->rd_locator_info->locatorType, + rel->rd_locator_info->partAttrName, key->name)) + { + isSafe = true; + break; + } + } + if (!isSafe) + ereport(ERROR, + (errcode(ERRCODE_INVALID_COLUMN_REFERENCE), + errmsg("Unique index of partitioned table must contain the hash distribution column."))); + } +#endif /* * Prepare arguments for index_create, primarily an IndexInfo structure. * Note that ii_Predicate must be in implicit-AND format. 
diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index e6c75ab014..a187afa8f2 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -5,6 +5,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * * IDENTIFICATION @@ -35,6 +36,12 @@ #include "utils/resowner.h" #include "utils/syscache.h" +#ifdef PGXC +#include "pgxc/pgxc.h" +/* PGXC_COORD */ +#include "gtm/gtm_c.h" +#include "access/gtm.h" +#endif /* * We don't want to log each fetching of a value from a sequence, @@ -117,6 +124,13 @@ DefineSequence(CreateSeqStmt *seq) bool null[SEQ_COL_LASTCOL]; int i; NameData name; +#ifdef PGXC /* PGXC_COORD */ + GTM_Sequence start_value = 1; + GTM_Sequence min_value = 1; + GTM_Sequence max_value = InvalidSequenceValue; + GTM_Sequence increment = 1; + bool cycle = false; +#endif /* Check and set all option values */ init_params(seq->options, true, &new, &owned_by); @@ -155,21 +169,33 @@ DefineSequence(CreateSeqStmt *seq) coldef->typename = makeTypeNameFromOid(INT8OID, -1); coldef->colname = "start_value"; value[i - 1] = Int64GetDatumFast(new.start_value); +#ifdef PGXC /* PGXC_COORD */ + start_value = new.start_value; +#endif break; case SEQ_COL_INCBY: coldef->typename = makeTypeNameFromOid(INT8OID, -1); coldef->colname = "increment_by"; value[i - 1] = Int64GetDatumFast(new.increment_by); +#ifdef PGXC /* PGXC_COORD */ + increment = new.increment_by; +#endif break; case SEQ_COL_MAXVALUE: coldef->typename = makeTypeNameFromOid(INT8OID, -1); coldef->colname = "max_value"; value[i - 1] = Int64GetDatumFast(new.max_value); +#ifdef PGXC /* PGXC_COORD */ + max_value = new.max_value; +#endif break; case SEQ_COL_MINVALUE: coldef->typename = makeTypeNameFromOid(INT8OID, -1); coldef->colname = "min_value"; value[i - 1] = Int64GetDatumFast(new.min_value); +#ifdef 
PGXC /* PGXC_COORD */ + min_value = new.min_value; +#endif break; case SEQ_COL_CACHE: coldef->typename = makeTypeNameFromOid(INT8OID, -1); @@ -185,6 +211,9 @@ DefineSequence(CreateSeqStmt *seq) coldef->typename = makeTypeNameFromOid(BOOLOID, -1); coldef->colname = "is_cycled"; value[i - 1] = BoolGetDatum(new.is_cycled); +#ifdef PGXC /* PGXC_COORD */ + cycle = new.is_cycled; +#endif break; case SEQ_COL_CALLED: coldef->typename = makeTypeNameFromOid(BOOLOID, -1); @@ -308,6 +337,20 @@ DefineSequence(CreateSeqStmt *seq) process_owned_by(rel, owned_by); heap_close(rel, NoLock); + +#ifdef PGXC /* PGXC_COORD */ + if (IS_PGXC_COORDINATOR) + { + /* We also need to create it on the GTM */ + if (CreateSequenceGTM(name.data, increment, min_value, max_value, + start_value, cycle) < 0) + { + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("GTM error, could not create sequence"))); + } + } +#endif } /* @@ -481,6 +524,20 @@ nextval_internal(Oid relid) seq = read_info(elm, seqrel, &buf); page = BufferGetPage(buf); +#ifdef PGXC /* PGXC_COORD */ + if (IS_PGXC_COORDINATOR) + { + /* Above, we still use the page as a locking mechanism to handle + * concurrency + */ + result = (int64) GetNextValGTM(RelationGetRelationName(seqrel)); + if (result < 0) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("GTM error, could not obtain sequence value"))); + } else + { +#endif last = next = result = seq->last_value; incby = seq->increment_by; maxv = seq->max_value; @@ -636,7 +693,9 @@ nextval_internal(Oid relid) seq->log_cnt = log; /* how much is logged */ END_CRIT_SECTION(); - +#ifdef PGXC /* PGXC_COORD */ + } +#endif UnlockReleaseBuffer(buf); relation_close(seqrel, NoLock); diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index f22e7be5d8..3372883714 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -5,6 +5,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions 
Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * * IDENTIFICATION @@ -76,6 +77,10 @@ #include "utils/syscache.h" #include "utils/tqual.h" +#ifdef PGXC +#include "pgxc/pgxc.h" +#include "access/gtm.h" +#endif /* * ON COMMIT action list @@ -523,6 +528,18 @@ DefineRelation(CreateStmt *stmt, char relkind) */ CommandCounterIncrement(); +#ifdef PGXC + /* + * Add to pgxc_class. + * we need to do this after CommandCounterIncrement + */ + if (IS_PGXC_COORDINATOR && relkind == RELKIND_RELATION) + { + AddRelationDistribution (relationId, stmt->distributeby, inheritOids, descriptor); + CommandCounterIncrement(); + } +#endif + /* * Open the new relation and acquire exclusive lock on it. This isn't * really necessary for locking out other backends (since they can't see @@ -739,6 +756,16 @@ RemoveRelations(DropStmt *drop) add_exact_object_address(&obj, objects); + +#ifdef PGXC /* PGXC_COORD */ + /* PGXCTODO: allow the ability to rollback dropping sequences. */ + + /* Drop the sequence */ + if (IS_PGXC_COORDINATOR && classform->relkind == RELKIND_SEQUENCE) + { + DropSequenceGTM(rel->relname); + } +#endif ReleaseSysCache(tuple); } diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 732f6d09c3..aed98d98f8 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -10,6 +10,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * * IDENTIFICATION @@ -57,6 +58,9 @@ #include "utils/syscache.h" #include "utils/tqual.h" +#ifdef PGXC +#include "pgxc/pgxc.h" +#endif /* * GUC parameters @@ -899,6 +903,18 @@ vac_update_datfrozenxid(void) if (dirty) { database_file_update_needed(); + /* + * vac_truncate_clog needs a transaction id to detect wrap-arounds. 
For + * a autovacuum, this would require the data node to contact the GTM or + * the coordinator and acquire GXID for the vacuum operation. + * + * To avoid this complexity, we disable the CLOG truncation. This is + * perfectly fine for the prototype because we are not handling GXID + * wrap-around in the prototype anyways. In future, this should be + * fixed either by acquiring GXID for the vacuum operation or by + * modifying the wrap-around check logic such that it does not need a + * GXID + */ vac_truncate_clog(newFrozenXid); } } @@ -1026,7 +1042,8 @@ vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound, if (scanned_all) *scanned_all = false; - +#ifndef PGXC + /* In PG-XC, do these after setting vacuum flags */ /* Begin a transaction for vacuuming this relation */ StartTransactionCommand(); @@ -1035,6 +1052,7 @@ vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound, * ensures that RecentGlobalXmin is kept truly recent. */ PushActiveSnapshot(GetTransactionSnapshot()); +#endif if (!vacstmt->full) { @@ -1065,6 +1083,19 @@ vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound, LWLockRelease(ProcArrayLock); } +#ifdef PGXC + elog (DEBUG1, "Starting vacuum transaction"); + /* In PG-XC, do these after setting vacuum flags */ + /* Begin a transaction for vacuuming this relation */ + StartTransactionCommand(); + elog (DEBUG1, "Started vacuum transaction"); + + /* + * Functions in indexes may want a snapshot set. Also, setting + * a snapshot ensures that RecentGlobalXmin is kept truly recent. + */ + PushActiveSnapshot(GetTransactionSnapshot()); +#endif /* * Check for user-requested abort. Note we want this to be inside a * transaction, so xact.c doesn't issue useless WARNING. 
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 72c9877ffd..895e65e121 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -13,6 +13,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * IDENTIFICATION * $PostgreSQL: pgsql/src/backend/nodes/copyfuncs.c,v 1.432 2009/06/18 01:27:02 tgl Exp $ @@ -2402,6 +2403,19 @@ _copyCopyStmt(CopyStmt *from) return newnode; } +#ifdef PGXC +static DistributeBy * +_copyDistributeBy(DistributeBy *from) +{ + DistributeBy *newnode = makeNode(DistributeBy); + + COPY_SCALAR_FIELD(disttype); + COPY_STRING_FIELD(colname); + + return newnode; +} +#endif + static CreateStmt * _copyCreateStmt(CreateStmt *from) { @@ -2414,6 +2428,9 @@ _copyCreateStmt(CreateStmt *from) COPY_NODE_FIELD(options); COPY_SCALAR_FIELD(oncommit); COPY_STRING_FIELD(tablespacename); +#ifdef PGXC + COPY_NODE_FIELD(distributeby); +#endif return newnode; } @@ -4093,7 +4110,11 @@ copyObject(void *from) case T_XmlSerialize: retval = _copyXmlSerialize(from); break; - +#ifdef PGXC + case T_DistributeBy: + retval = _copyDistributeBy(from); + break; +#endif default: elog(ERROR, "unrecognized node type: %d", (int) nodeTag(from)); retval = from; /* keep compiler quiet */ diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 041b96971c..fedb5102bb 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -20,6 +20,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * IDENTIFICATION * $PostgreSQL: pgsql/src/backend/nodes/equalfuncs.c,v 1.355 2009/06/18 01:27:02 tgl Exp $ @@ -1078,6 +1079,9 @@ _equalCreateStmt(CreateStmt *a, 
CreateStmt *b) COMPARE_NODE_FIELD(options); COMPARE_SCALAR_FIELD(oncommit); COMPARE_STRING_FIELD(tablespacename); +#ifdef PGXC + COMPARE_NODE_FIELD(distributeby); +#endif return true; } diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index b27cd513a5..98d3c4c9ef 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -5,6 +5,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * * IDENTIFICATION @@ -1154,6 +1155,22 @@ _readRangeTblEntry(void) READ_DONE(); } +#ifdef PGXC +/* + * _readDistributeBy + */ +static DistributeBy * +_readDistributeBy(void) +{ + READ_LOCALS(DistributeBy); + + READ_ENUM_FIELD(disttype, DistributionType); + READ_STRING_FIELD(colname); + + READ_DONE(); +} +#endif + /* * parseNodeString diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 280443074f..9ffada513a 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -8,6 +8,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * * IDENTIFICATION @@ -58,6 +59,7 @@ #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" #include "parser/gramparse.h" +#include "pgxc/poolmgr.h" #include "storage/lmgr.h" #include "utils/date.h" #include "utils/datetime.h" @@ -179,6 +181,9 @@ static TypeName *TableFuncTypeName(List *columns); InsertStmt *istmt; VariableSetStmt *vsetstmt; +/* PGXC_BEGIN */ + DistributeBy *distby; +/* PGXC_END */ } %type <node> stmt schema_stmt @@ -197,7 +202,7 @@ static TypeName *TableFuncTypeName(List *columns); DropGroupStmt DropOpClassStmt DropOpFamilyStmt DropPLangStmt DropStmt DropAssertStmt DropTrigStmt DropRuleStmt DropCastStmt DropRoleStmt 
DropUserStmt DropdbStmt DropTableSpaceStmt DropFdwStmt - DropForeignServerStmt DropUserMappingStmt ExplainStmt FetchStmt + DropForeignServerStmt DropUserMappingStmt ExplainStmt ExecDirectStmt FetchStmt GrantStmt GrantRoleStmt IndexStmt InsertStmt ListenStmt LoadStmt LockStmt NotifyStmt ExplainableStmt PreparableStmt CreateFunctionStmt AlterFunctionStmt ReindexStmt RemoveAggrStmt @@ -250,7 +255,7 @@ static TypeName *TableFuncTypeName(List *columns); %type <str> relation_name copy_file_name database_name access_method_clause access_method attr_name - index_name name file_name cluster_index_specification + index_name name file_name cluster_index_specification %type <list> func_name handler_name qual_Op qual_all_Op subquery_Op opt_class opt_validator validator_clause @@ -323,6 +328,9 @@ static TypeName *TableFuncTypeName(List *columns); %type <boolean> opt_freeze opt_default opt_recheck %type <defelt> opt_binary opt_oids copy_delimiter +%type <list> node_list +%type <str> DirectStmt + %type <boolean> copy_from %type <ival> opt_column event cursor_options opt_hold opt_set_data @@ -415,6 +423,9 @@ static TypeName *TableFuncTypeName(List *columns); %type <windef> window_definition over_clause window_specification %type <str> opt_existing_window_name %type <ival> opt_frame_clause frame_extent frame_bound +/* PGXC_BEGIN */ +%type <distby> OptDistributeBy +/* PGXC_END */ /* @@ -425,6 +436,7 @@ static TypeName *TableFuncTypeName(List *columns); */ /* ordinary key words in alphabetical order */ +/* PGXC - added REPLICATION, DISTRIBUTE, and HASH */ %token <keyword> ABORT_P ABSOLUTE_P ACCESS ACTION ADD_P ADMIN AFTER AGGREGATE ALL ALSO ALTER ALWAYS ANALYSE ANALYZE AND ANY ARRAY AS ASC ASSERTION ASSIGNMENT ASYMMETRIC AT AUTHORIZATION @@ -436,14 +448,17 @@ static TypeName *TableFuncTypeName(List *columns); CHARACTER CHARACTERISTICS CHECK CHECKPOINT CLASS CLOSE CLUSTER COALESCE COLLATE COLUMN COMMENT COMMIT COMMITTED CONCURRENTLY CONFIGURATION CONNECTION CONSTRAINT CONSTRAINTS - 
CONTENT_P CONTINUE_P CONVERSION_P COPY COST CREATE CREATEDB + CONTENT_P CONTINUE_P CONVERSION_P COORDINATOR COPY COST CREATE CREATEDB CREATEROLE CREATEUSER CROSS CSV CURRENT_P CURRENT_CATALOG CURRENT_DATE CURRENT_ROLE CURRENT_SCHEMA CURRENT_TIME CURRENT_TIMESTAMP CURRENT_USER CURSOR CYCLE DATA_P DATABASE DAY_P DEALLOCATE DEC DECIMAL_P DECLARE DEFAULT DEFAULTS DEFERRABLE DEFERRED DEFINER DELETE_P DELIMITER DELIMITERS DESC - DICTIONARY DISABLE_P DISCARD DISTINCT DO DOCUMENT_P DOMAIN_P DOUBLE_P DROP +/* PGXC_BEGIN */ + DICTIONARY DIRECT DISABLE_P DISCARD DISTINCT DISTRIBUTE DO DOCUMENT_P DOMAIN_P DOUBLE_P +/* PGXC_END */ + DROP EACH ELSE ENABLE_P ENCODING ENCRYPTED END_P ENUM_P ESCAPE EXCEPT EXCLUDING EXCLUSIVE EXECUTE EXISTS EXPLAIN EXTERNAL EXTRACT @@ -453,7 +468,9 @@ static TypeName *TableFuncTypeName(List *columns); GLOBAL GRANT GRANTED GREATEST GROUP_P - HANDLER HAVING HEADER_P HOLD HOUR_P +/* PGXC_BEGIN */ + HANDLER HASH HAVING HEADER_P HOLD HOUR_P +/* PGXC_END */ IDENTITY_P IF_P ILIKE IMMEDIATE IMMUTABLE IMPLICIT_P IN_P INCLUDING INCREMENT INDEX INDEXES INHERIT INHERITS INITIALLY @@ -471,7 +488,7 @@ static TypeName *TableFuncTypeName(List *columns); MAPPING MATCH MAXVALUE MINUTE_P MINVALUE MODE MONTH_P MOVE NAME_P NAMES NATIONAL NATURAL NCHAR NEW NEXT NO NOCREATEDB - NOCREATEROLE NOCREATEUSER NOINHERIT NOLOGIN_P NONE NOSUPERUSER + NOCREATEROLE NOCREATEUSER NODE NOINHERIT NOLOGIN_P NONE NOSUPERUSER NOT NOTHING NOTIFY NOTNULL NOWAIT NULL_P NULLIF NULLS_P NUMERIC OBJECT_P OF OFF OFFSET OIDS OLD ON ONLY OPERATOR OPTION OPTIONS OR @@ -484,8 +501,10 @@ static TypeName *TableFuncTypeName(List *columns); QUOTE RANGE READ REAL REASSIGN RECHECK RECURSIVE REFERENCES REINDEX - RELATIVE_P RELEASE RENAME REPEATABLE REPLACE REPLICA RESET RESTART - RESTRICT RETURNING RETURNS REVOKE RIGHT ROLE ROLLBACK ROW ROWS RULE +/* PGXC_BEGIN */ + RELATIVE_P RELEASE RENAME REPEATABLE REPLACE REPLICA REPLICATION RESET RESTART + RESTRICT RETURNING RETURNS REVOKE RIGHT ROBIN ROLE ROLLBACK 
ROUND ROW ROWS RULE +/* PGXC_END */ SAVEPOINT SCHEMA SCROLL SEARCH SECOND_P SECURITY SELECT SEQUENCE SERIALIZABLE SERVER SESSION SESSION_USER SET SETOF SHARE @@ -668,6 +687,7 @@ stmt : | DropUserMappingStmt | DropdbStmt | ExecuteStmt + | ExecDirectStmt | ExplainStmt | FetchStmt | GrantStmt @@ -2036,7 +2056,10 @@ opt_using: *****************************************************************************/ CreateStmt: CREATE OptTemp TABLE qualified_name '(' OptTableElementList ')' - OptInherit OptWith OnCommitOption OptTableSpace + OptInherit OptWith OnCommitOption OptTableSpace +/* PGXC_BEGIN */ + OptDistributeBy +/* PGXC_END */ { CreateStmt *n = makeNode(CreateStmt); $4->istemp = $2; @@ -2047,10 +2070,21 @@ CreateStmt: CREATE OptTemp TABLE qualified_name '(' OptTableElementList ')' n->options = $9; n->oncommit = $10; n->tablespacename = $11; + n->distributeby = $12; +/* PGXC_BEGIN */ + if (n->inhRelations != NULL && n->distributeby != NULL) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("CREATE TABLE cannot contains both an INHERITS and a DISTRIBUTE BY clause"), + scanner_errposition(exprLocation((Node *) n->distributeby)))); +/* PGXC_END */ $$ = (Node *)n; } | CREATE OptTemp TABLE qualified_name OF qualified_name - '(' OptTableElementList ')' OptWith OnCommitOption OptTableSpace + '(' OptTableElementList ')' OptWith OnCommitOption OptTableSpace +/* PGXC_BEGIN */ + OptDistributeBy +/* PGXC_END */ { /* SQL99 CREATE TABLE OF <UDT> (cols) seems to be satisfied * by our inheritance capabilities. Let's try it... 
@@ -2064,6 +2098,14 @@ CreateStmt: CREATE OptTemp TABLE qualified_name '(' OptTableElementList ')' n->options = $10; n->oncommit = $11; n->tablespacename = $12; + n->distributeby = $13; +/* PGXC_BEGIN */ + if (n->inhRelations != NULL && n->distributeby != NULL) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("CREATE TABLE cannot contains both an INHERITS and a DISTRIBUTE BY clause"), + scanner_errposition(exprLocation((Node *) n->distributeby)))); +/* PGXC_END */ $$ = (Node *)n; } ; @@ -2495,6 +2537,36 @@ OptTableSpace: TABLESPACE name { $$ = $2; } | /*EMPTY*/ { $$ = NULL; } ; +/* PGXC_BEGIN */ +DistributeByHash: DISTRIBUTE BY + | DISTRIBUTE BY HASH + ; + +OptDistributeBy: DistributeByHash '(' name ')' + { + DistributeBy *n = makeNode(DistributeBy); + n->disttype = DISTTYPE_HASH; + n->colname = $3; + $$ = n; + } + | DISTRIBUTE BY REPLICATION + { + DistributeBy *n = makeNode(DistributeBy); + n->disttype = DISTTYPE_REPLICATION; + n->colname = NULL; + $$ = n; + } + | DISTRIBUTE BY ROUND ROBIN + { + DistributeBy *n = makeNode(DistributeBy); + n->disttype = DISTTYPE_ROUNDROBIN; + n->colname = NULL; + $$ = n; + } + | /*EMPTY*/ { $$ = NULL; } + ; +/* PGXC_END */ + OptConsTableSpace: USING INDEX TABLESPACE name { $$ = $4; } | /*EMPTY*/ { $$ = NULL; } ; @@ -6461,6 +6533,47 @@ opt_analyze: /***************************************************************************** * * QUERY: + * EXECUTE DIRECT ON (COORDINATOR | NODE num, ...) 
query + * + *****************************************************************************/ + +ExecDirectStmt: EXECUTE DIRECT ON COORDINATOR DirectStmt + { + ExecDirectStmt *n = makeNode(ExecDirectStmt); + n->coordinator = TRUE; + n->nodes = NIL; + n->query = $5; + $$ = (Node *)n; + } + | EXECUTE DIRECT ON NODE node_list DirectStmt + { + ExecDirectStmt *n = makeNode(ExecDirectStmt); + n->coordinator = FALSE; + n->nodes = $5; + n->query = $6; + $$ = (Node *)n; + } + ; + +DirectStmt: + Sconst /* by default all are $$=$1 */ + ; + +node_list: + Iconst { $$ = list_make1(makeInteger($1)); } + | node_list ',' Iconst { $$ = lappend($1, makeInteger($3)); } + | '*' + { + int i; + $$ = NIL; + for (i=1; i<=NumDataNodes; i++) + $$ = lappend($$, makeInteger(i)); + } + ; + +/***************************************************************************** + * + * QUERY: * PREPARE <plan_name> [(args, ...)] AS <query> * *****************************************************************************/ @@ -10117,6 +10230,7 @@ ColLabel: IDENT { $$ = $1; } /* "Unreserved" keywords --- available for use as any kind of name. 
*/ +/* PGXC - added DISTRIBUTE, HASH, REPLICATION */ unreserved_keyword: ABORT_P | ABSOLUTE_P @@ -10157,6 +10271,7 @@ unreserved_keyword: | CONTENT_P | CONTINUE_P | CONVERSION_P + | COORDINATOR | COPY | COST | CREATEDB @@ -10178,8 +10293,12 @@ unreserved_keyword: | DELIMITER | DELIMITERS | DICTIONARY + | DIRECT | DISABLE_P | DISCARD +/* PGXC_BEGIN */ + | DISTRIBUTE +/* PGXC_END */ | DOCUMENT_P | DOMAIN_P | DOUBLE_P @@ -10204,6 +10323,9 @@ unreserved_keyword: | GLOBAL | GRANTED | HANDLER +/* PGXC_BEGIN */ + | HASH +/* PGXC_END */ | HEADER_P | HOLD | HOUR_P @@ -10253,6 +10375,7 @@ unreserved_keyword: | NOCREATEDB | NOCREATEROLE | NOCREATEUSER + | NODE | NOINHERIT | NOLOGIN_P | NOSUPERUSER @@ -10294,13 +10417,22 @@ unreserved_keyword: | REPEATABLE | REPLACE | REPLICA +/* PGXC_BEGIN */ + | REPLICATION +/* PGXC_END */ | RESET | RESTART | RESTRICT | RETURNS | REVOKE +/* PGXC_BEGIN */ + | ROBIN +/* PGXC_END */ | ROLE | ROLLBACK +/* PGXC_BEGIN */ + | ROUND +/* PGXC_END */ | ROWS | RULE | SAVEPOINT diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index e5a3621cce..1336e00a45 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -18,6 +18,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * $PostgreSQL: pgsql/src/backend/parser/parse_utilcmd.c,v 2.21 2009/06/11 14:49:00 momjian Exp $ * @@ -48,6 +49,11 @@ #include "parser/parse_relation.h" #include "parser/parse_type.h" #include "parser/parse_utilcmd.h" +#ifdef PGXC +#include "pgxc/locator.h" +#include "pgxc/pgxc.h" +#endif + #include "rewrite/rewriteManip.h" #include "utils/acl.h" #include "utils/builtins.h" @@ -75,6 +81,10 @@ typedef struct List *alist; /* "after list" of things to do after creating * the table */ IndexStmt *pkey; /* PRIMARY KEY index, if any */ +#ifdef PGXC + 
char *fallback_dist_col; /* suggested column to distribute on */ + DistributeBy *distributeby; /* original distribute by column in create table */ +#endif } CreateStmtContext; /* State shared by transformCreateSchemaStmt and its subroutines */ @@ -114,7 +124,9 @@ static void transformFKConstraints(ParseState *pstate, static void transformConstraintAttrs(List *constraintList); static void transformColumnType(ParseState *pstate, ColumnDef *column); static void setSchemaName(char *context_schema, char **stmt_schema_name); - +#ifdef PGXC +static void checkLocalFKConstraints(CreateStmtContext *cxt); +#endif /* * transformCreateStmt - @@ -177,6 +189,10 @@ transformCreateStmt(CreateStmt *stmt, const char *queryString) cxt.alist = NIL; cxt.pkey = NULL; cxt.hasoids = interpretOidsOption(stmt->options); +#ifdef PGXC + cxt.fallback_dist_col = NULL; + cxt.distributeby = stmt->distributeby; +#endif /* * Run through each primary element in the table creation clause. Separate @@ -244,6 +260,18 @@ transformCreateStmt(CreateStmt *stmt, const char *queryString) result = list_concat(result, cxt.alist); result = list_concat(result, save_alist); +#ifdef PGXC + /* + * If the user did not specify any distribution clause and there is no + * inherits clause, try and use PK or unique index + */ + if (!stmt->distributeby && !stmt->inhRelations && cxt.fallback_dist_col) + { + stmt->distributeby = (DistributeBy *) palloc0(sizeof(DistributeBy)); + stmt->distributeby->disttype = DISTTYPE_HASH; + stmt->distributeby->colname = cxt.fallback_dist_col; + } +#endif return result; } @@ -307,7 +335,7 @@ transformColumnDefinition(ParseState *pstate, CreateStmtContext *cxt, char *snamespace; char *sname; char *qstring; - A_Const *snamenode; + A_Const *snamenode; TypeCast *castnode; FuncCall *funccallnode; CreateSeqStmt *seqstmt; @@ -1061,6 +1089,7 @@ transformIndexConstraints(ParseState *pstate, CreateStmtContext *cxt) } } + /* * transformIndexConstraint * Transform one UNIQUE or PRIMARY KEY constraint 
for @@ -1072,6 +1101,10 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt) IndexStmt *index; ListCell *keys; IndexElem *iparam; +#ifdef PGXC + bool isLocalSafe = false; +#endif + index = makeNode(IndexStmt); @@ -1126,6 +1159,22 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt) if (strcmp(column->colname, key) == 0) { found = true; + +#ifdef PGXC + /* + * Only allow locally enforceable constraints. + * See if it is a distribution column + * If not set, set it to first column in index. + * If primary key, we prefer that over a unique constraint. + */ + if (IS_PGXC_COORDINATOR && !isLocalSafe) + { + if (cxt->distributeby) + isLocalSafe = CheckLocalIndexColumn ( + ConvertToLocatorType(cxt->distributeby->disttype), + cxt->distributeby->colname, key); + } +#endif break; } } @@ -1219,6 +1268,27 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt) } } +#ifdef PGXC + if (IS_PGXC_COORDINATOR) + { + /* + * Set fallback distribution column. + * If not set, set it to first column in index. + * If primary key, we prefer that over a unique constraint. 
+ */ + if (index->indexParams == NIL + && (index->primary || !cxt->fallback_dist_col)) + { + cxt->fallback_dist_col = pstrdup(key); + } + + /* Existing table, check if it is safe */ + if (!cxt->distributeby && !isLocalSafe) + isLocalSafe = CheckLocalIndexColumn ( + cxt->rel->rd_locator_info->locatorType, cxt->rel->rd_locator_info->partAttrName, key); + } +#endif + /* OK, add it to the index definition */ iparam = makeNode(IndexElem); iparam->name = pstrdup(key); @@ -1228,6 +1298,13 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt) iparam->nulls_ordering = SORTBY_NULLS_DEFAULT; index->indexParams = lappend(index->indexParams, iparam); } +#ifdef PGXC + if (IS_PGXC_COORDINATOR && cxt->distributeby + && cxt->distributeby->disttype == DISTTYPE_HASH && !isLocalSafe) + ereport(ERROR, + (errcode(ERRCODE_INVALID_COLUMN_REFERENCE), + errmsg("Unique index of partitioned table must contain the hash distribution column."))); +#endif return index; } @@ -1256,9 +1333,34 @@ transformFKConstraints(ParseState *pstate, CreateStmtContext *cxt, FkConstraint *fkconstraint = (FkConstraint *) lfirst(fkclist); fkconstraint->skip_validation = true; +#ifdef PGXC + /* + * Set fallback distribution column. 
+ * If not yet set, set it to first column in FK constraint + * if it references a partitioned table + */ + if (IS_PGXC_COORDINATOR && !cxt->fallback_dist_col) + { + Oid pk_rel_id = RangeVarGetRelid(fkconstraint->pktable, false); + + /* make sure it is a partitioned column */ + if (IsHashColumnForRelId(pk_rel_id, strVal(list_nth(fkconstraint->pk_attrs,0)))) + { + /* take first column */ + char *colstr = strdup(strVal(list_nth(fkconstraint->fk_attrs,0))); + cxt->fallback_dist_col = pstrdup(colstr); + } + } +#endif } } +#ifdef PGXC + /* Only allow constraints that are locally enforceable - no distributed ones */ + if (IS_PGXC_COORDINATOR) + checkLocalFKConstraints(cxt); +#endif + /* * For CREATE TABLE or ALTER TABLE ADD COLUMN, gin up an ALTER TABLE ADD * CONSTRAINT command to execute after the basic command is complete. (If @@ -1714,6 +1816,10 @@ transformAlterTableStmt(AlterTableStmt *stmt, const char *queryString) cxt.blist = NIL; cxt.alist = NIL; cxt.pkey = NULL; +#ifdef PGXC + cxt.fallback_dist_col = NULL; + cxt.distributeby = NULL; +#endif /* * The only subtypes that currently require parse transformation handling @@ -2115,3 +2221,118 @@ setSchemaName(char *context_schema, char **stmt_schema_name) "different from the one being created (%s)", *stmt_schema_name, context_schema))); } + +#ifdef PGXC +/* + * CheckLocalIndexColumn + * + * Checks whether or not the index can be safely enforced locally + */ +bool +CheckLocalIndexColumn (char loctype, char *partcolname, char *indexcolname) +{ + + if (loctype == LOCATOR_TYPE_REPLICATED) + /* always safe */ + return true; + if (loctype == LOCATOR_TYPE_RROBIN) + ereport(ERROR, + (errcode(ERRCODE_INVALID_COLUMN_REFERENCE), + errmsg("Cannot locally enforce a unique index on round robin distributed table."))); + else if (loctype == LOCATOR_TYPE_HASH) + { + if (partcolname && indexcolname && strcmp(partcolname, indexcolname) == 0) + return true; + } + return false; +} + + +/* + * check to see if the constraint can be enforced 
locally + * if not, an error will be thrown + */ +void +static checkLocalFKConstraints(CreateStmtContext *cxt) +{ + ListCell *fkclist; + + foreach(fkclist, cxt->fkconstraints) + { + FkConstraint *fkconstraint; + Oid pk_rel_id; + char refloctype; + char *checkcolname = NULL; + + fkconstraint = (FkConstraint *) lfirst(fkclist); + pk_rel_id = RangeVarGetRelid(fkconstraint->pktable, false); + + refloctype = GetLocatorType(pk_rel_id); + + /* If referenced table is replicated, the constraint is safe */ + if (refloctype == LOCATOR_TYPE_REPLICATED) + continue; + else if (refloctype == LOCATOR_TYPE_RROBIN) + { + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Cannot reference a round robin table in a foreign key constraint"))); + } + + /* + * See if we are hash partitioned and the column appears in the + * constraint, and it corresponds to the position in the referenced table. + */ + if (cxt->isalter) + { + if (cxt->rel->rd_locator_info->locatorType == LOCATOR_TYPE_HASH) + { + checkcolname = cxt->rel->rd_locator_info->partAttrName; + } + } + else + { + if (cxt->distributeby) + { + if (cxt->distributeby->disttype == DISTTYPE_HASH) + checkcolname = cxt->distributeby->colname; + } + else + { + if (cxt->fallback_dist_col) + checkcolname = cxt->fallback_dist_col; + } + } + + if (checkcolname) + { + int pos = 0; + + ListCell *attritem; + + foreach(attritem, fkconstraint->fk_attrs) + { + char *attrname = (char *) strVal(lfirst(attritem)); + + if (strcmp(cxt->rel->rd_locator_info->partAttrName, attrname) == 0) + { + /* Found the ordinal position in constraint */ + break; + } + pos++; + } + + if (pos >= list_length(fkconstraint->fk_attrs)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Hash distributed table must include distribution column in index"))); + + /* Verify that the referenced table is partitioned at the same position in the index */ + if (!IsHashColumnForRelId(pk_rel_id, strVal(list_nth(fkconstraint->pk_attrs,pos)))) + ereport(ERROR, + 
(errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Hash distribution column does not refer to hash distribution column in referenced table."))); + } + } +} +#endif diff --git a/src/backend/pgxc/Makefile b/src/backend/pgxc/Makefile new file mode 100644 index 0000000000..d978720b1c --- /dev/null +++ b/src/backend/pgxc/Makefile @@ -0,0 +1,16 @@ +# +# Makefile for the access methods module +# +# +# Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation +# +# $PostgreSQL$ +# + +subdir = src/backend/pgxc +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +SUBDIRS = locator plan pool + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/pgxc/locator/Makefile b/src/backend/pgxc/locator/Makefile new file mode 100644 index 0000000000..026a247940 --- /dev/null +++ b/src/backend/pgxc/locator/Makefile @@ -0,0 +1,20 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for locator +# +# +# Copyright(C) 2010 Nippon Telegraph and Telephone Corporation +# +# IDENTIFICATION +# $PostgreSQL$ +# +#------------------------------------------------------------------------- + +subdir = src/backend/pgxc/locator +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = locator.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c new file mode 100644 index 0000000000..995a64cb4a --- /dev/null +++ b/src/backend/pgxc/locator/locator.c @@ -0,0 +1,607 @@ +/*------------------------------------------------------------------------- + * + * locator.c + * Functions that help manage table location information such as + * partitioning and replication information. 
+ * + * + * PGXCTODO - do not use a single mappingTable for all + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * + * IDENTIFICATION + * $$ + * + *------------------------------------------------------------------------- + */ + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <time.h> + +#include "postgres.h" +#include "access/skey.h" +#include "access/relscan.h" +#include "catalog/indexing.h" +#include "catalog/pg_type.h" +#include "nodes/pg_list.h" +#include "utils/builtins.h" +#include "utils/catcache.h" +#include "utils/fmgroids.h" +#include "utils/lsyscache.h" +#include "utils/rel.h" +#include "utils/relcache.h" +#include "utils/tqual.h" +#include "pgxc/poolmgr.h" +#include "pgxc/locator.h" + +#include "catalog/pgxc_class.h" +#include "catalog/namespace.h" + + +/* PGXCTODO For prototype, relations use the same hash mapping table. + * Long term, make it a pointer in RelationLocInfo, and have + * similarly handled tables point to the same mapping table, + * to check faster for equivalency + */ +int mappingTable[HASH_SIZE]; + +bool locatorInited = false; + + +/* GUC parameter */ +char *PreferredDataNodes = NULL; + +/* Preferred to use when reading from replicated tables */ +static List *globalPreferredNodes = NIL; + +/* + * init_mapping_table - initializes a mapping table + * + * PGXCTODO + * For the prototype, all partitioned tables will use the same partition map. 
+ * We cannot assume this long term + */ +static void +init_mapping_table(int nodeCount, int mapTable[]) +{ + int i; + + for (i = 0; i < HASH_SIZE; i++) + { + mapTable[i] = (i % nodeCount) + 1; + } +} + + +/* + * Pick any data node, but try a preferred node + * + */ +int +GetAnyDataNode(void) +{ + /* try and pick from the preferred list */ + if (globalPreferredNodes != NULL) + return linitial_int(globalPreferredNodes); + + return 1; +} + + +/* + * hash_range - hash the key to a value between 0 and HASH_SIZE + * + * Note, this function corresponds to GridSQL hashing + * and is used here to allow us the wire up GridSQL + * to the same underlying nodes + */ +static int +hash_range(char *key) +{ + int i; + int length; + int value; + + if (key == NULL || key == '\0') + { + return 0; + } + + length = strlen(key); + + value = 0x238F13AF * length; + + for (i = 0; i < length; i++) + { + value = value + ((key[i] << i * 5 % 24) & 0x7fffffff); + } + + return (1103515243 * value + 12345) % 65537 & HASH_MASK; +} + +/* + * hash_range_int - hashes the integer key to a value between 0 and HASH_SIZE + * + * See hash_range + */ +static int +hash_range_int(int intkey) +{ + char int_str[13]; /* plenty for 32 bit int */ + + int_str[12] = '\0'; + snprintf(int_str, 12, "%d", intkey); + + return hash_range(int_str); +} + + +/* + * get_node_from_hash - determine node based on hash bucket + * + */ +static int +get_node_from_hash(int hash) +{ + if (hash > HASH_SIZE || hash < 0) + { + ereport(ERROR, (errmsg("Hash value out of range\n"))); + } + + return mappingTable[hash]; +} + + +/* + * Returns whether or not the data type is hash distributable with PG-XC + * PGXCTODO - expand support for other data types! + */ +bool +IsHashDistributable(Oid col_type) +{ + if (col_type == INT4OID || col_type == INT2OID) + return true; + + return false; +} + + +/* + * get_hash_column - return hash column for relation. + * + * Returns NULL if the relation is not hash partitioned. 
+ */ +char * +GetRelationHashColumn(RelationLocInfo * rel_loc_info) +{ + char *column_str = NULL; + + if (rel_loc_info == NULL) + column_str = NULL; + else if (rel_loc_info->locatorType != LOCATOR_TYPE_HASH) + column_str = NULL; + else + { + int len = strlen(rel_loc_info->partAttrName); + + column_str = (char *) palloc(len + 1); + strncpy(column_str, rel_loc_info->partAttrName, len + 1); + } + + return column_str; +} + +/* + * IsHashColumn - return whether or not column for relation is hashed. + * + */ +bool +IsHashColumn(RelationLocInfo * rel_loc_info, char *part_col_name) +{ + bool ret_value = false; + + if (!rel_loc_info || !part_col_name) + ret_value = false; + else if (rel_loc_info->locatorType != LOCATOR_TYPE_HASH) + ret_value = false; + else + ret_value = !strcmp(part_col_name, rel_loc_info->partAttrName); + + return ret_value; +} + + +/* + * IsHashColumnForRelId - return whether or not column for relation is hashed. + * + */ +bool +IsHashColumnForRelId(Oid relid, char *part_col_name) +{ + RelationLocInfo *rel_loc_info = GetRelationLocInfo(relid); + + return IsHashColumn(rel_loc_info, part_col_name); +} + + +/** + * Update the round robin node for the relation + * + * PGXCTODO - may not want to bother with locking here, we could track + * these in the session memory context instead... 
+ */ +int +GetRoundRobinNode(Oid relid) +{ + int ret_node; + + Relation rel = relation_open(relid, AccessShareLock); + + Assert (rel->rd_locator_info->locatorType == LOCATOR_TYPE_REPLICATED || + rel->rd_locator_info->locatorType == LOCATOR_TYPE_RROBIN); + + ret_node = lfirst_int(rel->rd_locator_info->roundRobinNode); + + /* Move round robin indicator to next node */ + if (rel->rd_locator_info->roundRobinNode->next != NULL) + rel->rd_locator_info->roundRobinNode = rel->rd_locator_info->roundRobinNode->next; + else + /* reset to first one */ + rel->rd_locator_info->roundRobinNode = rel->rd_locator_info->nodeList->head; + + relation_close(rel, AccessShareLock); + + return ret_node; +} + + +/* + * GetRelationNodes + * + * Get list of relation nodes for read operation. + * If the table is replicated and we are reading, we can just pick one. + * If the table is partitioned, we apply partitioning column value, if possible. + * + * If the relation is partitioned, partValue will be applied if present + * (indicating a value appears for partitioning column), otherwise it + * is ignored. + * + * preferredNodes is only used when for replicated tables. If set, it will + * use one of the nodes specified if the table is replicated on it. + * This helps optimize for avoiding introducing additional nodes into the + * transaction. + * + * The returned List is a copy, so it should be freed when finished. 
+ */ +List * +GetRelationNodes(RelationLocInfo * rel_loc_info, long *partValue, int isRead) +{ + ListCell *prefItem; + ListCell *stepItem; + List *destList = NULL; + + + if (rel_loc_info == NULL) + return NULL; + + switch (rel_loc_info->locatorType) + { + case LOCATOR_TYPE_REPLICATED: + + if (!isRead) + /* we need to write to all synchronously */ + destList = list_copy(rel_loc_info->nodeList); + else + { + destList = NULL; + + if (globalPreferredNodes != NULL) + { + /* try and pick from the preferred list */ + foreach(prefItem, globalPreferredNodes) + /* make sure it is valid for this relation */ + foreach(stepItem, rel_loc_info->nodeList) + if (lfirst_int(stepItem) == lfirst_int(prefItem)) + { + destList = lappend_int(NULL, lfirst_int(prefItem)); + break; + } + } + } + + if (destList == NULL) + { + /* + * read from just one of them + * use round robin mechanism + */ + destList = lappend_int(NULL, GetRoundRobinNode(rel_loc_info->relid)); + } + break; + + case LOCATOR_TYPE_HASH: + + if (partValue != NULL) + { + /* in prototype, all partitioned tables use same map */ + destList = lappend_int(NULL, get_node_from_hash(hash_range_int(*partValue))); + } + else + { + /* + * No partitioning value passed in + * (no where qualification on part column - use all) + */ + destList = list_copy(rel_loc_info->nodeList); + } + break; + + case LOCATOR_TYPE_SINGLE: + + /* just return first (there should only be one) */ + destList = list_copy(rel_loc_info->nodeList); + break; + + case LOCATOR_TYPE_RROBIN: + + /* round robin, get next one */ + if (isRead) + { + /* we need to read from all */ + destList = list_copy(rel_loc_info->nodeList); + } + else + { + /* write to just one of them */ + destList = lappend_int(NULL, GetRoundRobinNode(rel_loc_info->relid)); + } + + break; + + /* PGXCTODO case LOCATOR_TYPE_RANGE: */ + /* PGXCTODO case LOCATOR_TYPE_CUSTOM: */ + default: + ereport(ERROR, (errmsg("Error: no such supported locator type: %c\n", + rel_loc_info->locatorType))); + break; + } + + 
return destList; +} + + +/* + * ConvertToLocatorType + * get locator distribution type + * We really should just have pgxc_class use disttype instead... + */ +char +ConvertToLocatorType(int disttype) +{ + char loctype; + + switch (disttype) + { + case DISTTYPE_HASH: + loctype = LOCATOR_TYPE_HASH; + break; + case DISTTYPE_ROUNDROBIN: + loctype = LOCATOR_TYPE_RROBIN; + break; + case DISTTYPE_REPLICATION: + loctype = LOCATOR_TYPE_REPLICATED; + break; + default: + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("Invalid distribution type"))); + break; + } + + return loctype; +} + + +/* + * GetLocatorType - Returns the locator type of the table + * + */ +char +GetLocatorType(Oid relid) +{ + char ret = '\0'; + + RelationLocInfo *ret_loc_info = GetRelationLocInfo(relid); + + if (ret_loc_info != NULL) + ret = ret_loc_info->locatorType; + + return ret; +} + + +/* + * Return a list of all nodes. + * We assume all tables use all nodes in the prototype, so just return a list + * from first one. + */ +List * +GetAllNodes(void) +{ + int i; + + /* + * PGXCTODO - add support for having nodes on a subset of nodes + * For now, assume on all nodes + */ + List *nodeList = NIL; + + for (i = 1; i < NumDataNodes + 1; i++) + { + nodeList = lappend_int(nodeList, i); + } + + return nodeList; +} + + +/** + * Build locator information associated with the specified relation. + * + */ +void +RelationBuildLocator(Relation rel) +{ + Relation pcrel; + ScanKeyData skey; + SysScanDesc pcscan; + HeapTuple htup; + MemoryContext oldContext; + RelationLocInfo *relationLocInfo; + int i; + int offset; + Form_pgxc_class pgxc_class; + + + /** PGXCTODO temporarily use the same mapping table for all + * Use all nodes. 
+ */ + if (!locatorInited) + { + init_mapping_table(NumDataNodes, mappingTable); + locatorInited = true; + } + + ScanKeyInit(&skey, + Anum_pgxc_class_pcrelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(RelationGetRelid(rel))); + + pcrel = heap_open(PgxcClassRelationId, AccessShareLock); + pcscan = systable_beginscan(pcrel, PgxcClassPgxcRelIdIndexId, true, + SnapshotNow, 1, &skey); + htup = systable_getnext(pcscan); + + if (!HeapTupleIsValid(htup)) + { + /* Assume local relation only */ + rel->rd_locator_info = NULL; + systable_endscan(pcscan); + heap_close(pcrel, AccessShareLock); + return; + } + + pgxc_class = (Form_pgxc_class) GETSTRUCT(htup); + + oldContext = MemoryContextSwitchTo(CacheMemoryContext); + + relationLocInfo = (RelationLocInfo *) palloc(sizeof(RelationLocInfo)); + rel->rd_locator_info = relationLocInfo; + + relationLocInfo->relid = RelationGetRelid(rel); + relationLocInfo->locatorType = pgxc_class->pclocatortype; + + relationLocInfo->partAttrNum = pgxc_class->pcattnum; + + relationLocInfo->partAttrName = get_attname(relationLocInfo->relid, + pgxc_class->pcattnum); + + /** PGXCTODO - add support for having nodes on a subset of nodes + * For now, assume on all nodes + */ + relationLocInfo->nodeList = GetAllNodes(); + relationLocInfo->nodeCount = relationLocInfo->nodeList->length; + + /* + * If the locator type is round robin, we set a node to + * use next time. In addition, if it is replicated, + * we choose a node to use for balancing reads. 
+ */ + if (relationLocInfo->locatorType == LOCATOR_TYPE_RROBIN + || relationLocInfo->locatorType == LOCATOR_TYPE_REPLICATED) + { + /* + * pick a random one to start with, + * since each process will do this independently + */ + srand(time(NULL)); + offset = rand() % relationLocInfo->nodeCount + 1; + relationLocInfo->roundRobinNode = relationLocInfo->nodeList->head; /* initialize */ + + for (i = 0; i < offset && relationLocInfo->roundRobinNode->next != NULL; i++) + { + relationLocInfo->roundRobinNode = relationLocInfo->roundRobinNode->next; + } + } + + systable_endscan(pcscan); + heap_close(pcrel, AccessShareLock); + + MemoryContextSwitchTo(oldContext); +} + +/* + * GetLocatorRelationInfo - Returns the locator information for relation, + * in a copy of the RelationLocatorInfo struct in relcache + * + */ +RelationLocInfo * +GetRelationLocInfo(Oid relid) +{ + RelationLocInfo *ret_loc_info = NULL; + + Relation rel = relation_open(relid, AccessShareLock); + + if (rel && rel->rd_locator_info) + ret_loc_info = CopyRelationLocInfo(rel->rd_locator_info); + + relation_close(rel, AccessShareLock); + + return ret_loc_info; +} + +/** + * Copy the RelationLocInfo struct + */ +RelationLocInfo * +CopyRelationLocInfo(RelationLocInfo * src_info) +{ + RelationLocInfo *dest_info; + + + Assert(src_info); + + dest_info = (RelationLocInfo *) palloc0(sizeof(RelationLocInfo)); + + dest_info->relid = src_info->relid; + dest_info->locatorType = src_info->locatorType; + dest_info->partAttrNum = src_info->partAttrNum; + if (src_info->partAttrName) + dest_info->partAttrName = pstrdup(src_info->partAttrName); + dest_info->nodeCount = src_info->nodeCount; + if (src_info->nodeList) + dest_info->nodeList = list_copy(src_info->nodeList); + + /* Note, for round robin, we use the relcache entry */ + + return dest_info; +} + + +/** + * Free RelationLocInfo struct + */ +void +FreeRelationLocInfo(RelationLocInfo *relationLocInfo) +{ + if (relationLocInfo) + { + if (relationLocInfo->partAttrName) + 
pfree(relationLocInfo->partAttrName); + pfree(relationLocInfo); + } +} diff --git a/src/backend/pgxc/plan/Makefile b/src/backend/pgxc/plan/Makefile new file mode 100644 index 0000000000..c0e65741f1 --- /dev/null +++ b/src/backend/pgxc/plan/Makefile @@ -0,0 +1,20 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for rewrite +# +# +# Portions Copyright(C) 2010 Nippon Telegraph and Telephone Corporation +# +# IDENTIFICATION +# $PostgreSQL$ +# +#------------------------------------------------------------------------- + +subdir = src/backend/pgxc/plan +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = planner.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c new file mode 100644 index 0000000000..90703c4896 --- /dev/null +++ b/src/backend/pgxc/plan/planner.c @@ -0,0 +1,1290 @@ +/*------------------------------------------------------------------------- + * + * planner.c + * + * Functions for generating a PGXC style plan. 
+ * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * + * IDENTIFICATION + * $$ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "access/transam.h" +#include "catalog/pg_type.h" +#include "nodes/parsenodes.h" +#include "pgxc/locator.h" +#include "pgxc/planner.h" +#include "utils/fmgroids.h" +#include "utils/lsyscache.h" + + +/* + * Convenient format for literal comparisons + * + * PGXCTODO - make constant type Datum, handle other types + */ +typedef struct +{ + Oid relid; + RelationLocInfo *rel_loc_info; + Oid attrnum; + char *col_name; + long constant; /* assume long PGXCTODO - should be Datum */ +} Literal_Comparison; + +/* + * This struct helps us detect special conditions to determine what nodes + * to execute on. + */ +typedef struct +{ + List *partitioned_literal_comps; /* List of Literal_Comparison */ + List *partitioned_parent_child; + List *replicated_joins; + + /* + * Used when joining a single replicated or non-replicated table with + * other replicated tables. Use as a basis for partitioning determination. + */ + char *base_rel_name; + RelationLocInfo *base_rel_loc_info; + +} Special_Conditions; + +/* If two relations are joined based on special location information */ +typedef enum PGXCJoinType +{ + JOIN_REPLICATED, + JOIN_COLOCATED_PARTITIONED, + JOIN_OTHER +} PGXCJoinType; + +/* used to track which tables are joined */ +typedef struct +{ + int relid1; /* the first relation */ + char *aliasname1; + int relid2; /* the second relation */ + char *aliasname2; + + PGXCJoinType join_type; +} PGXC_Join; + +/* A list of List*'s, one for each relation. 
*/ +List *join_list = NULL; + +/* Forbid unsafe SQL statements */ +bool StrictStatementChecking = true; + +/* Forbid multi-node SELECT statements with an ORDER BY clause */ +bool StrictSelectChecking = false; + +/* + * Create a new join struct for tracking how relations are joined + */ +static PGXC_Join * +new_pgxc_join(int relid1, char *aliasname1, int relid2, char *aliasname2) +{ + PGXC_Join *pgxc_join = (PGXC_Join *) palloc(sizeof(PGXC_Join)); + + if (relid1 < relid2) + { + pgxc_join->relid1 = relid1; + pgxc_join->relid2 = relid2; + pgxc_join->aliasname1 = aliasname1; + pgxc_join->aliasname2 = aliasname2; + } + else + { + pgxc_join->relid1 = relid2; + pgxc_join->relid2 = relid1; + pgxc_join->aliasname1 = aliasname2; + pgxc_join->aliasname2 = aliasname1; + } + + pgxc_join->join_type = JOIN_OTHER; + + return pgxc_join; +} + + +/* + * Look up the join struct for a particular join + */ +static PGXC_Join * +find_pgxc_join(int relid1, char *aliasname1, int relid2, char *aliasname2) +{ + ListCell *lc; + + /* return if list is still empty */ + if (join_list == NULL) + return NULL; + + /* in the PGXC_Join struct, we always sort with relid1 < relid2 */ + if (relid2 < relid1) + { + int tmp = relid1; + char *tmpalias = aliasname1; + + relid1 = relid2; + aliasname1 = aliasname2; + relid2 = tmp; + aliasname2 = tmpalias; + } + + /* + * there should be a small number, so we just search linearly, although + * long term a hash table would be better. 
+ */ + foreach(lc, join_list) + { + PGXC_Join *pgxcjoin = (PGXC_Join *) lfirst(lc); + + if (pgxcjoin->relid1 == relid1 && pgxcjoin->relid2 == relid2 + && !strcmp(pgxcjoin->aliasname1, aliasname1) + && !strcmp(pgxcjoin->aliasname2, aliasname2)) + return pgxcjoin; + } + return NULL; +} + +/* + * Find or create a join between 2 relations + */ +static PGXC_Join * +find_or_create_pgxc_join(int relid1, char *aliasname1, int relid2, char *aliasname2) +{ + PGXC_Join *pgxcjoin; + + pgxcjoin = find_pgxc_join(relid1, aliasname1, relid2, aliasname2); + + if (pgxcjoin == NULL) + { + pgxcjoin = new_pgxc_join(relid1, aliasname1, relid2, aliasname2); + join_list = lappend(join_list, pgxcjoin); + } + + return pgxcjoin; +} + + +/* + * new_special_conditions - Allocate Special_Conditions struct and initialize + */ +static Special_Conditions * +new_special_conditions() +{ + Special_Conditions *special_conditions = + (Special_Conditions *) palloc0(sizeof(Special_Conditions)); + + return special_conditions; +} + +/* + * free Special_Conditions struct + */ +static void +free_special_relations(Special_Conditions * special_conditions) +{ + if (special_conditions == NULL) + return; + + /* free all items in list, including Literal_Comparison struct */ + list_free_deep(special_conditions->partitioned_literal_comps); + + /* free list, but not items pointed to */ + list_free(special_conditions->partitioned_parent_child); + list_free(special_conditions->replicated_joins); + + pfree(special_conditions); +} + +/* + * frees join_list + */ +static void +free_join_list() +{ + if (join_list == NULL) + return; + + /* free all items in list including PGXC_Join struct */ + list_free_deep(join_list); +} + +/* + * get_numeric_constant - extract casted constant + * + * Searches an expression to see if it is a Constant that is being cast + * to numeric. Return a pointer to the Constant, or NULL. + * We need this because of casting. 
+ */ +static Expr * +get_numeric_constant(Expr *expr) +{ + + if (expr == NULL) + return NULL; + + if (IsA(expr, Const)) + return expr; + + /* We may have a cast, represented by a function */ + if (IsA(expr, FuncExpr)) + { + FuncExpr *funcexpr = (FuncExpr *) expr; + + /* try and get at what is being cast */ + /* We may have an implicit double-cast, so we do this recurisvely */ + if (funcexpr->funcid == F_NUMERIC || funcexpr->funcid == F_INT4_NUMERIC) + { + return get_numeric_constant(linitial(funcexpr->args)); + } + } + + return NULL; +} + + +/* + * get_base_var_table_and_column - determine the base table and column + * + * This is required because a RangeTblEntry may actually be another + * type, like a join, and we need to then look at the joinaliasvars + * to determine what the base table and column really is. + */ +static Var * +get_base_var(Var * var, List *rtables) +{ + RangeTblEntry *rte; + + /* get the RangeTableEntry */ + rte = list_nth(rtables, var->varno - 1); + + if (rte->rtekind == RTE_RELATION) + return var; + else if (rte->rtekind == RTE_JOIN) + { + Var *colvar = list_nth(rte->joinaliasvars, var->varattno - 1); + + /* continue resolving recursively */ + return get_base_var(colvar, rtables); + } + else + { + return NULL; + } +} + + +/* + * get_plan_nodes_insert - determine nodes on which to execute insert. + */ +static List * +get_plan_nodes_insert(Query * query) +{ + RangeTblEntry *rte; + RelationLocInfo *rel_loc_info; + Const *constant; + List *nodelist; + ListCell *lc; + long part_value; + long *part_value_ptr = NULL; + + + nodelist = NULL; + + /* Looks complex (correlated?) 
- best to skip */ + if (query->jointree != NULL && query->jointree->fromlist != NULL) + return NULL; + + /* Make sure there is just one table */ + if (query->rtable == NULL || query->rtable->length != 1) + return NULL; + + rte = (RangeTblEntry *) lfirst(list_head(query->rtable)); + + if (rte != NULL && rte->rtekind != RTE_RELATION) + /* Bad relation type */ + return NULL; + + /* See if we have the partitioned case. */ + rel_loc_info = GetRelationLocInfo(rte->relid); + + if (!rel_loc_info) + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + (errmsg("Could not find relation for oid = %d", rte->relid)))); + + if (rel_loc_info->locatorType == LOCATOR_TYPE_HASH + && rel_loc_info->partAttrName != NULL) + { + /* It is a partitioned table, get value by looking in targetList */ + foreach(lc, query->targetList) + { + TargetEntry *tle = (TargetEntry *) lfirst(lc); + + if (tle->resjunk) + continue; + + /* + * See if we have a constant expression comparing against the + * designated partitioned column + */ + if (strcmp(tle->resname, rel_loc_info->partAttrName) == 0) + { + /* We may have a cast, try and handle it */ + Expr *checkexpr = get_numeric_constant(tle->expr); + + if (checkexpr == NULL) + break; /* no constant */ + + constant = (Const *) checkexpr; + + if (constant->consttype == INT4OID + || constant->consttype == INT2OID + || constant->consttype == INT8OID) + { + part_value = (long) constant->constvalue; + part_value_ptr = &part_value; + + } + /* PGXCTODO - handle other data types */ + /* + else + if (constant->consttype == VARCHAR ... + */ + } + } + } + + /* single call handles both replicated and partitioned types */ + nodelist = GetRelationNodes(rel_loc_info, part_value_ptr, false); + + return nodelist; +} + + +/* + * examine_conditions + * + * Examine conditions and find special ones to later help us determine + * what tables can be joined together. Put findings in Special_Conditions + * struct. 
+ * + * Get list of constant comparisons conditions on partitioned column + * Get list of parent-child joins (partitioned together) + * Get list of joins with replicated tables + * + * If we encounter a cross-node join, we stop processing and return false, + * otherwise true. + * + * PGXCTODO: Recognize subqueries, and give up (long term allow safe ones). + * + */ +static bool +examine_conditions(Special_Conditions * conditions, List *rtables, Node *expr_node) +{ + char *rel_name, + *rel_name2; + char *col_name, + *col_name2; + RelationLocInfo *rel_loc_info1, + *rel_loc_info2; + Const *constant; + Expr *checkexpr; + + + if (expr_node == NULL) + return true; + + if (rtables == NULL) + return true; + + if (conditions == NULL) + conditions = new_special_conditions(); + + if (IsA(expr_node, BoolExpr)) + { + BoolExpr *boolexpr = (BoolExpr *) expr_node; + + /* Recursively handle ANDed expressions, but don't handle others */ + if (boolexpr->boolop == AND_EXPR) + { + if (!examine_conditions(conditions, rtables, + linitial(boolexpr->args))) + return false; + + return examine_conditions( + conditions, rtables, lsecond(boolexpr->args)); + } + else if (boolexpr->boolop == OR_EXPR) + { + /* + * look at OR's as work-around for reported issue. + * NOTE: THIS IS NOT CORRECT, BUT JUST DONE FOR THE PROTOTYPE. + * More rigorous + * checking needs to be done. PGXCTODO: Add careful checking for + * OR'ed conditions... 
+ */ + if (!examine_conditions(conditions, rtables, + linitial(boolexpr->args))) + return false; + + return examine_conditions( + conditions, rtables, lsecond(boolexpr->args)); + } + else + /* looks complicated, give up */ + return false; + + return true; + } + + + if (IsA(expr_node, OpExpr)) + { + OpExpr *opexpr = (OpExpr *) expr_node; + + /* See if we can equijoin these */ + if (op_mergejoinable(opexpr->opno) && opexpr->args->length == 2) + { + Expr *arg1 = linitial(opexpr->args); + Expr *arg2 = lsecond(opexpr->args); + + /* Look for a table */ + if (IsA(arg1, Var)) + { + RangeTblEntry *rte1, + *rte2; + + /* get the RangeTableEntry */ + Var *colvar = (Var *) arg1; + + colvar = get_base_var(colvar, rtables); + + if (!colvar) + return false; + + rte1 = list_nth(rtables, colvar->varno - 1); + + rel_name = get_rel_name(rte1->relid); + col_name = strVal(list_nth(rte1->eref->colnames, + colvar->varattno - 1)); + + /* Look at other argument */ + + /* We may have a cast, try and handle it */ + checkexpr = get_numeric_constant(arg2); + + if (checkexpr != NULL) + arg2 = checkexpr; + + if (IsA(arg2, Const)) + { + /* We have column = literal. 
Check if partitioned case */ + constant = (Const *) arg2; + + rel_loc_info1 = GetRelationLocInfo(rte1->relid); + + if (!rel_loc_info1) + return false; + + /* If hash partitioned, check if the part column was used */ + if (IsHashColumn(rel_loc_info1, col_name)) + { + /* add to partitioned literal join conditions */ + Literal_Comparison *lit_comp = + palloc(sizeof(Literal_Comparison)); + + lit_comp->relid = rte1->relid; + lit_comp->rel_loc_info = rel_loc_info1; + lit_comp->col_name = col_name; + lit_comp->constant = constant->constvalue; + + conditions->partitioned_literal_comps = lappend( + conditions->partitioned_literal_comps, + lit_comp); + + return true; + } + else + { + /* unimportant comparison, just return */ + if (rel_loc_info1) + FreeRelationLocInfo(rel_loc_info1); + return true; + } + + } + else if (IsA(arg2, Var)) + { + PGXC_Join *pgxc_join; + Var *colvar2 = (Var *) arg2; + + rel_loc_info1 = GetRelationLocInfo(rte1->relid); + + if (!rel_loc_info1) + return false; + + colvar2 = get_base_var(colvar2, rtables); + if (!colvar2) + return false; + rte2 = list_nth(rtables, colvar2->varno - 1); + rel_name2 = get_rel_name(rte2->relid); + rel_loc_info2 = GetRelationLocInfo(rte2->relid); + + /* get data struct about these two relations joining */ + pgxc_join = find_or_create_pgxc_join(rte1->relid, rte1->eref->aliasname, + rte2->relid, rte2->eref->aliasname); + + /* + * pgxc_join->condition_list = + * lappend(pgxc_join->condition_list, opexpr); + */ + + if (rel_loc_info1->locatorType == LOCATOR_TYPE_REPLICATED) + { + /* add to replicated join conditions */ + conditions->replicated_joins = + lappend(conditions->replicated_joins, opexpr); + + if (rel_loc_info2->locatorType != LOCATOR_TYPE_REPLICATED) + { + /* Note other relation, saves us work later. 
*/ + conditions->base_rel_name = rel_name2; + conditions->base_rel_loc_info = rel_loc_info2; + if (rel_loc_info1) + FreeRelationLocInfo(rel_loc_info1); + } + + if (conditions->base_rel_name == NULL) + { + conditions->base_rel_name = rel_name; + conditions->base_rel_loc_info = rel_loc_info1; + if (rel_loc_info2) + FreeRelationLocInfo(rel_loc_info2); + } + + /* note nature of join between the two relations */ + pgxc_join->join_type = JOIN_REPLICATED; + return true; + } + + if (rel_loc_info2->locatorType == LOCATOR_TYPE_REPLICATED) + { + /* add to replicated join conditions */ + conditions->replicated_joins = + lappend(conditions->replicated_joins, opexpr); + + /* other relation not replicated, note it for later */ + conditions->base_rel_name = rel_name; + conditions->base_rel_loc_info = rel_loc_info1; + + /* note nature of join between the two relations */ + pgxc_join->join_type = JOIN_REPLICATED; + + if (rel_loc_info2) + FreeRelationLocInfo(rel_loc_info2); + + return true; + } + + /* Now check for a partitioned join */ + + /* + * PGXCTODO - for the prototype, we assume all partitioned + * tables are on the same nodes. + */ + col_name2 = strVal(list_nth(rte2->eref->colnames, + colvar2->varattno - 1)); + + if (IsHashColumn(rel_loc_info1, col_name) + && IsHashColumn(rel_loc_info2, col_name2)) + { + /* We found a partitioned join */ + conditions->partitioned_parent_child = + lappend(conditions->partitioned_parent_child, + opexpr); + pgxc_join->join_type = JOIN_COLOCATED_PARTITIONED; + return true; + } + + /* + * At this point, there is some other type of join that + * can probably not be executed on only a single node. + * Just return. Important: We preserve previous + * pgxc_join->join_type value, there may be multiple + * columns joining two tables, and we want to make sure at + * least one of them make it colocated partitioned, in + * which case it will update it when examining another + * condition. 
+ */ + return true; + } + else + return true; + + } + } + /* PGXCTODO - need to more finely examine other operators */ + } + + return true; +} + +/* + * examine_conditions_fromlist - Examine FROM clause for joins + * + * Examine FROM clause join conditions to determine special conditions + * to help us decide which nodes to execute on. + */ +static bool +examine_conditions_fromlist(Special_Conditions * conditions, List *rtables, + Node *treenode) +{ + + if (treenode == NULL) + return true; + + if (rtables == NULL) + return true; + + if (conditions == NULL) + conditions = new_special_conditions(); + + if (IsA(treenode, JoinExpr)) + { + JoinExpr *joinexpr = (JoinExpr *) treenode; + + /* recursively examine FROM join tree */ + if (!examine_conditions_fromlist(conditions, rtables, joinexpr->larg)) + return false; + + if (!examine_conditions_fromlist(conditions, rtables, joinexpr->rarg)) + return false; + + /* Now look at join condition */ + if (!examine_conditions(conditions, rtables, joinexpr->quals)) + return false; + return true; + } + else if (IsA(treenode, RangeTblRef)) + { + return true; + } + else if (IsA(treenode, BoolExpr) ||IsA(treenode, OpExpr)) + { + /* check base condition, if possible */ + if (!examine_conditions(conditions, rtables, treenode)) + return false; + } + + /* Some other more complicated beast */ + return false; +} + + +/* + * get_plan_nodes - determine the nodes to execute the command on. + * + * Examines the "special" query conditions in determining execution node list. + * + * returns NULL if it appears to be a mutli-step query. 
+ */ +static List * +get_plan_nodes(Query_Plan * query_plan, Query * query, bool isRead) +{ + RangeTblEntry *rte; + List *test_nodelist; + List *nodelist; + ListCell *lc, + *item; + Special_Conditions *special_conditions; + OpExpr *opexpr; + Var *colvar; + RelationLocInfo *rel_loc_info; + + + nodelist = NULL; + join_list = NULL; + + /* If no tables, just return */ + if (query->rtable == NULL && query->jointree == NULL) + return NULL; + + /* Alloc and init struct */ + special_conditions = new_special_conditions(); + + /* Look for special conditions */ + + /* Look for JOIN syntax joins */ + foreach(item, query->jointree->fromlist) + { + Node *treenode = (Node *) lfirst(item); + + if (IsA(treenode, JoinExpr)) + { + if (!examine_conditions_fromlist(special_conditions, query->rtable, + treenode)) + { + /* if too complicated, just return NULL */ + free_special_relations(special_conditions); + free_join_list(); + return NULL; + } + } + else if (!IsA(treenode, RangeTblRef)) + { + /* could be complicated */ + free_special_relations(special_conditions); + free_join_list(); + return NULL; + } + } + + + /* Examine the WHERE clause, too */ + if (!examine_conditions(special_conditions, query->rtable, + query->jointree->quals)) + { + /* if cross joins may exist, just return NULL */ + free_special_relations(special_conditions); + free_join_list(); + return NULL; + } + + /* Examine join conditions, see if each join is single-node safe */ + if (join_list != NULL) + { + foreach(lc, join_list) + { + PGXC_Join *pgxcjoin = (PGXC_Join *) lfirst(lc); + + /* If it is not replicated or parent-child, not single-node safe */ + if (pgxcjoin->join_type == JOIN_OTHER) + { + free_special_relations(special_conditions); + free_join_list(); + return NULL; + } + } + } + + + /* check for non-partitioned cases */ + if (special_conditions->partitioned_parent_child == NULL && + special_conditions->partitioned_literal_comps == NULL) + { + if (special_conditions->replicated_joins == NULL + && 
(query->rtable == NULL || query->rtable->length > 1)) + + /* + * This is too complicated for a single step, or there is no FROM + * clause + */ + nodelist = NULL; + else + { + /* + * We have either a single table, just replicated tables, or a + * table that just joins with replicated tables. + */ + + /* See if we noted a table earlier to use */ + rel_loc_info = special_conditions->base_rel_loc_info; + + if (rel_loc_info == NULL) + { + /* a single table, just grab it */ + rte = (RangeTblEntry *) linitial(query->rtable); + rel_loc_info = GetRelationLocInfo(rte->relid); + + if (!rel_loc_info) + return false; + } + + nodelist = GetRelationNodes(rel_loc_info, NULL, isRead); + } + } + /* check for partitioned col comparison against a literal */ + else if (special_conditions->partitioned_literal_comps != NULL + && special_conditions->partitioned_literal_comps->length > 0) + { + nodelist = NULL; + + /* + * Make sure that if there are multiple such comparisons, that they + * are all on the same nodes. + */ + foreach(lc, special_conditions->partitioned_literal_comps) + { + Literal_Comparison *lit_comp = (Literal_Comparison *) lfirst(lc); + + test_nodelist = GetRelationNodes( + lit_comp->rel_loc_info, &(lit_comp->constant), true); + + if (nodelist == NULL) + nodelist = test_nodelist; + else + { + if (nodelist->length > 1 || test_nodelist->length > 1) + /* there should only be one */ + nodelist = NULL; + else + { + /* Make sure they use the same nodes */ + if (linitial_int(test_nodelist) != linitial_int(nodelist)) + nodelist = NULL; + } + } + } + } + else + { + /* + * At this point, we have partitioned parent child relationship, with + * no partitioned column comparison condition with a literal. We just + * use one of the tables as a basis for node determination. 
+ */ + opexpr = (OpExpr *) linitial(special_conditions->partitioned_parent_child); + + colvar = (Var *) linitial(opexpr->args); + + /* get the RangeTableEntry */ + rte = list_nth(query->rtable, colvar->varno - 1); + rel_loc_info = GetRelationLocInfo(rte->relid); + + if (!rel_loc_info) + return false; + + nodelist = GetRelationNodes(rel_loc_info, NULL, isRead); + } + free_special_relations(special_conditions); + free_join_list(); + + return nodelist; +} + + +/* + * get_plan_nodes - determine the nodes to execute the plan on + * + * return NULL if it is not safe to be done in a single step. + */ +static List * +get_plan_nodes_command(Query_Plan * query_plan, Query * query) +{ + + switch (query->commandType) + { + case CMD_SELECT: + return get_plan_nodes(query_plan, query, true); + + case CMD_INSERT: + return get_plan_nodes_insert(query); + + case CMD_UPDATE: + /* treat as a select */ + return get_plan_nodes(query_plan, query, false); + + case CMD_DELETE: + /* treat as a select */ + return get_plan_nodes(query_plan, query, false); + + default: + return NULL; + } +} + + +/* + * Get list of simple aggregates used. + * For now we only allow MAX in the first column, and return a list of one. 
+ */ +static List * +get_simple_aggregates(Query * query, List *nodelist) +{ + List *simple_agg_list = NULL; + + /* Check for simple multi-node aggregate */ + if (nodelist != NULL && nodelist->length > 1 && query->hasAggs) + { + TargetEntry *tle; + + /* + * long term check for group by, but for prototype just allow 1 simple + * expression + */ + if (query->targetList->length != 1) + return NULL; + + tle = (TargetEntry *) linitial(query->targetList); + + if (IsA(tle->expr, Aggref)) + { + SimpleAgg *simple_agg; + Aggref *aggref = (Aggref *) tle->expr; + + /* Just consider numeric max functions for prototype */ + if (!(aggref->aggfnoid >= 2115 && aggref->aggfnoid <= 2121)) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Multinode aggregate for this function currently not supported"))); + } + + simple_agg = (SimpleAgg *) palloc(sizeof(SimpleAgg)); + simple_agg->agg_type = AGG_TYPE_MAX; + simple_agg->column_pos = 1; + simple_agg->agg_data_type = aggref->aggtype; + simple_agg->response_count = 0; + + simple_agg_list = lappend(simple_agg_list, simple_agg); + } + } + + return simple_agg_list; +} + + +/* + * Build up a QueryPlan to execute on. + * + * For the prototype, there will only be one step, + * and the nodelist will be NULL if it is not a PGXC-safe statement. + */ +Query_Plan * +GetQueryPlan(Node *parsetree, const char *sql_statement, List *querytree_list) +{ + Query_Plan *query_plan = palloc(sizeof(Query_Plan)); + Query_Step *query_step = palloc(sizeof(Query_Step)); + Query *query; + + + query_plan->force_autocommit = false; + + query_step->sql_statement = (char *) palloc(strlen(sql_statement) + 1); + strcpy(query_step->sql_statement, sql_statement); + query_step->nodelist = NULL; + query_step->simple_aggregates = NULL; + + query_plan->query_step_list = lappend(NULL, query_step); + + /* + * Determine where to execute the command, either at the Coordinator + * level, Data Nodes, or both. By default we choose both. 
We should be + * able to quickly expand this for more commands. + */ + switch (nodeTag(parsetree)) + { + case T_SelectStmt: + case T_InsertStmt: + case T_UpdateStmt: + case T_DeleteStmt: + /* just use first one in querytree_list */ + query = (Query *) linitial(querytree_list); + query_step->nodelist = + get_plan_nodes_command(query_plan, query); + query_step->simple_aggregates = + get_simple_aggregates(query, query_step->nodelist); + + /* + * See if it is a SELECT with no relations, like SELECT 1+1 or + * SELECT nextval('fred'), and just use coord. + */ + query = (Query *) linitial(querytree_list); + if (query_step->nodelist == NULL + && (query->jointree->fromlist == NULL + || query->jointree->fromlist->length == 0)) + /* Just execute it on Coordinator */ + query_plan->exec_loc_type = EXEC_ON_COORD; + else + { + query_plan->exec_loc_type = EXEC_ON_DATA_NODES; + + if (query_step->nodelist == NULL) + { + bool is_pg_catalog = false; + + /* before giving up, see if we are dealing with pg_catalog */ + if (nodeTag(parsetree) == T_SelectStmt) + { + ListCell *lc; + + is_pg_catalog = true; + foreach(lc, query->rtable) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); + + /* hack so that pg_catalog queries can run */ + if (rte->relid >= FirstNormalObjectId) + { + is_pg_catalog = false; + break; + } + } + if (is_pg_catalog) + query_plan->exec_loc_type = EXEC_ON_COORD; + } + + /* + * If the nodelist is NULL, it is not safe for us to + * execute + */ + if (!is_pg_catalog && StrictStatementChecking) + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + (errmsg("Cannot safely execute statement in a single step.")))); + } + } + + /* + * PG-XC cannot yet support some variations of SQL statements. + * We perform some checks to at least catch common cases + */ + + /* + * Check if we have multiple nodes and an unsupported clause. 
This + * is temporary until we expand supported SQL + */ + if (nodeTag(parsetree) == T_SelectStmt) + { + if (query->intoClause) + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + (errmsg("INTO clause not yet supported")))); + + if (query->setOperations) + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + (errmsg("UNION, INTERSECT and EXCEPT are not yet supported")))); + + if (query_step->nodelist && query_step->nodelist->length > 1 && StrictStatementChecking) + { + /* + * PGXCTODO - this could be improved to check if the first + * group by expression is the partitioning column + */ + if (query->groupClause) + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + (errmsg("Multi-node GROUP BY not yet supported")))); + if (query->limitCount && StrictSelectChecking) + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + (errmsg("Multi-node LIMIT not yet supported")))); + if (query->sortClause && StrictSelectChecking) + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + (errmsg("Multi-node ORDER BY not yet supported")))); + /* PGXCTODO - check if first column partitioning column */ + if (query->distinctClause) + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + (errmsg("Multi-node DISTINCT`not yet supported")))); + if (query->hasAggs) + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + (errmsg("Multi-node aggregates not yet supported")))); + } + } + break; + + /* Statements that we only want to execute on the Coordinator */ + case T_AlterSeqStmt: + case T_CommentStmt: + case T_CreateSeqStmt: + case T_VariableShowStmt: + query_plan->exec_loc_type = EXEC_ON_COORD; + break; + + /* DROP */ + case T_DropStmt: + if (((DropStmt *) parsetree)->removeType == OBJECT_SEQUENCE) + query_plan->exec_loc_type = EXEC_ON_COORD; + else + query_plan->exec_loc_type = EXEC_ON_COORD | EXEC_ON_DATA_NODES; + break; + + /* + * Statements that need to run in autocommit mode, on Coordinator + * and Data Nodes with suppressed implicit 
two phase commit. + */ + case T_CheckPointStmt: + case T_ClusterStmt: + case T_CreatedbStmt: + case T_DropdbStmt: + case T_VacuumStmt: + query_plan->exec_loc_type = EXEC_ON_COORD | EXEC_ON_DATA_NODES; + query_plan->force_autocommit = true; + break; + + /* + * Statements that we execute on both the Coordinator and Data Nodes + */ + case T_AlterTableStmt: + case T_AlterDatabaseStmt: + case T_AlterDatabaseSetStmt: + case T_AlterDomainStmt: + case T_AlterObjectSchemaStmt: + case T_ConstraintsSetStmt: + case T_CreateDomainStmt: + case T_CreateEnumStmt: + case T_CreateStmt: + case T_CreateSchemaStmt: + case T_DeallocateStmt: /* Allow for DEALLOCATE ALL */ + case T_DiscardStmt: + case T_IndexStmt: + case T_LockStmt: + case T_ReindexStmt: + case T_RenameStmt: + case T_TruncateStmt: + case T_VariableSetStmt: + + /* + * Also support these, should help later with pg_restore, although + * not very useful because of the pooler using the same user + */ + case T_GrantStmt: + case T_GrantRoleStmt: + case T_CreateRoleStmt: + case T_AlterRoleStmt: + case T_DropRoleStmt: + case T_AlterOwnerStmt: + case T_DropOwnedStmt: + case T_ReassignOwnedStmt: + query_plan->exec_loc_type = EXEC_ON_COORD | EXEC_ON_DATA_NODES; + break; + + + case T_TransactionStmt: + switch (((TransactionStmt *) parsetree)->kind) + { + case TRANS_STMT_SAVEPOINT: + case TRANS_STMT_RELEASE: + case TRANS_STMT_ROLLBACK_TO: + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + (errmsg("This type of transaction statement not yet supported")))); + break; + + default: + break; /* keep compiler quiet */ + } + query_plan->exec_loc_type = EXEC_ON_COORD | EXEC_ON_DATA_NODES; + break; + + /* + * For now, pick one of the data nodes until we modify real + * planner It will give an approximate idea of what an isolated + * data node will do + */ + case T_ExplainStmt: + query_step->nodelist = lappend_int(query_step->nodelist, GetAnyDataNode()); + query_plan->exec_loc_type = EXEC_ON_DATA_NODES; + break; + + /* + * Statements 
we do not yet want to handle. + * By default they would be fobidden, but we list these for reference. + * Note that there is not a 1-1 correspndence between + * SQL command and the T_*Stmt structures. + */ + case T_AlterFdwStmt: + case T_AlterForeignServerStmt: + case T_AlterFunctionStmt: + case T_AlterOpFamilyStmt: + case T_AlterTSConfigurationStmt: + case T_AlterTSDictionaryStmt: + case T_AlterUserMappingStmt: + case T_ClosePortalStmt: + case T_CompositeTypeStmt: + case T_CreateCastStmt: + case T_CreateConversionStmt: + case T_CreateFdwStmt: + case T_CreateFunctionStmt: + case T_CreateForeignServerStmt: + case T_CreateOpClassStmt: + case T_CreateOpFamilyStmt: + case T_CreatePLangStmt: + case T_CreateTableSpaceStmt: + case T_CreateTrigStmt: + case T_CreateUserMappingStmt: + case T_DeclareCursorStmt: + case T_DefineStmt: /* used for aggregates, some types */ + case T_DropCastStmt: + case T_DropFdwStmt: + case T_DropForeignServerStmt: + case T_DropPLangStmt: + case T_DropPropertyStmt: + case T_DropTableSpaceStmt: + case T_ExecuteStmt: + case T_FetchStmt: + case T_ListenStmt: + case T_LoadStmt: + case T_NotifyStmt: + case T_PrepareStmt: + case T_RemoveFuncStmt: + case T_RemoveOpClassStmt: + case T_RemoveOpFamilyStmt: + case T_RuleStmt: + case T_UnlistenStmt: + case T_ViewStmt: + /* fall through */ + default: + /* Allow for override */ + if (StrictStatementChecking) + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + (errmsg("This command is not yet supported.")))); + else + query_plan->exec_loc_type = EXEC_ON_COORD | EXEC_ON_DATA_NODES; + break; + } + + + return query_plan; +} + + +/* + * Free Query_Step struct + */ +static void +free_query_step(Query_Step * query_step) +{ + if (query_step == NULL) + return; + + pfree(query_step->sql_statement); + list_free(query_step->nodelist); + if (query_step->simple_aggregates != NULL) + list_free_deep(query_step->simple_aggregates); + pfree(query_step); +} + +/* + * Free Query_Plan struct + */ +void 
+FreeQueryPlan(Query_Plan * query_plan) +{ + ListCell *item; + + if (query_plan == NULL) + return; + + foreach(item, query_plan->query_step_list) + { + free_query_step((Query_Step *) lfirst_int(item)); + } + + pfree(query_plan->query_step_list); + pfree(query_plan); +} diff --git a/src/backend/pgxc/pool/Makefile b/src/backend/pgxc/pool/Makefile new file mode 100644 index 0000000000..7143af5d97 --- /dev/null +++ b/src/backend/pgxc/pool/Makefile @@ -0,0 +1,19 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for pool +# +# Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation +# +# IDENTIFICATION +# $PostgreSQL$ +# +#------------------------------------------------------------------------- + +subdir = src/backend/pgxc/pool +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = combiner.o datanode.o poolmgr.o poolcomm.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/pgxc/pool/combiner.c b/src/backend/pgxc/pool/combiner.c new file mode 100644 index 0000000000..da59c5f6af --- /dev/null +++ b/src/backend/pgxc/pool/combiner.c @@ -0,0 +1,375 @@ +/*------------------------------------------------------------------------- + * + * combiner.c + * + * Combine responses from multiple Data Nodes + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * IDENTIFICATION + * $$ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "pgxc/combiner.h" +#include "pgxc/planner.h" +#include "catalog/pg_type.h" +#include "libpq/libpq.h" +#include "libpq/pqformat.h" +#include "utils/builtins.h" + + +/* + * Create a structure to store parameters needed to combine responses from + * multiple connections as well as state information + */ +ResponseCombiner +CreateResponseCombiner(int node_count, 
CombineType combine_type, + CommandDest dest) +{ + ResponseCombiner combiner; + + /* ResponseComber is a typedef for pointer to ResponseCombinerData */ + combiner = (ResponseCombiner) palloc(sizeof(ResponseCombinerData)); + if (combiner == NULL) + { + /* Out of memory */ + return combiner; + } + + combiner->node_count = node_count; + combiner->combine_type = combine_type; + combiner->dest = dest; + combiner->command_complete_count = 0; + combiner->row_count = 0; + combiner->request_type = REQUEST_TYPE_NOT_DEFINED; + combiner->description_count = 0; + combiner->simple_aggregates = NULL; + + return combiner; +} + +/* + * Parse out row count from the command status response and convert it to integer + */ +static int +parse_row_count(const char *message, size_t len, int *rowcount) +{ + int digits = 0; + + *rowcount = 0; + /* skip \0 string terminator */ + len--; + while (len-- > 0 && message[len] >= '0' && message[len] <= '9') + { + *rowcount = *rowcount * 10 + message[len] - '0'; + digits++; + } + return digits; +} + +/* + * Extract the aggregate element result + * returns a boolean indicating whether or not it was a short message + */ +static unsigned long +parse_aggregate_value(SimpleAgg * simple_agg, char *msg_body, size_t len) +{ + char *valstr; + + Assert(len >= 7); + + /* PGXCTODO - handle pos (position) */ + /* PGXCTODO - handle other types like TEXT */ + + /* skip first 2 bytes */ + if (simple_agg->data_len == 0) + memcpy(&(simple_agg->data_len), &(msg_body[2]), 4); + + valstr = (char *) palloc(simple_agg->data_len + 1); + strncpy(valstr, &(msg_body[6]), simple_agg->data_len); + valstr[simple_agg->data_len - 1] = '\0'; + + return atol(valstr); +} + + +/* + * Process a result from a node for the aggregate function + * returns a boolean indicating whether or not it was a short message + */ +static void +process_aggregate_element(List *simple_aggregates, char *msg_body, size_t len) +{ + ListCell *lc; + + foreach(lc, simple_aggregates) + { + unsigned long 
col_value; + SimpleAgg *simple_agg = (SimpleAgg *) lfirst(lc); + + /* PGXCTODO may need to support numeric, too. */ + col_value = parse_aggregate_value(simple_agg, msg_body, len); + + switch (simple_agg->agg_type) + { + case AGG_TYPE_MAX: + /* If it is the first one, take it */ + if (simple_agg->response_count == 0) + { + /* PGXCTODO - type checking */ + simple_agg->ulong_value = col_value; + } + else + { + if (col_value > simple_agg->ulong_value) + simple_agg->ulong_value = col_value; + } + break; + + default: + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Unknown aggregate type: %d", + simple_agg->agg_type))); + } + + } +} + + +/* + * Handle response message and update combiner's state. + * This function contains main combiner logic + */ +int +CombineResponse(ResponseCombiner combiner, char msg_type, char *msg_body, size_t len) +{ + int rowcount; + int digits = 0; + + switch (msg_type) + { + case 'C': /* CommandComplete */ + /* + * If we did not receive description we are having rowcount or OK + * response + */ + if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED) + combiner->request_type = REQUEST_TYPE_COMMAND; + /* Extract rowcount */ + if (combiner->combine_type != COMBINE_TYPE_NONE) + { + digits = parse_row_count(msg_body, len, &rowcount); + if (digits > 0) + combiner->row_count += rowcount; + else + combiner->combine_type = COMBINE_TYPE_NONE; + } + if (++combiner->command_complete_count == combiner->node_count) + { + + if (combiner->dest == DestRemote + || combiner->dest == DestRemoteExecute) + { + if (combiner->combine_type == COMBINE_TYPE_NONE) + { + pq_putmessage(msg_type, msg_body, len); + } + else + { + char command_complete_buffer[256]; + + rowcount = combiner->combine_type == COMBINE_TYPE_SUM ? 
+ combiner->row_count : + combiner->row_count / combiner->node_count; + /* Truncate msg_body to get base string */ + msg_body[len - digits - 1] = '\0'; + len = sprintf(command_complete_buffer, "%s%d", msg_body, rowcount) + 1; + pq_putmessage(msg_type, command_complete_buffer, len); + } + } + } + break; + case 'T': /* RowDescription */ + if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED) + combiner->request_type = REQUEST_TYPE_QUERY; + if (combiner->request_type != REQUEST_TYPE_QUERY) + { + /* Inconsistent responses */ + return EOF; + } + /* Proxy first */ + if (combiner->description_count++ == 0) + { + if (combiner->dest == DestRemote + || combiner->dest == DestRemoteExecute) + pq_putmessage(msg_type, msg_body, len); + } + break; + case 'G': /* CopyInResponse */ + if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED) + combiner->request_type = REQUEST_TYPE_COPY_IN; + if (combiner->request_type != REQUEST_TYPE_COPY_IN) + { + /* Inconsistent responses */ + return EOF; + } + /* Proxy first */ + if (combiner->description_count++ == 0) + { + if (combiner->dest == DestRemote + || combiner->dest == DestRemoteExecute) + pq_putmessage(msg_type, msg_body, len); + } + break; + case 'H': /* CopyOutResponse */ + if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED) + combiner->request_type = REQUEST_TYPE_COPY_OUT; + if (combiner->request_type != REQUEST_TYPE_COPY_OUT) + { + /* Inconsistent responses */ + return EOF; + } + /* Proxy first */ + if (combiner->description_count++ == 0) + { + if (combiner->dest == DestRemote + || combiner->dest == DestRemoteExecute) + pq_putmessage(msg_type, msg_body, len); + } + break; + case 'D': /* DataRow */ + if (combiner->simple_aggregates == NULL) + { + if (combiner->dest == DestRemote + || combiner->dest == DestRemoteExecute) + pq_putmessage(msg_type, msg_body, len); + } + else + { + SimpleAgg *simple_agg = (SimpleAgg *) linitial(combiner->simple_aggregates); + + /* Handle aggregates */ + /* Process single node result */ + 
process_aggregate_element( + combiner->simple_aggregates, + msg_body, len); + + /* + * See if we are done with all nodes. Only then do we send one + * DataRow result. + */ + + if (++simple_agg->response_count + == combiner->node_count) + { + char longstr[21]; + int longlen; + + StringInfo data_buffer; + + data_buffer = makeStringInfo(); + + /* + * longlen = sprintf(longstr, "%lu", + * simple_agg->ulong_value); + */ + + pg_ltoa(simple_agg->ulong_value, longstr); + longlen = strlen(longstr); + + pq_beginmessage(data_buffer, 'D'); + pq_sendbyte(data_buffer, msg_body[0]); + pq_sendbyte(data_buffer, msg_body[1]); + pq_sendint(data_buffer, longlen, 4); + pq_sendtext(data_buffer, longstr, longlen); + pq_putmessage(msg_type, + data_buffer->data, + data_buffer->len); + + pfree(data_buffer->data); + pfree(data_buffer); + } + } + break; + case 'E': /* ErrorResponse */ + case 'A': /* NotificationResponse */ + case 'N': /* NoticeResponse */ + /* Always proxy */ + if (combiner->dest == DestRemote + || combiner->dest == DestRemoteExecute) + pq_putmessage(msg_type, msg_body, len); + break; + case 'I': /* EmptyQuery */ + default: + /* Unexpected message */ + return EOF; + } + return 0; +} + +/* + * Examine the specified combiner state and determine if command was completed + * successfully + */ +static bool +validate_combiner(ResponseCombiner combiner) +{ + /* Check all nodes completed */ + if (combiner->command_complete_count != combiner->node_count) + return false; + + /* Check count of description responses */ + if (combiner->request_type != REQUEST_TYPE_COMMAND + && combiner->description_count != combiner->node_count) + return false; + + /* Add other checks here as needed */ + + /* All is good if we are here */ + return true; +} + +/* + * Validate combiner and release storage freeing allocated memory + */ +bool +ValidateAndCloseCombiner(ResponseCombiner combiner) +{ + bool valid = validate_combiner(combiner); + + pfree(combiner); + + return valid; +} + +/* + * Validate combiner 
and reset storage + */ +bool +ValidateAndResetCombiner(ResponseCombiner combiner) +{ + bool valid = validate_combiner(combiner); + + combiner->command_complete_count = 0; + combiner->row_count = 0; + combiner->request_type = REQUEST_TYPE_NOT_DEFINED; + combiner->description_count = 0; + combiner->simple_aggregates = NULL; + + return valid; +} + +/* + * Assign combiner aggregates + */ +void +AssignCombinerAggregates(ResponseCombiner combiner, List *simple_aggregates) +{ + combiner->simple_aggregates = simple_aggregates; +} diff --git a/src/backend/pgxc/pool/datanode.c b/src/backend/pgxc/pool/datanode.c new file mode 100644 index 0000000000..9b3d40a785 --- /dev/null +++ b/src/backend/pgxc/pool/datanode.c @@ -0,0 +1,1701 @@ +/*------------------------------------------------------------------------- + * + * datanode.c + * + * Functions for the coordinator communicating with the data nodes + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * IDENTIFICATION + * $$ + * + * + *------------------------------------------------------------------------- + */ + +#include <sys/select.h> +#include <sys/time.h> +#include <sys/types.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include "pgxc/poolmgr.h" +#include "access/gtm.h" +#include "access/transam.h" +#include "access/xact.h" +#include "postgres.h" +#include "utils/snapmgr.h" +#include "gtm/gtm_c.h" +#include "pgxc/datanode.h" +#include "../interfaces/libpq/libpq-fe.h" +#include "utils/elog.h" +#include "utils/memutils.h" + + +#define NO_SOCKET -1 + +static int node_count = 0; +static DataNodeHandle *handles = NULL; +static bool autocommit = true; +static DataNodeHandle **write_node_list = NULL; +static int write_node_count = 0; + +static DataNodeHandle **get_handles(List *nodelist); +static int get_transaction_nodes(DataNodeHandle ** connections); +static void 
release_handles(void); + +static void data_node_init(DataNodeHandle * handle, int sock); +static void data_node_free(DataNodeHandle * handle); + +static int data_node_begin(int conn_count, DataNodeHandle ** connections, ResponseCombiner combiner, GlobalTransactionId gxid); +static int data_node_commit(int conn_count, DataNodeHandle ** connections, ResponseCombiner combiner); +static int data_node_rollback(int conn_count, DataNodeHandle ** connections, ResponseCombiner combiner); + +static int ensure_in_buffer_capacity(size_t bytes_needed, DataNodeHandle * handle); +static int ensure_out_buffer_capacity(size_t bytes_needed, DataNodeHandle * handle); + +static int data_node_send_query(DataNodeHandle * handle, const char *query); +static int data_node_send_gxid(DataNodeHandle * handle, GlobalTransactionId gxid); +static int data_node_send_snapshot(DataNodeHandle * handle, Snapshot snapshot); + +static void add_error_message(DataNodeHandle * handle, const char *message); + +static int data_node_read_data(DataNodeHandle * conn); +static int handle_response(DataNodeHandle * conn, ResponseCombiner combiner, bool inErrorState); + +static int get_int(DataNodeHandle * conn, size_t len, int *out); +static int get_char(DataNodeHandle * conn, char *out); + +static void clear_write_node_list(); + +#define MAX_STATEMENTS_PER_TRAN 10 + +/* Variables to collect statistics */ +static int total_transactions = 0; +static int total_statements = 0; +static int total_autocommit = 0; +static int nonautocommit_2pc = 0; +static int autocommit_2pc = 0; +static int current_tran_statements = 0; +static int *statements_per_transaction = NULL; +static int *nodes_per_transaction = NULL; + +/* + * statistics collection: count a statement + */ +static void +stat_statement() +{ + total_statements++; + current_tran_statements++; +} + +/* + * To collect statistics: count a transaction + */ +static void +stat_transaction(int node_count) +{ + total_transactions++; + if (autocommit) + total_autocommit++; 
+ if (!statements_per_transaction) + { + statements_per_transaction = (int *) malloc((MAX_STATEMENTS_PER_TRAN + 1) * sizeof(int)); + memset(statements_per_transaction, 0, (MAX_STATEMENTS_PER_TRAN + 1) * sizeof(int)); + } + if (current_tran_statements > MAX_STATEMENTS_PER_TRAN) + statements_per_transaction[MAX_STATEMENTS_PER_TRAN]++; + else + statements_per_transaction[current_tran_statements]++; + current_tran_statements = 0; + if (node_count > 0 && node_count <= NumDataNodes) + { + if (!nodes_per_transaction) + { + nodes_per_transaction = (int *) malloc(NumDataNodes * sizeof(int)); + memset(nodes_per_transaction, 0, NumDataNodes * sizeof(int)); + } + nodes_per_transaction[node_count - 1]++; + } +} + + +/* + * To collect statistics: count a two-phase commit on nodes + */ +static void +stat_2pc() +{ + if (autocommit) + autocommit_2pc++; + else + nonautocommit_2pc++; +} + + +/* + * Output collected statistics to the log + */ +static void +stat_log() +{ + elog(DEBUG1, "Total Transactions: %d Total Statements: %d", total_transactions, total_statements); + elog(DEBUG1, "Autocommit: %d 2PC for Autocommit: %d 2PC for non-Autocommit: %d", + total_autocommit, autocommit_2pc, nonautocommit_2pc); + if (total_transactions) + { + if (statements_per_transaction) + { + int i; + + for (i = 0; i < MAX_STATEMENTS_PER_TRAN; i++) + elog(DEBUG1, "%d Statements per Transaction: %d (%d%%)", + i, statements_per_transaction[i], statements_per_transaction[i] * 100 / total_transactions); + } + elog(DEBUG1, "%d+ Statements per Transaction: %d (%d%%)", + MAX_STATEMENTS_PER_TRAN, statements_per_transaction[MAX_STATEMENTS_PER_TRAN], statements_per_transaction[MAX_STATEMENTS_PER_TRAN] * 100 / total_transactions); + if (nodes_per_transaction) + { + int i; + + for (i = 0; i < NumDataNodes; i++) + elog(DEBUG1, "%d Nodes per Transaction: %d (%d%%)", + i + 1, nodes_per_transaction[i], nodes_per_transaction[i] * 100 / total_transactions); + } + } +} + +/* + * Allocate and initialize memory to store 
DataNode handles. + */ +void +InitMultinodeExecutor() +{ + int i; + + /* This function could get called multiple times because of sigjmp */ + if (handles != NULL) + return; + + /* + * Should be in TopMemoryContext. + * Assume the caller takes care of context switching + */ + handles = (DataNodeHandle *) palloc(NumDataNodes * sizeof(DataNodeHandle)); + if (!handles) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + /* initialize storage then */ + for (i = 0; i < NumDataNodes; i++) + { + /* + * Socket descriptor is small non-negative integer, + * Indicate the handle is not initialized yet + */ + handles[i].sock = NO_SOCKET; + + /* Initialise buffers */ + handles[i].error = NULL; + handles[i].outSize = 16 * 1024; + handles[i].outBuffer = (char *) palloc(handles[i].outSize); + handles[i].inSize = 16 * 1024; + handles[i].inBuffer = (char *) palloc(handles[i].inSize); + + if (handles[i].outBuffer == NULL || handles[i].inBuffer == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + } + + node_count = 0; +} + +/* + * Builds up a connection string + */ +char * +DataNodeConnStr(char *host, char *port, char *dbname, + char *user, char *password) +{ + char *out, + connstr[256]; + int num; + + /* Build up connection string */ + num = snprintf(connstr, sizeof(connstr), + "host=%s port=%s dbname=%s user=%s password=%s", + host, port, dbname, user, password); + + /* Check for overflow */ + if (num > 0 && num < sizeof(connstr)) + { + /* Output result */ + out = (char *) palloc(num + 1); + strcpy(out, connstr); + return out; + } + + /* return NULL if we have problem */ + return NULL; +} + + +/* + * Connect to a Data Node using a connection string + */ +NODE_CONNECTION * +DataNodeConnect(char *connstr) +{ + PGconn *conn; + + /* Delegate call to the pglib */ + conn = PQconnectdb(connstr); + return (NODE_CONNECTION *) conn; +} + + +/* + * Close specified connection + */ +void 
+DataNodeClose(NODE_CONNECTION * conn) +{ + /* Delegate call to the pglib */ + PQfinish((PGconn *) conn); +} + + +/* + * Checks if connection active + */ +int +DataNodeConnected(NODE_CONNECTION * conn) +{ + /* Delegate call to the pglib */ + PGconn *pgconn = (PGconn *) conn; + + /* + * Simple check, want to do more comprehencive - + * check if it is ready for guery + */ + return pgconn && PQstatus(pgconn) == CONNECTION_OK; +} + + + +/* Close the socket handle (this process' copy) and free occupied memory + * + * Note that we do not free the handle and its members. This will be + * taken care of when the transaction ends, when TopTransactionContext + * is destroyed in xact.c. + */ +static void +data_node_free(DataNodeHandle * handle) +{ + close(handle->sock); + handle->sock = NO_SOCKET; +} + + +/* + * Create and initialise internal structure to communicate to + * Data Node via supplied socket descriptor. + * Structure stores state info and I/O buffers + */ +static void +data_node_init(DataNodeHandle * handle, int sock) +{ + handle->sock = sock; + handle->transaction_status = 'I'; + handle->state = DN_CONNECTION_STATE_IDLE; + handle->error = NULL; + handle->outEnd = 0; + handle->inStart = 0; + handle->inEnd = 0; + handle->inCursor = 0; +} + + +/* + * Handle responses from the Data node connections + */ +static int +data_node_receive_responses(int conn_count, DataNodeHandle ** connections, + struct timeval * timeout, ResponseCombiner combiner) +{ + int result = 0; + int retry_count; + bool timed_out = false; + bool inErrorState = false; + + int count = conn_count; + DataNodeHandle *to_receive[conn_count]; + + /* make a copy of the pointers to the connections */ + memcpy(to_receive, connections, conn_count * sizeof(DataNodeHandle *)); + + /* + * Read results. + * Note we try and read from data node connections even if there is an error on one, + * so as to avoid reading incorrect results on the next statement. 
+ * It might be better to just destroy these connections and tell the pool manager. + */ + while (count > 0) + { + int i, + res_select, + nfds = 0; + fd_set readfds; + + FD_ZERO(&readfds); + for (i = 0; i < count; i++) + { + /* note if a connection has error */ + if (!to_receive[i] + || to_receive[i]->state == DN_CONNECTION_STATE_ERROR + || to_receive[i]->sock >= 1024) + { + result = EOF; + + /* Handling is done, do not track this connection */ + count--; + + /* Move last connection in its place */ + if (i < count) + { + to_receive[i] = to_receive[count]; + /* stay on the current position */ + i--; + } + continue; + } + + /* prepare select params */ + if (nfds < to_receive[i]->sock) + nfds = to_receive[i]->sock; + + FD_SET (to_receive[i]->sock, &readfds); + } + + /* Make sure we still have valid connections */ + if (count == 0) + break; + + retry_count = 0; +retry: + res_select = select(nfds + 1, &readfds, NULL, NULL, timeout); + if (res_select < 0) + { + /* error - retry if EINTR or EAGAIN */ + if (errno == EINTR || errno == EAGAIN) + goto retry; + + /* + * PGXCTODO - we may want to close the connections and notify the + * pooler that these are invalid. 
+ */ + if (errno == EBADF) + { + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("select() bad file descriptor set"))); + return EOF; + } + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("select() error: %d", errno))); + return EOF; + } + + if (res_select == 0) + { + /* Handle timeout */ + result = EOF; + timed_out = true; + } + + /* read data */ + for (i = 0; i < count; i++) + { + DataNodeHandle *conn = to_receive[i]; + + if (FD_ISSET(conn->sock, &readfds)) + { + int read_status = data_node_read_data(conn); + + if (read_status == EOF || read_status < 0) + { + count--; + /* Move last connection in place */ + if (i < count) + { + to_receive[i] = to_receive[count]; + /* stay on the current position */ + i--; + } + + inErrorState = true; + result = EOF; + continue; + } + } + + if (conn->inStart < conn->inEnd) + { + if (handle_response(conn, combiner, inErrorState) == 0) + { + /* Handling is done, do not track this connection */ + count--; + /* Move last connection in place */ + if (i < count) + { + to_receive[i] = to_receive[count]; + /* stay on the current position */ + i--; + } + } + + /* + * See if we flagged an error on connection. 
Note, if + * handle_response was not 0 above, an error occurred, we + * still need to consume the ReadyForQuery message + */ + if (conn->state == DN_CONNECTION_STATE_ERROR) + { + inErrorState = true; + result = EOF; + } + } + } + } + + return result; +} + +/* + * Read up incoming messages from the Data ndoe connection + */ +static int +data_node_read_data(DataNodeHandle * conn) +{ + int someread = 0; + int nread; + + if (conn->sock < 0) + { + add_error_message(conn, "bad socket"); + return EOF; + } + + /* Left-justify any data in the buffer to make room */ + if (conn->inStart < conn->inEnd) + { + if (conn->inStart > 0) + { + memmove(conn->inBuffer, conn->inBuffer + conn->inStart, + conn->inEnd - conn->inStart); + conn->inEnd -= conn->inStart; + conn->inCursor -= conn->inStart; + conn->inStart = 0; + } + } + else + { + /* buffer is logically empty, reset it */ + conn->inStart = conn->inCursor = conn->inEnd = 0; + } + + /* + * If the buffer is fairly full, enlarge it. We need to be able to enlarge + * the buffer in case a single message exceeds the initial buffer size. We + * enlarge before filling the buffer entirely so as to avoid asking the + * kernel for a partial packet. The magic constant here should be large + * enough for a TCP packet or Unix pipe bufferload. 8K is the usual pipe + * buffer size, so... 
+ */ + if (conn->inSize - conn->inEnd < 8192) + { + if (ensure_in_buffer_capacity(conn->inEnd + (size_t) 8192, conn) != 0) + { + /* + * We don't insist that the enlarge worked, but we need some room + */ + if (conn->inSize - conn->inEnd < 100) + { + add_error_message(conn, "can not allocate buffer"); + return -1; + } + } + } + +retry: + nread = recv(conn->sock, conn->inBuffer + conn->inEnd, + conn->inSize - conn->inEnd, 0); + + if (nread < 0) + { + elog(DEBUG1, "dnrd errno = %d", errno); + if (errno == EINTR) + goto retry; + /* Some systems return EAGAIN/EWOULDBLOCK for no data */ +#ifdef EAGAIN + if (errno == EAGAIN) + return someread; +#endif +#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN)) + if (errno == EWOULDBLOCK) + return someread; +#endif + /* We might get ECONNRESET here if using TCP and backend died */ +#ifdef ECONNRESET + if (errno == ECONNRESET) + { + /* + * OK, we are getting a zero read even though select() says ready. This + * means the connection has been closed. Cope. + */ + add_error_message(conn, + "data node closed the connection unexpectedly\n" + "\tThis probably means the data node terminated abnormally\n" + "\tbefore or while processing the request.\n"); + conn->state = DN_CONNECTION_STATE_ERROR; /* No more connection to + * backend */ + closesocket(conn->sock); + conn->sock = NO_SOCKET; + + return -1; + } +#endif + add_error_message(conn, "could not receive data from server"); + return -1; + + } + + if (nread > 0) + { + conn->inEnd += nread; + + /* + * Hack to deal with the fact that some kernels will only give us back + * 1 packet per recv() call, even if we asked for more and there is + * more available. If it looks like we are reading a long message, + * loop back to recv() again immediately, until we run out of data or + * buffer space. Without this, the block-and-restart behavior of + * libpq's higher levels leads to O(N^2) performance on long messages. 
+ * + * Since we left-justified the data above, conn->inEnd gives the + * amount of data already read in the current message. We consider + * the message "long" once we have acquired 32k ... + */ + if (conn->inEnd > 32768 && + (conn->inSize - conn->inEnd) >= 8192) + { + someread = 1; + goto retry; + } + return 1; + } + + if (nread == 0) + { + elog(DEBUG1, "nread returned 0"); + return EOF; + } + + if (someread) + return 1; /* got a zero read after successful tries */ + + return 0; +} + +/* + * Get one character from the connection buffer and advance cursor + */ +static int +get_char(DataNodeHandle * conn, char *out) +{ + if (conn->inCursor < conn->inEnd) + { + *out = conn->inBuffer[conn->inCursor++]; + return 0; + } + return EOF; +} + +/* + * Read an integer from the connection buffer and advance cursor + */ +static int +get_int(DataNodeHandle * conn, size_t len, int *out) +{ + unsigned short tmp2; + unsigned int tmp4; + + if (conn->inCursor + len > conn->inEnd) + return EOF; + + switch (len) + { + case 2: + memcpy(&tmp2, conn->inBuffer + conn->inCursor, 2); + conn->inCursor += 2; + *out = (int) ntohs(tmp2); + break; + case 4: + memcpy(&tmp4, conn->inBuffer + conn->inCursor, 4); + conn->inCursor += 4; + *out = (int) ntohl(tmp4); + break; + default: + add_error_message(conn, "not supported int size"); + return EOF; + } + + return 0; +} + +/* + * Read next message from the connection and update the combiner accordingly + * If we are in an error state we just consume the messages, and do not proxy + * Long term, we should look into cancelling executing statements + * and closing the connections. 
+ */ +static int +handle_response(DataNodeHandle * conn, ResponseCombiner combiner, bool inErrorState) +{ + char msg_type; + int msg_len; + bool connError = false; + + for (;;) + { + /* try to read the message, return if not enough data */ + conn->inCursor = conn->inStart; + if (conn->inEnd - conn->inCursor < 5) + return EOF; + + if (get_char(conn, &msg_type)) + return EOF; + + if (get_int(conn, 4, &msg_len)) + return EOF; + + msg_len -= 4; + + if (conn->inEnd - conn->inCursor < msg_len) + { + ensure_in_buffer_capacity(conn->inCursor + (size_t) msg_len, conn); + return EOF; + } + + /* TODO handle other possible responses */ + switch (msg_type) + { + case 'C': /* CommandComplete */ + /* no need to parse, just move cursor */ + conn->inCursor += msg_len; + conn->state = DN_CONNECTION_STATE_COMPLETED; + if (!inErrorState) + CombineResponse(combiner, msg_type, + conn->inBuffer + conn->inStart + 5, + conn->inCursor - conn->inStart - 5); + + break; + case 'T': /* RowDescription */ + case 'G': /* CopyInResponse */ + case 'H': /* CopyOutResponse */ + case 'D': /* DataRow */ + /* no need to parse, just move cursor */ + conn->inCursor += msg_len; + if (!inErrorState) + CombineResponse(combiner, msg_type, + conn->inBuffer + conn->inStart + 5, + conn->inCursor - conn->inStart - 5); + break; + case 'E': /* ErrorResponse */ + /* no need to parse, just move cursor */ + conn->inCursor += msg_len; + if (!inErrorState) + CombineResponse(combiner, msg_type, + conn->inBuffer + conn->inStart + 5, + conn->inCursor - conn->inStart - 5); + conn->inStart = conn->inCursor; + connError = inErrorState = true; + /* conn->state = DN_CONNECTION_STATE_ERROR; */ + + /* + * Do not return with an error, we still need to consume Z, + * ready-for-query + */ + break; + case 'A': /* NotificationResponse */ + case 'N': /* NoticeResponse */ + conn->inCursor += msg_len; + + /* + * Ignore these to prevent multiple messages, one from each + * node. 
Coordinator will send one for DDL anyway + */ + break; + case 'Z': /* ReadyForQuery */ + get_char(conn, &conn->transaction_status); + conn->state = DN_CONNECTION_STATE_IDLE; + conn->inStart = conn->inCursor; + /* Now it is ok to flag the connection as having an error */ + if (connError) + { + conn->state = DN_CONNECTION_STATE_ERROR; + return EOF; + } + return 0; + case 'I': /* EmptyQuery */ + default: + /* sync lost? */ + conn->state = DN_CONNECTION_STATE_ERROR; + inErrorState = true; + return EOF; + } + conn->inStart = conn->inCursor; + + } + return EOF; +} + + +/* + * Send BEGIN command to the Data nodes and receive responses + */ +static int +data_node_begin(int conn_count, DataNodeHandle ** connections, ResponseCombiner combiner, GlobalTransactionId gxid) +{ + int i; + struct timeval *timeout = NULL; + + /* Send BEGIN */ + for (i = 0; i < conn_count; i++) + { + if (GlobalTransactionIdIsValid(gxid) && data_node_send_gxid(connections[i], gxid)) + return EOF; + + if (data_node_send_query(connections[i], "BEGIN")) + return EOF; + } + + /* Receive responses */ + if (data_node_receive_responses(conn_count, connections, timeout, combiner)) + return EOF; + + /* Verify status? 
*/ + + return 0; +} + + +/* Clears the write node list */ +static void +clear_write_node_list() +{ + /* we just malloc once and use counter */ + if (write_node_list == NULL) + { + write_node_list = (DataNodeHandle **) malloc(NumDataNodes * sizeof(DataNodeHandle *)); + } + write_node_count = 0; +} + + +/* + * Switch autocommmit mode off, so all subsequent statements will be in the same transaction + */ +void +DataNodeBegin(void) +{ + autocommit = false; + clear_write_node_list(); +} + + +/* + * Commit current transaction, use two-phase commit if necessary + */ +int +DataNodeCommit(CommandDest dest) +{ + int res; + int tran_count; + DataNodeHandle *connections[node_count]; + ResponseCombiner combiner; + + /* Quick check to make sure we have connections */ + if (node_count == 0) + goto finish; + + /* gather connections to commit */ + tran_count = get_transaction_nodes(connections); + + /* + * If we do not have open transactions we have nothing to commit, just + * report success + */ + if (tran_count == 0) + goto finish; + + combiner = CreateResponseCombiner(tran_count, + COMBINE_TYPE_NONE, dest); + res = data_node_commit(tran_count, connections, combiner); + if (!ValidateAndCloseCombiner(combiner) || res) + return EOF; + +finish: + /* In autocommit mode statistics is collected in DataNodeExec */ + if (!autocommit) + stat_transaction(node_count); + if (!PersistentConnections) + release_handles(); + autocommit = true; + clear_write_node_list(); + return 0; +} + + +/* + * Send COMMIT or PREPARE/COMMIT PREPARED down to the Data nodes and handle responses + */ +static int +data_node_commit(int conn_count, DataNodeHandle ** connections, ResponseCombiner combiner) +{ + int i; + struct timeval *timeout = NULL; + char buffer[256]; + GlobalTransactionId gxid = InvalidGlobalTransactionId; + int result = 0; + + + /* can set this to false to disable temporarily */ + /* bool do2PC = conn_count > 1; */ + + /* + * Only use 2PC if more than one node was written to. 
Otherwise, just send + * COMMIT to all + */ + bool do2PC = write_node_count > 1; + + /* Extra XID for Two Phase Commit */ + GlobalTransactionId two_phase_xid = 0; + + if (do2PC) + { + stat_2pc(); + + /* + * Formally we should be using GetCurrentGlobalTransactionIdIfAny() here, + * but since we need 2pc, we surely have sent down a command and got + * gxid for it. Hence GetCurrentGlobalTransactionId() just returns + * already allocated gxid + */ +/* #ifdef PGXC_COORD */ + gxid = GetCurrentGlobalTransactionId(); +/* #endif */ + + sprintf(buffer, "PREPARE TRANSACTION 'T%d'", gxid); + /* Send PREPARE */ + for (i = 0; i < conn_count; i++) + { + if (data_node_send_query(connections[i], buffer)) + return EOF; + } + + /* Receive responses */ + if (data_node_receive_responses(conn_count, connections, timeout, combiner)) + return EOF; + + /* Reset combiner */ + if (!ValidateAndResetCombiner(combiner)) + return EOF; + } + + if (!do2PC) + strcpy(buffer, "COMMIT"); + else + { + sprintf(buffer, "COMMIT PREPARED 'T%d'", gxid); + + /* We need to use a new xid, the data nodes have reset */ + two_phase_xid = BeginTranGTM(); + for (i = 0; i < conn_count; i++) + { + if (data_node_send_gxid(connections[i], two_phase_xid)) + { + add_error_message(connections[i], "Can not send request"); + result = EOF; + goto finish; + } + } + } + + /* Send COMMIT */ + for (i = 0; i < conn_count; i++) + { + if (data_node_send_query(connections[i], buffer)) + { + result = EOF; + goto finish; + } + } + + /* Receive responses */ + if (data_node_receive_responses(conn_count, connections, timeout, combiner)) + result = EOF; + +finish: + if (do2PC) + CommitTranGTM((GlobalTransactionId) two_phase_xid); + + return result; +} + + +/* + * Rollback current transaction + */ +int +DataNodeRollback(CommandDest dest) +{ + int res = 0; + int tran_count; + DataNodeHandle *connections[node_count]; + ResponseCombiner combiner; + int i; + + /* Quick check to make sure we have connections */ + if (node_count == 0) + goto 
finish; + + /* gather connections to rollback */ + tran_count = get_transaction_nodes(connections); + + /* + * If we do not have open transactions we have nothing to rollback just + * report success + */ + if (tran_count == 0) + goto finish; + + combiner = CreateResponseCombiner(tran_count, + COMBINE_TYPE_NONE, dest); + res = data_node_rollback(tran_count, connections, combiner); + + /* Assume connection got cleaned up. Reset so we can reuse without error. */ + for (i = 0; i < tran_count; i++) + { + connections[i]->transaction_status = 'I'; + connections[i]->state = DN_CONNECTION_STATE_IDLE; + } + + if (!ValidateAndCloseCombiner(combiner) || res) + res = EOF; + +finish: + /* In autocommit mode statistics is collected in DataNodeExec */ + if (!autocommit) + stat_transaction(node_count); + if (!PersistentConnections) + release_handles(); + autocommit = true; + clear_write_node_list(); + return res; +} + + +/* Release all data node connections back to pool and release occupied memory */ +static void +release_handles(void) +{ + int i; + + if (node_count == 0) + return; + + PoolManagerReleaseConnections(); + for (i = 0; i < NumDataNodes; i++) + { + DataNodeHandle *handle = &handles[i]; + + if (handle->sock != NO_SOCKET) + data_node_free(handle); + } + + node_count = 0; +} + + +/* + * Send ROLLBACK command down to the Data nodes and handle responses + */ +static int +data_node_rollback(int conn_count, DataNodeHandle ** connections, ResponseCombiner combiner) +{ + int i; + struct timeval *timeout = NULL; + int result = 0; + + /* Send ROLLBACK - */ + for (i = 0; i < conn_count; i++) + { + if (data_node_send_query(connections[i], "ROLLBACK")) + result = EOF; + } + + /* Receive responses */ + if (data_node_receive_responses(conn_count, connections, timeout, combiner)) + return EOF; + + /* Verify status? 
*/ + return 0; +} + + +/* + * Execute specified statement on specified Data nodes, combine responses and + * send results back to the client + */ +int +DataNodeExec(const char *query, List *nodelist, CommandDest dest, Snapshot snapshot, + bool force_autocommit, List *simple_aggregates, bool is_read_only) +{ + int i; + int j; + int conn_count = list_length(nodelist) == 0 ? NumDataNodes : list_length(nodelist); + struct timeval *timeout = NULL; /* wait forever */ + ResponseCombiner combiner; + int res; + int newCount = 0; + bool need_tran; + bool found; + GlobalTransactionId gxid = InvalidGlobalTransactionId; + DataNodeHandle *newConnections[conn_count]; + DataNodeHandle **connections; + + if (conn_count == 0) + return EOF; + + connections = get_handles(nodelist); + if (!connections) + return EOF; + + if (force_autocommit) + need_tran = false; + else + need_tran = !autocommit || conn_count > 1; + + elog(DEBUG1, "autocommit = %s, conn_count = %d, need_tran = %s", autocommit ? "true" : "false", conn_count, need_tran ? "true" : "false"); + + stat_statement(); + if (autocommit) + stat_transaction(conn_count); + + /* We normally clear for transactions, but if autocommit, clear here, too */ + if (autocommit == true) + { + clear_write_node_list(); + } + + /* Check status of connections */ + + /* + * We want to track new "write" nodes, and new nodes in the current + * transaction whether or not they are write nodes. 
+ */ + if (!is_read_only && write_node_count < NumDataNodes) + { + for (i = 0; i < conn_count; i++) + { + found = false; + for (j = 0; j < write_node_count && !found; j++) + { + if (write_node_list[j] == connections[i]) + found = true; + } + if (!found) + { + /* Add to transaction wide-list */ + write_node_list[write_node_count++] = connections[i]; + /* Add to current statement list */ + newConnections[newCount++] = connections[i]; + } + } + /* Check connection state is DN_CONNECTION_STATE_IDLE */ + } + + gxid = GetCurrentGlobalTransactionId(); + + if (!GlobalTransactionIdIsValid(gxid)) + { + pfree(connections); + return EOF; + } + if (newCount > 0 && need_tran) + { + combiner = CreateResponseCombiner(newCount, COMBINE_TYPE_NONE, DestNone); + + /* Start transaction on connections where it is not started */ + res = data_node_begin(newCount, newConnections, combiner, gxid); + if (!ValidateAndCloseCombiner(combiner) || res) + { + pfree(connections); + return EOF; + } + } + + /* Send query to nodes */ + for (i = 0; i < conn_count; i++) + { + /* If explicit transaction is needed gxid is already sent */ + if (!need_tran && data_node_send_gxid(connections[i], gxid)) + { + add_error_message(connections[i], "Can not send request"); + pfree(connections); + return EOF; + } + if (snapshot && data_node_send_snapshot(connections[i], snapshot)) + { + add_error_message(connections[i], "Can not send request"); + pfree(connections); + return EOF; + } + if (data_node_send_query(connections[i], query) != 0) + { + add_error_message(connections[i], "Can not send request"); + pfree(connections); + return EOF; + } + } + + combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_SUM, dest); + AssignCombinerAggregates(combiner, simple_aggregates); + + /* Receive responses */ + res = data_node_receive_responses(conn_count, connections, timeout, combiner); + if (!ValidateAndCloseCombiner(combiner) || res) + { + if (autocommit) + { + if (need_tran) + DataNodeRollback(DestNone); + else if 
(!PersistentConnections) + release_handles(); + } + + pfree(connections); + return EOF; + } + + if (autocommit) + { + if (need_tran) + DataNodeCommit(DestNone); /* PGXCTODO - call CommitTransaction() + * instead? */ + else if (!PersistentConnections) + release_handles(); + } + + /* Verify status? */ + pfree(connections); + return 0; +} + + +/* + * Ensure specified amount of data can fit to the incoming buffer and + * increase it if necessary + */ +static int +ensure_in_buffer_capacity(size_t bytes_needed, DataNodeHandle * handle) +{ + int newsize = handle->inSize; + char *newbuf; + + if (bytes_needed <= (size_t) newsize) + return 0; + + do + { + newsize *= 2; + } while (newsize > 0 && bytes_needed > (size_t) newsize); + + if (newsize > 0 && bytes_needed <= (size_t) newsize) + { + newbuf = repalloc(handle->inBuffer, newsize); + if (newbuf) + { + /* repalloc succeeded */ + handle->inBuffer = newbuf; + handle->inSize = newsize; + return 0; + } + } + + newsize = handle->inSize; + do + { + newsize += 8192; + } while (newsize > 0 && bytes_needed > (size_t) newsize); + + if (newsize > 0 && bytes_needed <= (size_t) newsize) + { + newbuf = repalloc(handle->inBuffer, newsize); + if (newbuf) + { + /* repalloc succeeded */ + handle->inBuffer = newbuf; + handle->inSize = newsize; + return 0; + } + } + + return EOF; +} + + +/* + * Ensure specified amount of data can fit to the outgoing buffer and + * increase it if necessary + */ +static int +ensure_out_buffer_capacity(size_t bytes_needed, DataNodeHandle * handle) +{ + int newsize = handle->outSize; + char *newbuf; + + if (bytes_needed <= (size_t) newsize) + return 0; + + do + { + newsize *= 2; + } while (newsize > 0 && bytes_needed > (size_t) newsize); + + if (newsize > 0 && bytes_needed <= (size_t) newsize) + { + newbuf = repalloc(handle->outBuffer, newsize); + if (newbuf) + { + /* repalloc succeeded */ + handle->outBuffer = newbuf; + handle->outSize = newsize; + return 0; + } + } + + newsize = handle->outSize; + do + { + 
newsize += 8192; + } while (newsize > 0 && bytes_needed > (size_t) newsize); + + if (newsize > 0 && bytes_needed <= (size_t) newsize) + { + newbuf = repalloc(handle->outBuffer, newsize); + if (newbuf) + { + /* repalloc succeeded */ + handle->outBuffer = newbuf; + handle->outSize = newsize; + return 0; + } + } + + return EOF; +} + + +/* + * Send specified amount of data from the outgoing buffer over the connection + */ +static int +send_some(DataNodeHandle * handle, int len) +{ + char *ptr = handle->outBuffer; + int remaining = handle->outEnd; + int result = 0; + + /* while there's still data to send */ + while (len > 0) + { + int sent; + +#ifndef WIN32 + sent = send(handle->sock, ptr, len, 0); +#else + /* + * Windows can fail on large sends, per KB article Q201213. The failure-point + * appears to be different in different versions of Windows, but 64k should + * always be safe. + */ + sent = send(handle->sock, ptr, Min(len, 65536), 0); +#endif + + if (sent < 0) + { + /* + * Anything except EAGAIN/EWOULDBLOCK/EINTR is trouble. If it's + * EPIPE or ECONNRESET, assume we've lost the backend connection + * permanently. + */ + switch (errno) + { +#ifdef EAGAIN + case EAGAIN: + break; +#endif +#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN)) + case EWOULDBLOCK: + break; +#endif + case EINTR: + continue; + + case EPIPE: +#ifdef ECONNRESET + case ECONNRESET: +#endif + add_error_message(handle, "server closed the connection unexpectedly\n" + "\tThis probably means the server terminated abnormally\n" + "\tbefore or while processing the request.\n"); + + /* + * We used to close the socket here, but that's a bad idea + * since there might be unread data waiting (typically, a + * NOTICE message from the backend telling us it's + * committing hara-kiri...). Leave the socket open until + * pqReadData finds no more data can be read. But abandon + * attempt to send data. 
+ */ + handle->outEnd = 0; + return -1; + + default: + add_error_message(handle, "could not send data to server"); + /* We don't assume it's a fatal error... */ + handle->outEnd = 0; + return -1; + } + } + else + { + ptr += sent; + len -= sent; + remaining -= sent; + } + + if (len > 0) + { + /* + * We did not send it all + * return 1 to indicate that data is still pending. + */ + result = 1; + break; + } + } + + /* shift the remaining contents of the buffer */ + if (remaining > 0) + memmove(handle->outBuffer, ptr, remaining); + handle->outEnd = remaining; + + return result; +} + + +/* + * Send specified statement down to the Data node + */ +static int +data_node_send_query(DataNodeHandle * handle, const char *query) +{ + int strLen = strlen(query) + 1; + + /* size + strlen */ + int msgLen = 4 + strLen; + + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) + { + add_error_message(handle, "out of memory"); + return EOF; + } + + handle->outBuffer[handle->outEnd++] = 'Q'; + msgLen = htonl(msgLen); + memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4); + handle->outEnd += 4; + memcpy(handle->outBuffer + handle->outEnd, query, strLen); + handle->outEnd += strLen; + + /* We need response right away, so send immediately */ + if (send_some(handle, handle->outEnd) < 0) + return EOF; + + handle->state = DN_CONNECTION_STATE_BUSY; + + return 0; +} + + +/* + * Send the GXID down to the Data node + */ +static int +data_node_send_gxid(DataNodeHandle * handle, GlobalTransactionId gxid) +{ + int msglen = 8; + int i32; + + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0) + { + add_error_message(handle, "out of memory"); + return EOF; + } + + handle->outBuffer[handle->outEnd++] = 'g'; + msglen = htonl(msglen); + memcpy(handle->outBuffer + handle->outEnd, &msglen, 4); + handle->outEnd += 4; + i32 = htonl(gxid); + memcpy(handle->outBuffer + handle->outEnd, &i32, 4); + handle->outEnd += 
4; + + return 0; +} + + +/* + * Send the snapshot down to the Data node + */ +static int +data_node_send_snapshot(DataNodeHandle * handle, Snapshot snapshot) +{ + int msglen; + int nval; + int i; + + /* calculate message length */ + msglen = 20; + if (snapshot->xcnt > 0) + msglen += snapshot->xcnt * 4; + + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0) + { + add_error_message(handle, "out of memory"); + return EOF; + } + + handle->outBuffer[handle->outEnd++] = 's'; + msglen = htonl(msglen); + memcpy(handle->outBuffer + handle->outEnd, &msglen, 4); + handle->outEnd += 4; + + nval = htonl(snapshot->xmin); + memcpy(handle->outBuffer + handle->outEnd, &nval, 4); + handle->outEnd += 4; + + nval = htonl(snapshot->xmax); + memcpy(handle->outBuffer + handle->outEnd, &nval, 4); + handle->outEnd += 4; + + nval = htonl(snapshot->recent_global_xmin); + memcpy(handle->outBuffer + handle->outEnd, &nval, 4); + handle->outEnd += 4; + + nval = htonl(snapshot->xcnt); + memcpy(handle->outBuffer + handle->outEnd, &nval, 4); + handle->outEnd += 4; + + for (i = 0; i < snapshot->xcnt; i++) + { + nval = htonl(snapshot->xip[i]); + memcpy(handle->outBuffer + handle->outEnd, &nval, 4); + handle->outEnd += 4; + } + + return 0; +} + +/* + * Add another message to the list of errors to be returned back to the client + * at the convenient time + */ +static void +add_error_message(DataNodeHandle * handle, const char *message) +{ + handle->transaction_status = 'E'; + handle->state = DN_CONNECTION_STATE_ERROR; + if (handle->error) + { + /* PGXCTODO append */ + } + else + { + handle->error = pstrdup(message); + } +} + +/* + * for specified list return array of DataNodeHandles + * acquire from pool if needed. + * the lenth of returned array is the same as of nodelist + * Special case is empty or NIL nodeList, in this case return all the nodes. + * The returned list should be pfree'd when no longer needed. 
+ */ +static DataNodeHandle ** +get_handles(List *nodelist) +{ + DataNodeHandle **result; + ListCell *node_list_item; + List *allocate = NIL; + + /* index of the result array */ + int i = 0; + + /* If node list is empty execute request on current nodes */ + if (list_length(nodelist) == 0) + { + /* + * We do not have to zero the array - on success all items will be set + * to correct pointers, on error the array will be freed + */ + result = (DataNodeHandle **) palloc(NumDataNodes * sizeof(DataNodeHandle *)); + if (!result) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + for (i = 0; i < NumDataNodes; i++) + { + result[i] = &handles[i]; + if (handles[i].sock == NO_SOCKET) + allocate = lappend_int(allocate, i + 1); + } + } + else + { + /* + * We do not have to zero the array - on success all items will be set + * to correct pointers, on error the array will be freed + */ + result = (DataNodeHandle **) palloc(list_length(nodelist) * sizeof(DataNodeHandle *)); + if (!result) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + i = 0; + foreach(node_list_item, nodelist) + { + int node = node_list_item->data.int_value; + + if (node > NumDataNodes || node <= 0) + elog(ERROR, "Node number: %d passed is not a known node", node); + result[i++] = &handles[node - 1]; + if (handles[node - 1].sock == NO_SOCKET) + allocate = lappend_int(allocate, node); + } + } + + if (allocate) + { + int j = 0; + int *fds = PoolManagerGetConnections(allocate); + + if (!fds) + { + pfree(result); + list_free(allocate); + return NULL; + } + foreach(node_list_item, allocate) + { + int node = node_list_item->data.int_value; + int fdsock = fds[j++]; + + data_node_init(&handles[node - 1], fdsock); + node_count++; + } + pfree(fds); + list_free(allocate); + } + + return result; +} + + +/* + * Return handles involved into current transaction, to run commit or rollback + * on them, as requested. 
+ * Transaction is not started on nodes when read-only statement is executed + * on it, so we do not have to commit or rollback on those nodes. + * Parameter should point to array able to store at least node_count pointers + * to a DataNodeHandle structure. + * The function returns number of pointers written to the connections array. + * Remaining items in the array, if any, will be kept unchanged + */ +static int +get_transaction_nodes(DataNodeHandle ** connections) +{ + int tran_count = 0; + int i; + + if (node_count) + { + for (i = 0; i < NumDataNodes; i++) + { + if (handles[i].sock != NO_SOCKET && handles[i].transaction_status != 'I') + connections[tran_count++] = &handles[i]; + } + } + + return tran_count; +} + + +/* + * Called when the backend is ending. + */ +void +DataNodeCleanAndRelease(int code, Datum arg) +{ + /* Rollback on Data Nodes */ + if (IsTransactionState()) + { + DataNodeRollback(DestNone); + + /* Rollback on GTM if transaction id opened. */ + RollbackTranGTM((GlobalTransactionId) GetCurrentTransactionIdIfAny()); + } + + /* Release data node connections */ + release_handles(); + + /* Close connection with GTM */ + CloseGTM(); + + /* Dump collected statistics to the log */ + stat_log(); +} diff --git a/src/backend/pgxc/pool/poolcomm.c b/src/backend/pgxc/pool/poolcomm.c new file mode 100644 index 0000000000..03b785f954 --- /dev/null +++ b/src/backend/pgxc/pool/poolcomm.c @@ -0,0 +1,614 @@ +/*------------------------------------------------------------------------- + * + * poolcomm.c + * + * Communication functions between the pool manager and session + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + *------------------------------------------------------------------------- + */ + +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/uio.h> +#include <sys/stat.h> +#include <sys/un.h> +#include <stdio.h> +#include <string.h> 
+#include <unistd.h> +#include <errno.h> +#include <stddef.h> +#include "c.h" +#include "pgxc/poolcomm.h" +#include "utils/elog.h" +#include "miscadmin.h" + +static int pool_recvbuf(PoolPort * port); +static int pool_discardbytes(PoolPort * port, size_t len); + +#ifdef HAVE_UNIX_SOCKETS + +#define POOLER_UNIXSOCK_PATH(path, port, sockdir) \ + snprintf(path, sizeof(path), "%s/.s.PGPOOL.%d", \ + ((sockdir) && *(sockdir) != '\0') ? (sockdir) : \ + DEFAULT_PGSOCKET_DIR, \ + (port)) + +static char sock_path[MAXPGPATH]; + +static int Lock_AF_UNIX(unsigned short port, const char *unixSocketName); +#endif + +/* + * Open server socket on specified port to accept connection from sessions + */ +int +pool_listen(unsigned short port, const char *unixSocketName) +{ + int fd, + len; + struct sockaddr_un unix_addr; + +#ifdef HAVE_UNIX_SOCKETS + if (Lock_AF_UNIX(port, unixSocketName) < 0) + return -1; + + /* create a Unix domain stream socket */ + if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) + return -1; + + /* fill in socket address structure */ + memset(&unix_addr, 0, sizeof(unix_addr)); + unix_addr.sun_family = AF_UNIX; + strcpy(unix_addr.sun_path, sock_path); + len = sizeof(unix_addr.sun_family) + + strlen(unix_addr.sun_path) + 1; + + /* bind the name to the descriptor */ + if (bind(fd, (struct sockaddr *) & unix_addr, len) < 0) + return -1; + + /* tell kernel we're a server */ + if (listen(fd, 5) < 0) + return -1; + + return fd; +#else + /* TODO support for non-unix platform */ + ereport(FATAL, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("pool manager only supports UNIX socket"))); + return -1; +#endif +} + +#ifdef HAVE_UNIX_SOCKETS +static int +Lock_AF_UNIX(unsigned short port, const char *unixSocketName) +{ + POOLER_UNIXSOCK_PATH(sock_path, port, unixSocketName); + + CreateSocketLockFile(sock_path, true); + + unlink(sock_path); + + return 0; +} +#endif + +/* + * Connect to pooler listening on specified port + */ +int +pool_connect(unsigned short port, const char 
*unixSocketName) +{ + int fd, + len; + struct sockaddr_un unix_addr; + +#ifdef HAVE_UNIX_SOCKETS + /* create a Unix domain stream socket */ + if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) + return -1; + + /* fill socket address structure w/server's addr */ + POOLER_UNIXSOCK_PATH(sock_path, port, unixSocketName); + + memset(&unix_addr, 0, sizeof(unix_addr)); + unix_addr.sun_family = AF_UNIX; + strcpy(unix_addr.sun_path, sock_path); + len = sizeof(unix_addr.sun_family) + + strlen(unix_addr.sun_path) + 1; + + if (connect(fd, (struct sockaddr *) & unix_addr, len) < 0) + return -1; + + return fd; +#else + /* TODO support for non-unix platform */ + ereport(FATAL, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("pool manager only supports UNIX socket"))); + return -1; +#endif +} + + +/* + * Get one byte from the buffer, read data from the connection if buffer is empty + */ +int +pool_getbyte(PoolPort * port) +{ + while (port->RecvPointer >= port->RecvLength) + { + if (pool_recvbuf(port)) /* If nothing in buffer, then recv some */ + return EOF; /* Failed to recv data */ + } + return (unsigned char) port->RecvBuffer[port->RecvPointer++]; +} + + +/* + * Get one byte from the buffer if it is not empty + */ +int +pool_pollbyte(PoolPort * port) +{ + if (port->RecvPointer >= port->RecvLength) + { + return EOF; /* Empty buffer */ + } + return (unsigned char) port->RecvBuffer[port->RecvPointer++]; +} + + +/* + * Read pooler protocol message from the buffer. 
+ */ +int +pool_getmessage(PoolPort * port, StringInfo s, int maxlen) +{ + int32 len; + + resetStringInfo(s); + + /* Read message length word */ + if (pool_getbytes(port, (char *) &len, 4) == EOF) + { + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected EOF within message length word"))); + return EOF; + } + + len = ntohl(len); + + if (len < 4 || + (maxlen > 0 && len > maxlen)) + { + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid message length"))); + return EOF; + } + + len -= 4; /* discount length itself */ + + if (len > 0) + { + /* + * Allocate space for message. If we run out of room (ridiculously + * large message), we will elog(ERROR) + */ + PG_TRY(); + { + enlargeStringInfo(s, len); + } + PG_CATCH(); + { + if (pool_discardbytes(port, len) == EOF) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("incomplete message from client"))); + PG_RE_THROW(); + } + PG_END_TRY(); + + /* And grab the message */ + if (pool_getbytes(port, s->data, len) == EOF) + { + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("incomplete message from client"))); + return EOF; + } + s->len = len; + /* Place a trailing null per StringInfo convention */ + s->data[len] = '\0'; + } + + return 0; +} + + +/* -------------------------------- + * pool_getbytes - get a known number of bytes from connection + * + * returns 0 if OK, EOF if trouble + * -------------------------------- + */ +int +pool_getbytes(PoolPort * port, char *s, size_t len) +{ + size_t amount; + + while (len > 0) + { + while (port->RecvPointer >= port->RecvLength) + { + if (pool_recvbuf(port)) /* If nothing in buffer, then recv + * some */ + return EOF; /* Failed to recv data */ + } + amount = port->RecvLength - port->RecvPointer; + if (amount > len) + amount = len; + memcpy(s, port->RecvBuffer + port->RecvPointer, amount); + port->RecvPointer += amount; + s += amount; + len -= amount; + } + return 0; +} + + +/* -------------------------------- 
+ * pool_discardbytes - discard a known number of bytes from connection + * + * returns 0 if OK, EOF if trouble + * -------------------------------- + */ +static int +pool_discardbytes(PoolPort * port, size_t len) +{ + size_t amount; + + while (len > 0) + { + while (port->RecvPointer >= port->RecvLength) + { + if (pool_recvbuf(port)) /* If nothing in buffer, then recv + * some */ + return EOF; /* Failed to recv data */ + } + amount = port->RecvLength - port->RecvPointer; + if (amount > len) + amount = len; + port->RecvPointer += amount; + len -= amount; + } + return 0; +} + + +/* -------------------------------- + * pool_recvbuf - load some bytes into the input buffer + * + * returns 0 if OK, EOF if trouble + * -------------------------------- + */ +static int +pool_recvbuf(PoolPort * port) +{ + if (port->RecvPointer > 0) + { + if (port->RecvLength > port->RecvPointer) + { + /* still some unread data, left-justify it in the buffer */ + memmove(port->RecvBuffer, port->RecvBuffer + port->RecvPointer, + port->RecvLength - port->RecvPointer); + port->RecvLength -= port->RecvPointer; + port->RecvPointer = 0; + } + else + port->RecvLength = port->RecvPointer = 0; + } + + /* Can fill buffer from PqRecvLength and upwards */ + for (;;) + { + int r; + + r = recv(Socket(*port), port->RecvBuffer + port->RecvLength, + POOL_BUFFER_SIZE - port->RecvLength, 0); + + if (r < 0) + { + if (errno == EINTR) + continue; /* Ok if interrupted */ + + /* + * Report broken connection + */ + ereport(LOG, + (errcode_for_socket_access(), + errmsg("could not receive data from client: %m"))); + return EOF; + } + if (r == 0) + { + /* + * EOF detected. We used to write a log message here, but it's + * better to expect the ultimate caller to do that. 
+ */ + return EOF; + } + /* r contains number of bytes read, so just incr length */ + port->RecvLength += r; + return 0; + } +} + + +/* + * Put a known number of bytes into the connection buffer + */ +int +pool_putbytes(PoolPort * port, const char *s, size_t len) +{ + size_t amount; + + while (len > 0) + { + /* If buffer is full, then flush it out */ + if (port->SendPointer >= POOL_BUFFER_SIZE) + if (pool_flush(port)) + return EOF; + amount = POOL_BUFFER_SIZE - port->SendPointer; + if (amount > len) + amount = len; + memcpy(port->SendBuffer + port->SendPointer, s, amount); + port->SendPointer += amount; + s += amount; + len -= amount; + } + return 0; +} + + +/* -------------------------------- + * pool_flush - flush pending output + * + * returns 0 if OK, EOF if trouble + * -------------------------------- + */ +int +pool_flush(PoolPort * port) +{ + static int last_reported_send_errno = 0; + + char *bufptr = port->SendBuffer; + char *bufend = port->SendBuffer + port->SendPointer; + + while (bufptr < bufend) + { + int r; + + r = send(Socket(*port), bufptr, bufend - bufptr, 0); + + if (r <= 0) + { + if (errno == EINTR) + continue; /* Ok if we were interrupted */ + + if (errno != last_reported_send_errno) + { + last_reported_send_errno = errno; + ereport(ERROR, + (errcode_for_socket_access(), + errmsg("could not send data to client: %m"))); + } + + /* + * We drop the buffered data anyway so that processing can + * continue, even though we'll probably quit soon. 
+ */ + port->SendPointer = 0; + return EOF; + } + + last_reported_send_errno = 0; /* reset after any successful send */ + bufptr += r; + } + + port->SendPointer = 0; + return 0; +} + + +/* + * Put the pooler protocol message into the connection buffer + */ +int +pool_putmessage(PoolPort * port, char msgtype, const char *s, size_t len) +{ + uint n32; + + if (pool_putbytes(port, &msgtype, 1)) + return EOF; + + n32 = htonl((uint32) (len + 4)); + if (pool_putbytes(port, (char *) &n32, 4)) + return EOF; + + if (pool_putbytes(port, s, len)) + return EOF; + + return 0; +} + +/* message code('f'), size(8), node_count */ +#define SEND_MSG_BUFFER_SIZE 9 + + +/* + * Build up a message carrying file deskriptors and send them over specified + * connection + */ +int +pool_sendfds(PoolPort * port, int *fds, int count) +{ + struct iovec iov[1]; + struct msghdr msg; + char buf[SEND_MSG_BUFFER_SIZE]; + uint n32; + int controllen = sizeof(struct cmsghdr) + count * sizeof(int); + struct cmsghdr *cmptr = NULL; + + buf[0] = 'f'; + n32 = htonl((uint32) 8); + memcpy(buf + 1, &n32, 4); + n32 = htonl((uint32) count); + memcpy(buf + 5, &n32, 4); + + iov[0].iov_base = buf; + iov[0].iov_len = SEND_MSG_BUFFER_SIZE; + msg.msg_iov = iov; + msg.msg_iovlen = 1; + msg.msg_name = NULL; + msg.msg_namelen = 0; + if (count == 0) + { + msg.msg_control = NULL; + msg.msg_controllen = 0; + } + else + { + if ((cmptr = malloc(controllen)) == NULL) + return EOF; + cmptr->cmsg_level = SOL_SOCKET; + cmptr->cmsg_type = SCM_RIGHTS; + cmptr->cmsg_len = controllen; + msg.msg_control = (caddr_t) cmptr; + msg.msg_controllen = controllen; + /* the fd to pass */ + memcpy(CMSG_DATA(cmptr), fds, count * sizeof(int)); + } + + if (sendmsg(Socket(*port), &msg, 0) != SEND_MSG_BUFFER_SIZE) + { + if (cmptr) + free(cmptr); + return EOF; + } + + if (cmptr) + free(cmptr); + + return 0; +} + + +/* + * Read a message from the specified connection carrying file descriptors + */ +int +pool_recvfds(PoolPort * port, int *fds, int count) 
+{ + int r; + uint n32; + char buf[SEND_MSG_BUFFER_SIZE]; + struct iovec iov[1]; + struct msghdr msg; + int controllen = sizeof(struct cmsghdr) + count * sizeof(int); + struct cmsghdr *cmptr = malloc(controllen); + + if (cmptr == NULL) + return EOF; + + iov[0].iov_base = buf; + iov[0].iov_len = SEND_MSG_BUFFER_SIZE; + msg.msg_iov = iov; + msg.msg_iovlen = 1; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_control = (caddr_t) cmptr; + msg.msg_controllen = controllen; + + r = recvmsg(Socket(*port), &msg, 0); + if (r < 0) + { + /* + * Report broken connection + */ + ereport(ERROR, + (errcode_for_socket_access(), + errmsg("could not receive data from client: %m"))); + goto failure; + } + else if (r == 0) + { + goto failure; + } + else if (r != SEND_MSG_BUFFER_SIZE) + { + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("incomplete message from client"))); + goto failure; + } + + /* Verify response */ + if (buf[0] != 'f') + { + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected message code"))); + goto failure; + } + + memcpy(&n32, buf + 1, 4); + n32 = ntohl(n32); + if (n32 != 8) + { + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid message size"))); + goto failure; + } + + /* + * If connection count is 0 it means pool does not have connections + * to fulfill request. Otherwise number of returned connections + * should be equal to requested count. If it not the case consider this + * a protocol violation. 
(Probably connection went out of sync) + */ + memcpy(&n32, buf + 5, 4); + n32 = ntohl(n32); + if (n32 == 0) + { + ereport(LOG, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("failed to acquire connections"))); + goto failure; + } + + if (n32 != count) + { + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected connection count"))); + goto failure; + } + + memcpy(fds, CMSG_DATA(cmptr), count * sizeof(int)); + free(cmptr); + return 0; +failure: + free(cmptr); + return EOF; +} diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c new file mode 100644 index 0000000000..02e5ddd5cd --- /dev/null +++ b/src/backend/pgxc/pool/poolmgr.c @@ -0,0 +1,1403 @@ +/*------------------------------------------------------------------------- + * + * poolmgr.c + * + * Connection pool manager handles connections to DataNodes + * + * The pooler runs as a separate process and is forked off from a + * coordinator postmaster. If the coordinator needs a connection from a + * data node, it asks for one from the pooler, which maintains separate + * pools for each data node. A group of connections can be requested in + * a single request, and the pooler returns a list of file descriptors + * to use for the connections. + * + * Note the current implementation does not yet shrink the pool over time + * as connections are idle. Also, it does not queue requests; if a + * connection is unavailable, it will simply fail. This should be implemented + * one day, although there is a chance for deadlocks. For now, limiting + * connections should be done between the application and coordinator. + * Still, this is useful to avoid having to re-establish connections to the + * data nodes all the time for multiple coordinator backend sessions. + * + * The term "agent" here refers to a session manager, one for each backend + * coordinator connection to the pooler. It will contain a list of connections + * allocated to a session, at most one per data node. 
+ * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * IDENTIFICATION + * $$ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <signal.h> +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "pgxc/poolmgr.h" +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "lib/stringinfo.h" +#include "libpq/pqformat.h" +#include "pgxc/locator.h" +#include "../interfaces/libpq/libpq-fe.h" +#include "postmaster/postmaster.h" /* For UnixSocketDir */ +#include <stdlib.h> +#include <string.h> +#include <sys/types.h> +#include <sys/socket.h> + +/* Configuration options */ +int NumDataNodes = 2; +int MinPoolSize = 1; +int MaxPoolSize = 100; +int PoolerPort = 6667; + +bool PersistentConnections = false; + +/* The memory context */ +static MemoryContext PoolerMemoryContext = NULL; + +/* Connection info */ +char *DataNodeHosts = NULL; +char *DataNodePorts = NULL; +char *DataNodeUsers = NULL; +char *DataNodePwds = NULL; + +/* Connection info list */ +static DataNodeConnectionInfo *connectionInfos; + +/* Pool to all the databases (linked list) */ +static DatabasePool *databasePools = NULL; + +/* PoolAgents */ +static int agentCount = 0; +static PoolAgent **poolAgents; + +static PoolHandle *Handle = NULL; + +static int server_fd = -1; + +static void agent_init(PoolAgent * agent, const char *database, List *nodes); +static void agent_destroy(PoolAgent * agent); +static void agent_create(void); +static void agent_handle_input(PoolAgent * agent, StringInfo s); +static DatabasePool *create_database_pool(const char *database, List *nodes); +static void insert_database_pool(DatabasePool * pool); +static int destroy_database_pool(const char *database); +static DatabasePool *find_database_pool(const char *database); +static DatabasePool *remove_database_pool(const char *database); +static int 
*agent_acquire_connections(PoolAgent * agent, List *nodelist); +static DataNodePoolSlot *acquire_connection(DatabasePool * dbPool, int node); +static void agent_release_connections(PoolAgent * agent, bool clean); +static void release_connection(DatabasePool * dbPool, DataNodePoolSlot * slot, int index, bool clean); +static void destroy_slot(DataNodePoolSlot * slot); +static void grow_pool(DatabasePool * dbPool, int index); +static void destroy_node_pool(DataNodePool * node_pool); +static void PoolerLoop(void); + +/* Signal handlers */ +static void pooler_die(SIGNAL_ARGS); +static void pooler_quickdie(SIGNAL_ARGS); + +/* Check status of connection */ +extern int pqReadReady(PGconn * conn); + +/* + * Flags set by interrupt handlers for later service in the main loop. + */ +static volatile sig_atomic_t shutdown_requested = false; + + +/* + * Initialize internal structures + */ +int +PoolManagerInit() +{ + char *rawstring; + List *elemlist; + ListCell *l; + int i; + MemoryContext old_context; + + elog(DEBUG1, "Pooler process is started: %d", getpid()); + + /* + * Set up memory context for the pooler + */ + PoolerMemoryContext = AllocSetContextCreate(TopMemoryContext, + "PoolerMemoryContext", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + + /* + * If possible, make this process a group leader, so that the postmaster + * can signal any child processes too. (pool manager probably never has any + * child processes, but for consistency we make all postmaster child + * processes do this.) 
+ */ +#ifdef HAVE_SETSID + if (setsid() < 0) + elog(FATAL, "setsid() failed: %m"); +#endif + /* + * Properly accept or ignore signals the postmaster might send us + */ + pqsignal(SIGINT, pooler_die); + pqsignal(SIGTERM, pooler_die); + pqsignal(SIGQUIT, pooler_quickdie); + pqsignal(SIGHUP, SIG_IGN); + /* TODO other signal handlers */ + + /* We allow SIGQUIT (quickdie) at all times */ +#ifdef HAVE_SIGPROCMASK + sigdelset(&BlockSig, SIGQUIT); +#else + BlockSig &= ~(sigmask(SIGQUIT)); +#endif + + /* + * Unblock signals (they were blocked when the postmaster forked us) + */ + PG_SETMASK(&UnBlockSig); + + /* Allocate pooler structures in the Pooler context */ + old_context = MemoryContextSwitchTo(PoolerMemoryContext); + + poolAgents = (PoolAgent **) palloc(MaxConnections * sizeof(PoolAgent *)); + if (poolAgents == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + connectionInfos = (DataNodeConnectionInfo *) palloc(NumDataNodes * sizeof(DataNodeConnectionInfo)); + if (connectionInfos == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + /* Need a modifiable copy */ + rawstring = pstrdup(DataNodeHosts); + + /* Parse string into list of identifiers */ + if (!SplitIdentifierString(rawstring, ',', &elemlist)) + { + /* syntax error in list */ + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid list syntax for \"data_node_hosts\""))); + } + + i = 0; + foreach(l, elemlist) + { + char *curhost = (char *) lfirst(l); + + connectionInfos[i].host = pstrdup(curhost); + if (connectionInfos[i].host == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + /* Ignore extra entries, if any */ + if (++i == NumDataNodes) + break; + } + list_free(elemlist); + pfree(rawstring); + + /* Validate */ + if (i == 0) + { + /* syntax error in list */ + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid list 
syntax for \"data_node_hosts\""))); + } + else if (i == 1) + { + /* Copy all values from first */ + for (; i < NumDataNodes; i++) + { + connectionInfos[i].host = pstrdup(connectionInfos[0].host); + if (connectionInfos[i].host == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + } + } + else if (i < NumDataNodes) + { + /* syntax error in list */ + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid list syntax for \"data_node_hosts\""))); + } + + /* Need a modifiable copy */ + rawstring = pstrdup(DataNodePorts); + + /* Parse string into list of identifiers */ + if (!SplitIdentifierString(rawstring, ',', &elemlist)) + { + /* syntax error in list */ + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid list syntax for \"data_node_ports\""))); + } + + i = 0; + foreach(l, elemlist) + { + char *curport = (char *) lfirst(l); + + connectionInfos[i].port = pstrdup(curport); + if (connectionInfos[i].port == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + /* Ignore extra entries, if any */ + if (++i == NumDataNodes) + break; + } + list_free(elemlist); + pfree(rawstring); + + /* Validate */ + if (i == 0) + { + /* syntax error in list */ + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid list syntax for \"data_node_ports\""))); + } + else if (i == 1) + { + /* Copy all values from first */ + for (; i < NumDataNodes; i++) + { + connectionInfos[i].port = pstrdup(connectionInfos[0].port); + if (connectionInfos[i].port == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + } + } + else if (i < NumDataNodes) + { + /* syntax error in list */ + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid list syntax for \"data_node_ports\""))); + } + + rawstring = pstrdup(DataNodeUsers); + + /* Parse string into list of identifiers */ + if 
(!SplitIdentifierString(rawstring, ',', &elemlist)) + { + /* syntax error in list */ + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid list syntax for \"data_node_users\""))); + } + + i = 0; + foreach(l, elemlist) + { + char *curuser = (char *) lfirst(l); + + connectionInfos[i].uname = pstrdup(curuser); + if (connectionInfos[i].uname == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + /* Ignore extra entries, if any */ + if (++i == NumDataNodes) + break; + } + list_free(elemlist); + pfree(rawstring); + + /* Validate */ + if (i == 0) + { + /* syntax error in list */ + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid list syntax for \"data_node_users\""))); + } + else if (i == 1) + { + /* Copy all values from first */ + for (; i < NumDataNodes; i++) + { + connectionInfos[i].uname = pstrdup(connectionInfos[0].uname); + if (connectionInfos[i].uname == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + } + } + else if (i < NumDataNodes) + { + /* syntax error in list */ + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid list syntax for \"data_node_users\""))); + } + + rawstring = pstrdup(DataNodePwds); + + /* Parse string into list of identifiers */ + if (!SplitIdentifierString(rawstring, ',', &elemlist)) + { + /* syntax error in list */ + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid list syntax for \"data_node_passwords\""))); + } + + i = 0; + foreach(l, elemlist) + { + char *curpassword = (char *) lfirst(l); + + connectionInfos[i].password = pstrdup(curpassword); + if (connectionInfos[i].password == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + /* Ignore extra entries, if any */ + if (++i == NumDataNodes) + break; + } + list_free(elemlist); + pfree(rawstring); + + /* Validate */ + if (i == 0) + { + /* syntax 
error in list */ + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid list syntax for \"data_node_passwords\""))); + } + else if (i == 1) + { + /* Copy all values from first */ + for (; i < NumDataNodes; i++) + { + connectionInfos[i].password = pstrdup(connectionInfos[0].password); + if (connectionInfos[i].password == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + } + } + else if (i < NumDataNodes) + { + /* syntax error in list */ + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid list syntax for \"data_node_passwords\""))); + } + + PoolerLoop(); + return 0; +} + + +/* + * Destroy internal structures + */ +int +PoolManagerDestroy(void) +{ + int status = 0; + + if (PoolerMemoryContext) + { + MemoryContextDelete(PoolerMemoryContext); + PoolerMemoryContext = NULL; + } + + return status; +} + + +/* + * Get handle to pool manager + * Invoked from Postmaster's main loop just before forking off new session + * Returned PoolHandle structure will be inherited by session process + */ +PoolHandle * +GetPoolManagerHandle(void) +{ + PoolHandle *handle; + int fdsock; + + /* Connect to the pooler */ + fdsock = pool_connect(PoolerPort, UnixSocketDir); + if (fdsock < 0) + { + int saved_errno = errno; + + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("failed to connect to pool manager: %m"))); + errno = saved_errno; + return NULL; + } + + /* Allocate handle */ + /* + * XXX we may change malloc here to palloc but first ensure + * the CurrentMemoryContext is properly set. + * The handle allocated just before new session is forked off and + * inherited by the session process. It should remain valid for all + * the session lifetime. 
+ */ + handle = (PoolHandle *) malloc(sizeof(PoolHandle)); + if (!handle) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + return NULL; + } + + handle->port.fdsock = fdsock; + handle->port.RecvLength = 0; + handle->port.RecvPointer = 0; + handle->port.SendPointer = 0; + + return handle; +} + + +/* + * Close handle + */ +void +PoolManagerCloseHandle(PoolHandle * handle) +{ + close(Socket(handle->port)); + free(handle); +} + + +/* + * Create agent + */ +static void +agent_create(void) +{ + int new_fd; + PoolAgent *agent; + + new_fd = accept(server_fd, NULL, NULL); + if (new_fd < 0) + { + int saved_errno = errno; + + ereport(LOG, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("pool manager failed to accept connection: %m"))); + errno = saved_errno; + return; + } + + /* Allocate agent */ + agent = (PoolAgent *) palloc(sizeof(PoolAgent)); + if (!agent) + { + close(new_fd); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + return; + } + + agent->port.fdsock = new_fd; + agent->port.RecvLength = 0; + agent->port.RecvPointer = 0; + agent->port.SendPointer = 0; + agent->pool = NULL; + agent->connections = NULL; + + /* Append new agent to the list */ + poolAgents[agentCount++] = agent; +} + + +/* + * Associate session with specified database and respective connection pool + * Invoked from Session process + */ +void +PoolManagerConnect(PoolHandle * handle, const char *database, List *nodes) +{ + Assert(Handle); + Assert(database); + + /* Save the handle */ + Handle = handle; + + /* Send database name followed by \0 terminator */ + pool_putmessage(&handle->port, 'c', database, strlen(database) + 1); + pool_flush(&handle->port); +} + + +/* + * Init PoolAgent +*/ +static void +agent_init(PoolAgent * agent, const char *database, List *nodes) +{ + Assert(agent); + Assert(database); + Assert(list_length(nodes) > 0); + + /* disconnect if we still connected */ + if (agent->pool) + agent_release_connections(agent, 
false); + + /* find database */ + agent->pool = find_database_pool(database); + + /* create if not found */ + if (agent->pool == NULL) + agent->pool = create_database_pool(database, nodes); +} + + +/* + * Destroy PoolAgent + */ +static void +agent_destroy(PoolAgent * agent) +{ + int i; + + Assert(agent); + + close(Socket(agent->port)); + + /* Discard connections if any remaining */ + if (agent->pool) + agent_release_connections(agent, false); + + /* find agent in the list */ + for (i = 0; i < agentCount; i++) + { + if (poolAgents[i] == agent) + { + /* free memory */ + if (agent->connections) + { + pfree(agent->connections); + agent->connections = NULL; + } + pfree(agent); + /* shrink the list and move last agent into the freed slot */ + if (i < --agentCount) + poolAgents[i] = poolAgents[agentCount]; + /* only one match is expected so exit */ + break; + } + } +} + + +/* + * Release handle to pool manager + */ +void +PoolManagerDisconnect(PoolHandle * handle) +{ + Assert(handle); + + pool_putmessage(&handle->port, 'd', NULL, 0); + pool_flush(&Handle->port); + + close(Socket(handle->port)); + + pfree(handle); +} + + +/* + * Get pooled connections + */ +int * +PoolManagerGetConnections(List *nodelist) +{ + int i; + ListCell *nodelist_item; + int *fds; + int nodes[list_length(nodelist) + 1]; + + Assert(Handle); + Assert(list_length(nodelist) > 0); + + /* Prepare end send message to pool manager */ + nodes[0] = htonl(list_length(nodelist)); + i = 1; + foreach(nodelist_item, nodelist) + { + nodes[i++] = htonl(nodelist_item->data.int_value); + } + pool_putmessage(&Handle->port, 'g', (char *) nodes, sizeof(int) * (list_length(nodelist) + 1)); + pool_flush(&Handle->port); + /* Receive response */ + fds = (int *) palloc(sizeof(int) * list_length(nodelist)); + if (fds == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + if (pool_recvfds(&Handle->port, fds, list_length(nodelist))) + { + pfree(fds); + return NULL; + } + return fds; 
+} + + +/* + * Handle messages to agent + */ +static void +agent_handle_input(PoolAgent * agent, StringInfo s) +{ + int qtype; + const char *database; + int nodecount; + List *nodelist = NIL; + int *fds; + int i; + + qtype = pool_getbyte(&agent->port); + /* + * We can have multiple messages, so handle them all + */ + for (;;) + { + switch (qtype) + { + case 'c': /* CONNECT */ + pool_getmessage(&agent->port, s, 0); + database = pq_getmsgstring(s); + agent_init(agent, database, GetAllNodes()); + pq_getmsgend(s); + break; + case 'd': /* DISCONNECT */ + pool_getmessage(&agent->port, s, 4); + agent_destroy(agent); + pq_getmsgend(s); + break; + case 'g': /* GET CONNECTIONS */ + pool_getmessage(&agent->port, s, 4 * NumDataNodes + 8); + nodecount = pq_getmsgint(s, 4); + for (i = 0; i < nodecount; i++) + { + nodelist = lappend_int(nodelist, pq_getmsgint(s, 4)); + } + pq_getmsgend(s); + /* + * In case of error agent_acquire_connections will log + * the error and return NULL + */ + fds = agent_acquire_connections(agent, nodelist); + list_free(nodelist); + pool_sendfds(&agent->port, fds, fds ? 
nodecount : 0); + if (fds) + pfree(fds); + break; + case 'r': /* RELEASE CONNECTIONS */ + pool_getmessage(&agent->port, s, 4); + pq_getmsgend(s); + agent_release_connections(agent, true); + break; + default: /* EOF or protocol violation */ + agent_destroy(agent); + return; + } + /* avoid reading from connection */ + if ((qtype = pool_pollbyte(&agent->port)) == EOF) + break; + } +} + + +/* + * acquire connection + */ +static int * +agent_acquire_connections(PoolAgent * agent, List *nodelist) +{ + int i; + int *result; + ListCell *nodelist_item; + + Assert(agent); + Assert(nodelist); + + /* Allocate memory */ + result = (int *) palloc(list_length(nodelist) * sizeof(int)); + if (result == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + /* initialize connection if it is not initialized yet */ + if (!agent->connections) + { + agent->connections = (DataNodePoolSlot **) palloc(NumDataNodes * sizeof(DataNodePoolSlot *)); + if (!agent->connections) + { + pfree(result); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + return NULL; + } + + for (i = 0; i < NumDataNodes; i++) + agent->connections[i] = NULL; + } + + /* Initialize result */ + i = 0; + foreach(nodelist_item, nodelist) + { + int node = nodelist_item->data.int_value; + + /* Acquire from the pool if none */ + if (agent->connections[node - 1] == NULL) + { + DataNodePoolSlot *slot = acquire_connection(agent->pool, node); + + /* Handle failure */ + if (slot == NULL) + { + pfree(result); + return NULL; + } + + /* Store in the descriptor */ + agent->connections[node - 1] = slot; + } + + result[i++] = PQsocket((PGconn *) agent->connections[node - 1]->conn); + } + + return result; +} + + +/* + * Retun connections back to the pool + */ +void +PoolManagerReleaseConnections() +{ + Assert(Handle); + + pool_putmessage(&Handle->port, 'r', NULL, 0); + pool_flush(&Handle->port); +} + + +/* + * Release connections + */ +static void 
+agent_release_connections(PoolAgent * agent, bool clean) +{ + int i; + + if (!agent->connections) + return; + + /* Enumerate connections */ + for (i = 0; i < NumDataNodes; i++) + { + DataNodePoolSlot *slot; + + slot = agent->connections[i]; + + /* Release connection */ + if (slot) + { + release_connection(agent->pool, slot, i, clean); + } + agent->connections[i] = NULL; + } +} + + +/* + * Create new empty pool for a database and insert into the list + * Returns POOL_OK if operation succeed POOL_FAIL in case of OutOfMemory + * error and POOL_WEXIST if poll for this database already exist + */ +static DatabasePool * +create_database_pool(const char *database, List *nodes) +{ + DatabasePool *databasePool; + int i; + ListCell *l; + + Assert(nodes && nodes->length > 0); + + /* check if exist */ + databasePool = find_database_pool(database); + if (databasePool) + { + /* already exist */ + return databasePool; + } + + /* Allocate memory */ + databasePool = (DatabasePool *) palloc(sizeof(DatabasePool)); + if (!databasePool) + { + /* out of memory */ + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + return NULL; + } + + /* Copy the database name */ ; + databasePool->database = pstrdup(database); + if (!databasePool->database) + { + /* out of memory */ + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + pfree(databasePool); + return NULL; + } + + /* Init next reference */ + databasePool->next = NULL; + + /* Init data node pools */ + databasePool->nodePools = (DataNodePool **) palloc(NumDataNodes * sizeof(DataNodePool **)); + if (!databasePool->nodePools) + { + /* out of memory */ + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + pfree(databasePool->database); + pfree(databasePool); + return NULL; + } + for (i = 0; i < NumDataNodes; i++) + databasePool->nodePools[i] = NULL; + + foreach(l, nodes) + { + int nodeid = lfirst_int(l); + + grow_pool(databasePool, nodeid - 1); + } + + /* 
Insert into the list */ + insert_database_pool(databasePool); + + return databasePool; +} + + +/* + * Destroy the pool and free memory + */ +static int +destroy_database_pool(const char *database) +{ + DatabasePool *databasePool; + int i; + + /* Delete from the list */ + databasePool = remove_database_pool(database); + if (databasePool) + { + if (databasePool->nodePools) + { + for (i = 0; i < NumDataNodes; i++) + if (databasePool->nodePools[i]) + destroy_node_pool(databasePool->nodePools[i]); + pfree(databasePool->nodePools); + } + /* free allocated memory */ + pfree(databasePool->database); + pfree(databasePool); + return 1; + } + return 0; +} + + +/* + * Insert new database pool to the list + */ +static void +insert_database_pool(DatabasePool * databasePool) +{ + Assert(databasePool); + + /* Reference existing list or null the tail */ + if (databasePools) + databasePool->next = databasePools; + else + databasePool->next = NULL; + + /* Update head pointer */ + databasePools = databasePool; +} + + +/* + * Find pool for specified database in the list + */ +static DatabasePool +* +find_database_pool(const char *database) +{ + DatabasePool *databasePool; + + /* Scan the list */ + databasePool = databasePools; + while (databasePool) + { + + /* if match break the loop and return */ + if (strcmp(database, databasePool->database) == 0) + break; + databasePool = databasePool->next; + + } + return databasePool; +} + + +/* + * Remove pool for specified database from the list + */ +static DatabasePool +* +remove_database_pool(const char *database) +{ + DatabasePool *databasePool, + *prev; + + /* Scan the list */ + databasePool = databasePools; + prev = NULL; + while (databasePool) + { + + /* if match break the loop and return */ + if (strcmp(database, databasePool->database) == 0) + break; + prev = databasePool; + databasePool = databasePool->next; + } + + /* if found */ + if (databasePool) + { + + /* Remove entry from chain or update head */ + if (prev) + prev->next = 
databasePool->next; + else + databasePools = databasePool->next; + + + databasePool->next = NULL; + } + return databasePool; +} + +/* + * Acquire connection + */ +static DataNodePoolSlot * +acquire_connection(DatabasePool * dbPool, int node) +{ + DataNodePool *nodePool; + DataNodePoolSlot *slot; + + Assert(dbPool); + Assert(0 <= node && node < NumDataNodes); + + slot = NULL; + /* Find referenced node pool */ + nodePool = dbPool->nodePools[node - 1]; + if (nodePool == NULL || nodePool->freeSize == 0) + { + grow_pool(dbPool, node - 1); + nodePool = dbPool->nodePools[node - 1]; + } + + /* Check available connections */ + if (nodePool && nodePool->freeSize > 0) + { + int poll_result; + + while (nodePool->freeSize > 0) + { + slot = nodePool->slot[--(nodePool->freeSize)]; + + retry: + /* Make sure connection is ok */ + poll_result = pqReadReady(slot->conn); + + if (poll_result == 0) + { + /* ok, no data */ + break; + } + else if (poll_result < 0) + { + if (errno == EAGAIN || errno == EINTR) + goto retry; + + elog(WARNING, "Error in checking connection, errno = %d", errno); + } + else + { + elog(WARNING, "Unexpected data on connection, cleaning."); + } + + destroy_slot(slot); + /* Decrement current max pool size */ + (nodePool->size)--; + /* Ensure we are not below minimum size */ + grow_pool(dbPool, node - 1); + } + } + else + { + /* report problem */ + ereport(LOG, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("connection pool is empty"))); + } + return slot; +} + + +/* + * release connection from specified pool and slot + */ +static void +release_connection(DatabasePool * dbPool, DataNodePoolSlot * slot, int index, bool clean) +{ + DataNodePool *nodePool; + + Assert(dbPool); + Assert(slot); + Assert(0 <= index && index < NumDataNodes); + + /* Find referenced node pool */ + nodePool = dbPool->nodePools[index]; + if (nodePool == NULL) + { + /* report problem */ + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("database does not use node %d", (index + 
1)))); + return; + } + + /* return or discard */ + if (clean) + { + /* Insert the slot into the array and increase pool size */ + nodePool->slot[(nodePool->freeSize)++] = slot; + } + else + { + elog(DEBUG1, "Cleaning up connection from pool %s, closing", nodePool->connstr); + destroy_slot(slot); + /* Decrement pool size */ + (nodePool->size)--; + /* Ensure we are not below minimum size */ + grow_pool(dbPool, index); + } +} + + +/* + * Increase database pool size + */ +static void +grow_pool(DatabasePool * dbPool, int index) +{ + DataNodePool *nodePool; + + Assert(dbPool); + Assert(0 <= index && index < NumDataNodes); + + /* Find referenced node pool */ + nodePool = dbPool->nodePools[index]; + if (!nodePool) + { + /* Allocate new DBNode Pool */ + nodePool = (DataNodePool *) palloc(sizeof(DataNodePool)); + if (!nodePool) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + /* initialize it */ + nodePool->connstr = DataNodeConnStr( + connectionInfos[index].host, + connectionInfos[index].port, + dbPool->database, + connectionInfos[index].uname, + connectionInfos[index].password); + + if (!nodePool->connstr) + { + pfree(nodePool); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + nodePool->slot = (DataNodePoolSlot **) palloc(MaxPoolSize * sizeof(DataNodePoolSlot *)); + if (!nodePool->slot) + { + pfree(nodePool); + pfree(nodePool->connstr); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + memset(nodePool->slot, 0, MaxPoolSize * sizeof(DataNodePoolSlot *)); + nodePool->freeSize = 0; + nodePool->size = 0; + + /* and insert into the array */ + dbPool->nodePools[index] = nodePool; + } + + while (nodePool->size < MinPoolSize || (nodePool->freeSize == 0 && nodePool->size < MaxPoolSize)) + { + DataNodePoolSlot *slot; + + /* Allocate new slot */ + slot = (DataNodePoolSlot *) palloc(sizeof(DataNodePoolSlot)); + if (slot == NULL) + { + ereport(ERROR, + 
(errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + /* Establish connection */ + slot->conn = DataNodeConnect(nodePool->connstr); + if (!DataNodeConnected(slot->conn)) + { + destroy_slot(slot); + ereport(LOG, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("failed to connect to data node"))); + break; + } + + /* Insert at the end of the pool */ + nodePool->slot[(nodePool->freeSize)++] = slot; + + /* Increase count of pool size */ + (nodePool->size)++; + elog(DEBUG1, "Pooler: increased pool size to %d for pool %s", + nodePool->size, + nodePool->connstr); + } +} + + +/* + * Destroy pool slot + */ +static void +destroy_slot(DataNodePoolSlot * slot) +{ + DataNodeClose(slot->conn); + pfree(slot); +} + + +/* + * Destroy node pool + */ +static void +destroy_node_pool(DataNodePool * node_pool) +{ + int i; + + /* + * At this point all agents using connections from this pool should be already closed + * If this not the connections to the data nodes assigned to them remain open, this will + * consume data node resources. + * I believe this is not the case because pool is only destroyed on coordinator shutdown. + * However we should be careful when changing thinds + */ + elog(DEBUG1, "About to destroy node pool %s, current size is %d, %d connections are in use", + node_pool->connstr, node_pool->freeSize, node_pool->size - node_pool->freeSize); + if (node_pool->connstr) + pfree(node_pool->connstr); + + if (node_pool->slot) + { + for (i = 0; i < node_pool->freeSize; i++) + destroy_slot(node_pool->slot[i]); + pfree(node_pool->slot); + } +} + + +/* + * Main handling loop + */ +static void +PoolerLoop(void) +{ + StringInfoData input_message; + + server_fd = pool_listen(PoolerPort, UnixSocketDir); + if (server_fd == -1) + { + /* log error */ + return; + } + initStringInfo(&input_message); + for (;;) + { + int nfds; + fd_set rfds; + int retval; + int i; + + /* + * Emergency bailout if postmaster has died. 
This is to avoid the + * necessity for manual cleanup of all postmaster children. + */ + if (!PostmasterIsAlive(true)) + exit(1); + + /* watch for incoming connections */ + FD_ZERO(&rfds); + FD_SET (server_fd, &rfds); + + nfds = server_fd; + + /* watch for incoming messages */ + for (i = 0; i < agentCount; i++) + { + PoolAgent *agent = poolAgents[i]; + int sockfd = Socket(agent->port); + FD_SET (sockfd, &rfds); + + nfds = Max(nfds, sockfd); + } + + /* wait for event */ + retval = select(nfds + 1, &rfds, NULL, NULL, NULL); + if (shutdown_requested) + { + for (i = agentCount - 1; i >= 0; i--) + { + PoolAgent *agent = poolAgents[i]; + + agent_destroy(agent); + } + while (databasePools) + if (destroy_database_pool(databasePools->database) == 0) + break; + close(server_fd); + exit(0); + } + if (retval > 0) + { + /* + * Agent may be removed from the array while processing + * and trailing items are shifted, so scroll downward + * to avoid problem + */ + for (i = agentCount - 1; i >= 0; i--) + { + PoolAgent *agent = poolAgents[i]; + int sockfd = Socket(agent->port); + + if (FD_ISSET(sockfd, &rfds)) + agent_handle_input(agent, &input_message); + } + if (FD_ISSET(server_fd, &rfds)) + agent_create(); + } + } +} + + +/* + * + */ +static void +pooler_die(SIGNAL_ARGS) +{ + shutdown_requested = true; +} + + +/* + * + */ +static void +pooler_quickdie(SIGNAL_ARGS) +{ + PG_SETMASK(&BlockSig); + exit(2); +} diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 3dbf36a6cf..0dd252cb62 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -34,6 +34,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * * IDENTIFICATION @@ -102,6 +103,13 @@ #include "libpq/libpq.h" #include "libpq/pqsignal.h" #include "miscadmin.h" +#ifdef PGXC +#include 
"pgxc/pgxc.h" +/* COORD */ +#include "pgxc/locator.h" +#include "pgxc/poolmgr.h" +#include "access/gtm.h" +#endif #include "pgstat.h" #include "postmaster/autovacuum.h" #include "postmaster/fork_process.h" @@ -204,6 +212,9 @@ char *bonjour_name; /* PIDs of special child processes; 0 when not running */ static pid_t StartupPID = 0, +#ifdef PGXC /* PGXC_COORD */ + PgPoolerPID = 0, +#endif /* PGXC_COORD */ BgWriterPID = 0, WalWriterPID = 0, AutoVacPID = 0, @@ -442,6 +453,12 @@ static void ShmemBackendArrayAdd(Backend *bn); static void ShmemBackendArrayRemove(Backend *bn); #endif /* EXEC_BACKEND */ +#ifdef PGXC /* PGXC_COORD */ +bool isPGXCCoordinator = false; +bool isPGXCDataNode = false; +#define StartPoolManager() StartChildProcess(PoolerProcess) +#endif + #define StartupDataBase() StartChildProcess(StartupProcess) #define StartBackgroundWriter() StartChildProcess(BgWriterProcess) #define StartWalWriter() StartChildProcess(WalWriterProcess) @@ -461,6 +478,9 @@ PostmasterMain(int argc, char *argv[]) int status; char *userDoption = NULL; int i; +#ifdef PGXC /* PGXC_COORD */ + MemoryContext oldcontext; +#endif MyProcPid = PostmasterPid = getpid(); @@ -506,7 +526,11 @@ PostmasterMain(int argc, char *argv[]) * tcop/postgres.c (the option sets should not conflict) and with the * common help() function in main/main.c. 
*/ +#ifdef PGXC + while ((opt = getopt(argc, argv, "A:B:Cc:D:d:EeFf:h:ijk:lN:nOo:Pp:r:S:sTt:W:X-:")) != -1) +#else while ((opt = getopt(argc, argv, "A:B:c:D:d:EeFf:h:ijk:lN:nOo:Pp:r:S:sTt:W:-:")) != -1) +#endif { switch (opt) { @@ -517,6 +541,11 @@ PostmasterMain(int argc, char *argv[]) case 'B': SetConfigOption("shared_buffers", optarg, PGC_POSTMASTER, PGC_S_ARGV); break; +#ifdef PGXC + case 'C': + isPGXCCoordinator = true; + break; +#endif case 'D': userDoption = optarg; @@ -638,6 +667,11 @@ PostmasterMain(int argc, char *argv[]) SetConfigOption("post_auth_delay", optarg, PGC_POSTMASTER, PGC_S_ARGV); break; +#ifdef PGXC + case 'X': + isPGXCDataNode = true; + break; +#endif case 'c': case '-': { @@ -673,6 +707,14 @@ PostmasterMain(int argc, char *argv[]) } } +#ifdef PGXC + if (!IS_PGXC_COORDINATOR && !IS_PGXC_DATANODE) + { + write_stderr("%s: PG-XC: must start as either a Coordinator (-C) or Data Node (-X)\n", + progname); + ExitPostmaster(1); + } +#endif /* * Postmaster accepts no non-option switch arguments. */ @@ -1037,6 +1079,20 @@ PostmasterMain(int argc, char *argv[]) Assert(StartupPID != 0); pmState = PM_STARTUP; +#ifdef PGXC /* PGXC_COORD */ + if (IS_PGXC_COORDINATOR) + { + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + + /* + * Initialize the Data Node connection pool + */ + PgPoolerPID = StartPoolManager(); + + MemoryContextSwitchTo(oldcontext); + } +#endif + status = ServerLoop(); /* @@ -1393,6 +1449,11 @@ ServerLoop(void) if (PgStatPID == 0 && pmState == PM_RUN) PgStatPID = pgstat_start(); +#ifdef PGXC /* PGXC_COORD */ + /* If we have lost the pooler, try to start a new one */ + if (IS_PGXC_COORDINATOR && PgPoolerPID == 0 && pmState == PM_RUN) + PgPoolerPID = StartPoolManager(); +#endif /* * Touch the socket and lock file every 58 minutes, to ensure that * they are not removed by overzealous /tmp-cleaning tasks. 
We assume @@ -1990,6 +2051,10 @@ SIGHUP_handler(SIGNAL_ARGS) SignalChildren(SIGHUP); if (StartupPID != 0) signal_child(StartupPID, SIGHUP); +#ifdef PGXC /* PGXC_COORD */ + if (IS_PGXC_COORDINATOR && PgPoolerPID != 0) + signal_child(PgPoolerPID, SIGHUP); +#endif if (BgWriterPID != 0) signal_child(BgWriterPID, SIGHUP); if (WalWriterPID != 0) @@ -2062,6 +2127,11 @@ pmdie(SIGNAL_ARGS) /* and the walwriter too */ if (WalWriterPID != 0) signal_child(WalWriterPID, SIGTERM); +#ifdef PGXC /* PGXC_COORD */ + /* and the pool manager too */ + if (IS_PGXC_COORDINATOR && PgPoolerPID != 0) + signal_child(PgPoolerPID, SIGTERM); +#endif pmState = PM_WAIT_BACKUP; } @@ -2108,6 +2178,11 @@ pmdie(SIGNAL_ARGS) /* and the walwriter too */ if (WalWriterPID != 0) signal_child(WalWriterPID, SIGTERM); +#ifdef PGXC /* PGXC_COORD */ + /* and the pool manager too */ + if (IS_PGXC_COORDINATOR && PgPoolerPID != 0) + signal_child(PgPoolerPID, SIGTERM); +#endif pmState = PM_WAIT_BACKENDS; } @@ -2131,6 +2206,10 @@ pmdie(SIGNAL_ARGS) SignalChildren(SIGQUIT); if (StartupPID != 0) signal_child(StartupPID, SIGQUIT); +#ifdef PGXC /* PGXC_COORD */ + if (IS_PGXC_COORDINATOR && PgPoolerPID != 0) + signal_child(PgPoolerPID, SIGQUIT); +#endif if (BgWriterPID != 0) signal_child(BgWriterPID, SIGQUIT); if (WalWriterPID != 0) @@ -2266,6 +2345,10 @@ reaper(SIGNAL_ARGS) PgArchPID = pgarch_start(); if (PgStatPID == 0) PgStatPID = pgstat_start(); +#ifdef PGXC /* PGXC_COORD */ + if (IS_PGXC_COORDINATOR && PgPoolerPID == 0) + PgPoolerPID = StartPoolManager(); +#endif /* at this point we are really open for business */ ereport(LOG, @@ -2403,6 +2486,21 @@ reaper(SIGNAL_ARGS) continue; } +#ifdef PGXC /* PGXC_COORD */ + /* + * Was it the pool manager? 
TODO decide how to handle + * Probably we should restart the system + */ + if (IS_PGXC_COORDINATOR && pid == PgPoolerPID) + { + PgPoolerPID = 0; + if (!EXIT_STATUS_0(exitstatus)) + HandleChildCrash(pid, exitstatus, + _("pool manager process")); + continue; + } +#endif + /* * Else do standard backend child cleanup. */ @@ -2594,6 +2692,23 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) signal_child(AutoVacPID, (SendStop ? SIGSTOP : SIGQUIT)); } +#ifdef PGXC /* PGXC_COORD */ + /* Take care of the pool manager too */ + if (IS_PGXC_COORDINATOR) + { + if (pid == PgPoolerPID) + PgPoolerPID = 0; + else if (PgPoolerPID != 0 && !FatalError) + { + ereport(DEBUG2, + (errmsg_internal("sending %s to process %d", + (SendStop ? "SIGSTOP" : "SIGQUIT"), + (int) PgPoolerPID))); + signal_child(PgPoolerPID, (SendStop ? SIGSTOP : SIGQUIT)); + } + } +#endif + /* * Force a power-cycle of the pgarch process too. (This isn't absolutely * necessary, but it seems like a good idea for robustness, and it @@ -2724,6 +2839,9 @@ PostmasterStateMachine(void) */ if (CountChildren() == 0 && StartupPID == 0 && +#ifdef PGXC /* PGXC_COORD */ + PgPoolerPID == 0 && +#endif (BgWriterPID == 0 || !FatalError) && WalWriterPID == 0 && AutoVacPID == 0) @@ -2798,6 +2916,9 @@ PostmasterStateMachine(void) PgArchPID == 0 && PgStatPID == 0) { /* These other guys should be dead already */ +#ifdef PGXC /* PGXC_COORD */ + Assert(PgPoolerPID == 0); +#endif Assert(StartupPID == 0); Assert(BgWriterPID == 0); Assert(WalWriterPID == 0); @@ -2942,6 +3063,9 @@ BackendStartup(Port *port) { Backend *bn; /* for backend cleanup */ pid_t pid; +#ifdef PGXC /* PGXC_COORD */ + PoolHandle *pool_handle; +#endif /* * Create backend data structure. 
Better before the fork() so we can @@ -2977,12 +3101,31 @@ BackendStartup(Port *port) else bn->child_slot = 0; +#ifdef PGXC /* PGXC_COORD */ + if (IS_PGXC_COORDINATOR) + { + pool_handle = GetPoolManagerHandle(); + if (pool_handle == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("Can not connect to pool manager"))); + return STATUS_ERROR; + } + } +#endif + + #ifdef EXEC_BACKEND pid = backend_forkexec(port); #else /* !EXEC_BACKEND */ pid = fork_process(); if (pid == 0) /* child */ { + //// FOR DEBUG + printf("The session started: %d\n", getpid()); + //sleep(60); + //// FOR DEBUG free(bn); /* @@ -3005,11 +3148,25 @@ BackendStartup(Port *port) /* Perform additional initialization and client authentication */ BackendInitialize(port); +#ifdef PGXC /* PGXC_COORD */ + if (IS_PGXC_COORDINATOR) + { + /* User is authenticated and dbname is known at this point */ + PoolManagerConnect(pool_handle, port->database_name, GetAllNodes()); + InitGTM(); + } +#endif + /* And run the backend */ proc_exit(BackendRun(port)); } #endif /* EXEC_BACKEND */ +#ifdef PGXC /* PGXC_COORD */ + if (IS_PGXC_COORDINATOR) + PoolManagerCloseHandle(pool_handle); +#endif + if (pid < 0) { /* in parent, fork failed */ @@ -4236,6 +4393,12 @@ StartChildProcess(AuxProcType type) errno = save_errno; switch (type) { +#ifdef PGXC /* PGXC_COORD */ + case PoolerProcess: + ereport(LOG, + (errmsg("could not fork pool manager process: %m"))); + break; +#endif case StartupProcess: ereport(LOG, (errmsg("could not fork startup process: %m"))); diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index e71b95c826..8ce8be820e 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -20,6 +20,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * * IDENTIFICATION 
@@ -38,6 +39,12 @@ #include "miscadmin.h" #include "storage/procarray.h" #include "utils/snapmgr.h" +#ifdef PGXC +#include "pgxc/pgxc.h" +#include "access/gtm.h" +/* PGXC_DATANODE */ +#include "postmaster/autovacuum.h" +#endif /* Our shared memory area */ @@ -90,6 +97,27 @@ static void DisplayXidCache(void); #define xc_slow_answer_inc() ((void) 0) #endif /* XIDCACHE_DEBUG */ +#ifdef PGXC /* PGXC_DATANODE */ +typedef enum +{ + SNAPSHOT_UNDEFINED, /* Coordinator has not sent snapshot or not yet connected */ + SNAPSHOT_LOCAL, /* Coordinator has instructed data node to build up snapshot from the local procarray */ + SNAPSHOT_COORDINATOR, /* Coordinator has sent snapshot data */ + SNAPSHOT_DIRECT /* Data Node obtained directly from GTM */ +} SnapshotSource; + +void SetGlobalSnapshotData(int xmin, int xmax, int xcnt, int *xip); +void UnsetGlobalSnapshotData(void); +static bool GetSnapshotDataDataNode(Snapshot snapshot); +static bool GetSnapshotDataCoordinator(Snapshot snapshot); +/* Global snapshot data */ +static SnapshotSource snapshot_source = SNAPSHOT_UNDEFINED; +static int gxmin = InvalidTransactionId; +static int gxmax = InvalidTransactionId; +static int gxcnt = 0; +static int *gxip = NULL; +#endif + /* * Report shared-memory space needed by CreateSharedProcArray. @@ -682,6 +710,46 @@ GetSnapshotData(Snapshot snapshot) int count = 0; int subcount = 0; + +#ifdef PGXC /* PGXC_DATANODE */ + /* + * The typical case is that the coordinator passes down the snapshot to the + * data nodes to use, while it itselfs obtains them from GTM. + * The data nodes may however connect directly to GTM themselves to obtain + * XID and snapshot information for autovacuum worker threads. + */ + if (IS_PGXC_DATANODE) + { + if (GetSnapshotDataDataNode(snapshot)) + return snapshot; + /* else fallthrough */ + } else if (IS_PGXC_COORDINATOR) + { + if (GetSnapshotDataCoordinator(snapshot)) + return snapshot; + /* else fallthrough */ + } + + /* If we have no snapshot, we will use a local one. 
+ * If we are in normal mode, we output a warning though. + * We currently fallback and use a local one at initdb time, + * as well as when a new connection occurs. + * IsPostmasterEnvironment - checks for initdb + * IsNormalProcessingMode() - checks for new connections + */ + if (IS_PGXC_DATANODE && snapshot_source == SNAPSHOT_UNDEFINED + && IsPostmasterEnvironment && IsNormalProcessingMode()) + { + elog(WARNING, "Do not have a GTM snapshot available"); + } +#endif + + /* + * Fallback to standard routine, calculate snapshot from local proc arrey + * if no master connection + */ + + Assert(snapshot != NULL); /* @@ -828,6 +896,9 @@ GetSnapshotData(Snapshot snapshot) snapshot->curcid = GetCurrentCommandId(false); +#ifdef PGXC + elog(DEBUG1, "Local snapshot is built, xmin: %d, xmax: %d, xcnt: %d, RecentGlobalXmin: %d", xmin, xmax, count, globalxmin); +#endif /* * This is a new snapshot, so set both refcounts are zero, and mark it as * not copied in persistent memory. @@ -1400,3 +1471,262 @@ DisplayXidCache(void) } #endif /* XIDCACHE_DEBUG */ + + +#ifdef PGXC +/* + * Store snapshot data received from the coordinator + */ +void +SetGlobalSnapshotData(int xmin, int xmax, int xcnt, int *xip) +{ + snapshot_source = SNAPSHOT_COORDINATOR; + gxmin = xmin; + gxmax = xmax; + gxcnt = xcnt; + if (gxip) + free(gxip); + gxip = xip; + elog (DEBUG1, "global snapshot info: gxmin: %d, gxmax: %d, gxcnt: %d", gxmin, gxmax, gxcnt); +} + +/* + * Force datanode to use local snapshot data + */ +void +UnsetGlobalSnapshotData(void) +{ + snapshot_source = SNAPSHOT_UNDEFINED; + gxmin = InvalidTransactionId; + gxmax = InvalidTransactionId; + gxcnt = 0; + if (gxip) + free(gxip); + gxip = NULL; + elog (DEBUG1, "unset snapshot info"); +} + +/* + * Get snapshot data for data node + * This is usually passed down from the coordinator + * + * returns whether or not to return immediately with snapshot + */ +static bool +GetSnapshotDataDataNode(Snapshot snapshot) +{ + Assert(IS_PGXC_DATANODE); + + + if 
(IsAutoVacuumWorkerProcess() || GetForceXidFromGTM()) + { + GTM_Snapshot gtm_snapshot; + bool canbe_grouped = (!FirstSnapshotSet) || (!IsXactIsoLevelSerializable); + elog(DEBUG1, "Getting snapshot for autovacuum. Current XID = %d", GetCurrentTransactionId()); + gtm_snapshot = GetSnapshotGTM(GetCurrentTransactionId(), canbe_grouped); + + if (!gtm_snapshot) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("GTM error, could not obtain snapshot"))); + else { + snapshot_source = SNAPSHOT_DIRECT; + gxmin = gtm_snapshot->sn_xmin; + gxmax = gtm_snapshot->sn_xmax; + gxcnt = gtm_snapshot->sn_xcnt; + RecentGlobalXmin = gtm_snapshot->sn_recent_global_xmin; + if (gxip) + free(gxip); + if (gxcnt > 0) + { + gxip = malloc(gxcnt * 4); + if (gxip == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + memcpy(gxip, gtm_snapshot->sn_xip, gxcnt * 4); + } + else + gxip = NULL; + elog(DEBUG1, "for autovacuum from GTM: xmin = %d, xmax = %d, xcnt = %d, RecGlobXmin = %d", + gxmin, gxmax, gxcnt, RecentGlobalXmin); + } + } + + if ((snapshot_source == SNAPSHOT_COORDINATOR || snapshot_source == SNAPSHOT_DIRECT) + && TransactionIdIsValid(gxmin)) + { + snapshot->xmin = gxmin; + snapshot->xmax = gxmax; + snapshot->xcnt = gxcnt; + /* + * Allocating space for maxProcs xids is usually overkill; numProcs would + * be sufficient. But it seems better to do the malloc while not holding + * the lock, so we can't look at numProcs. Likewise, we allocate much + * more subxip storage than is probably needed. + * + * This does open a possibility for avoiding repeated malloc/free: since + * maxProcs does not change at runtime, we can simply reuse the previous + * xip arrays if any. (This relies on the fact that all callers pass + * static SnapshotData structs.) 
*/ + if (snapshot->xip == NULL) + { + ProcArrayStruct *arrayP = procArray; + /* + * First call for this snapshot + */ + snapshot->xip = (TransactionId *) + malloc(arrayP->maxProcs * sizeof(TransactionId)); + if (snapshot->xip == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + Assert(snapshot->subxip == NULL); + snapshot->subxip = (TransactionId *) + malloc(arrayP->maxProcs * PGPROC_MAX_CACHED_SUBXIDS * sizeof(TransactionId)); + if (snapshot->subxip == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + memcpy(snapshot->xip, gxip, gxcnt * sizeof(TransactionId)); + snapshot->curcid = GetCurrentCommandId(false); + + if (!TransactionIdIsValid(MyProc->xmin)) + MyProc->xmin = TransactionXmin = gxmin; + + /* + * We should update RecentXmin here. But we have recently seen some + * issues with that - so skipping it for the time being. + * + * !!TODO + */ + RecentXmin = gxmin; + + /* PGXCTODO - set this until we handle subtransactions. */ + snapshot->subxcnt = 0; + + /* + * This is a new snapshot, so set both refcounts are zero, and mark it + * as not copied in persistent memory. 
+ */ + snapshot->active_count = 0; + snapshot->regd_count = 0; + snapshot->copied = false; + + return true; + } + return false; +} + +/* + * Get snapshot data for coordinator + * It will later be passed down to data nodes + * + * returns whether or not to return immediately with snapshot + */ +static bool +GetSnapshotDataCoordinator(Snapshot snapshot) +{ + bool canbe_grouped; + GTM_Snapshot gtm_snapshot; + + + Assert (IS_PGXC_COORDINATOR); + + canbe_grouped = (!FirstSnapshotSet) || (!IsXactIsoLevelSerializable); + gtm_snapshot = GetSnapshotGTM(GetCurrentGlobalTransactionId(), canbe_grouped); + + if (!gtm_snapshot) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("GTM error, could not obtain snapshot"))); + else { + snapshot->xmin = gtm_snapshot->sn_xmin; + snapshot->xmax = gtm_snapshot->sn_xmax; + snapshot->recent_global_xmin = gtm_snapshot->sn_recent_global_xmin; + snapshot->xcnt = gtm_snapshot->sn_xcnt; + elog(DEBUG1, "from GTM: xmin = %d, xmax = %d, xcnt = %d, RecGlobXmin = %d", + snapshot->xmin, snapshot->xmax, snapshot->xcnt, snapshot->recent_global_xmin); + /* + * Allocating space for maxProcs xids is usually overkill; numProcs would + * be sufficient. But it seems better to do the malloc while not holding + * the lock, so we can't look at numProcs. Likewise, we allocate much + * more subxip storage than is probably needed. + * + * This does open a possibility for avoiding repeated malloc/free: since + * maxProcs does not change at runtime, we can simply reuse the previous + * xip arrays if any. (This relies on the fact that all callers pass + * static SnapshotData structs.) 
+ */ + if (snapshot->xip == NULL) + { + ProcArrayStruct *arrayP = procArray; + /* + * First call for this snapshot + */ + snapshot->xip = (TransactionId *) + malloc(Max(arrayP->maxProcs, gtm_snapshot->sn_xcnt) * sizeof(TransactionId)); + if (snapshot->xip == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + snapshot->max_xcnt = Max(arrayP->maxProcs, gtm_snapshot->sn_xcnt); + + /* + * FIXME + * + * We really don't support subtransaction in PGXC right now, but + * when we would, we should fix the allocation below + */ + Assert(snapshot->subxip == NULL); + snapshot->subxip = (TransactionId *) + malloc(arrayP->maxProcs * PGPROC_MAX_CACHED_SUBXIDS * sizeof(TransactionId)); + + if (snapshot->subxip == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + else if (snapshot->max_xcnt < gtm_snapshot->sn_xcnt) + { + snapshot->xip = (TransactionId *) + realloc(snapshot->xip, gtm_snapshot->sn_xcnt * sizeof(TransactionId)); + if (snapshot->xip == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + snapshot->max_xcnt = gtm_snapshot->sn_xcnt; + } + + memcpy(snapshot->xip, gtm_snapshot->sn_xip, gtm_snapshot->sn_xcnt * sizeof(TransactionId)); + snapshot->curcid = GetCurrentCommandId(false); + + if (!TransactionIdIsValid(MyProc->xmin)) + MyProc->xmin = TransactionXmin = snapshot->xmin; + + /* + * We should update RecentXmin here. But we have recently seen some + * issues with that - so skipping it for the time being. + * + * !!TODO + */ + + /* PGXCTODO - set this until we handle subtransactions. */ + snapshot->subxcnt = 0; + /* + * This is a new snapshot, so set both refcounts are zero, and mark it + * as not copied in persistent memory. 
+ */ + snapshot->active_count = 0; + snapshot->regd_count = 0; + snapshot->copied = false; + return true; + } + return false; +} +#endif /* PGXC */ diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 43e912f5cf..34b63041d1 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -5,6 +5,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * * IDENTIFICATION @@ -71,7 +72,16 @@ #include "utils/snapmgr.h" #include "mb/pg_wchar.h" - +#ifdef PGXC +#include "storage/procarray.h" +#include "pgxc/pgxc.h" +#include "access/gtm.h" +/* PGXC_COORD */ +#include "pgxc/planner.h" +#include "pgxc/datanode.h" +/* PGXC_DATANODE */ +#include "access/transam.h" +#endif extern int optind; extern char *optarg; @@ -185,6 +195,27 @@ static void SigHupHandler(SIGNAL_ARGS); static void log_disconnections(int code, Datum arg); +#ifdef PGXC /* PGXC_DATANODE */ +static void pgxc_transaction_stmt (Node *parsetree); +static List * pgxc_execute_direct (Node *parsetree, List *querytree_list, CommandDest dest, bool snapshot_set, bool *exec_on_coord); + +/* ---------------------------------------------------------------- + * PG-XC routines + * ---------------------------------------------------------------- + */ + +/* + * Called when the backend is ending. 
+ */ +static void +DataNodeShutdown (int code, Datum arg) +{ + /* Close connection with GTM, if active */ + if (IsAutoVacuumWorkerProcess()) + CloseGTM(); +} +#endif + /* ---------------------------------------------------------------- * routines to obtain user input * ---------------------------------------------------------------- @@ -398,6 +429,11 @@ SocketBackend(StringInfo inBuf) (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid frontend message type %d", qtype))); break; +#ifdef PGXC /* PGXC_DATANODE */ + case 'g': + case 's': + break; +#endif default: @@ -780,7 +816,6 @@ exec_simple_query(const char *query_string) bool isTopLevel; char msec_str[32]; - /* * Report query to various monitoring facilities. */ @@ -863,6 +898,22 @@ exec_simple_query(const char *query_string) Portal portal; DestReceiver *receiver; int16 format; +#ifdef PGXC + Query_Plan *query_plan; + Query_Step *query_step; + bool exec_on_coord; + + + /* + * By default we do not want data nodes to contact GTM directly, + * it should get this information passed down to it. 
+ */ + if (IS_PGXC_DATANODE) + SetForceXidFromGTM(false); + + exec_on_coord = true; + query_plan = NULL; +#endif /* * Get the command name for use in status display (it also becomes the @@ -917,15 +968,53 @@ exec_simple_query(const char *query_string) querytree_list = pg_analyze_and_rewrite(parsetree, query_string, NULL, 0); - plantree_list = pg_plan_queries(querytree_list, 0, NULL); +#ifdef PGXC /* PGXC_COORD */ + if (IS_PGXC_COORDINATOR) + { + if (IsA(parsetree, TransactionStmt)) + pgxc_transaction_stmt(parsetree); + + else if (IsA(parsetree, ExecDirectStmt)) + querytree_list = pgxc_execute_direct(parsetree, querytree_list, dest, snapshot_set, &exec_on_coord); + + else + { + query_plan = GetQueryPlan(parsetree, query_string, querytree_list); + + exec_on_coord = query_plan->exec_loc_type & EXEC_ON_COORD; + } + + /* First execute on the coordinator, if involved (DDL), then data nodes */ + } + + if ((IS_PGXC_COORDINATOR && exec_on_coord) || IS_PGXC_DATANODE) +#endif + plantree_list = pg_plan_queries(querytree_list, 0, NULL); /* Done with the snapshot used for parsing/planning */ +#ifdef PGXC + /* In PG-XC, hold on to it a bit longer */ +#else if (snapshot_set) PopActiveSnapshot(); +#endif /* If we got a cancel signal in analysis or planning, quit */ CHECK_FOR_INTERRUPTS(); +#ifdef PGXC + /* PGXC_DATANODE */ + /* Force getting Xid from GTM if not autovacuum, but a vacuum */ + if (IS_PGXC_DATANODE && IsA(parsetree, VacuumStmt) && IsPostmasterEnvironment) + SetForceXidFromGTM(true); + + /* PGXC_COORD */ + /* Force getting Xid from GTM if not autovacuum, but a vacuum */ + /* Skip the Portal stuff on coordinator if command only executes on data nodes */ + if ((IS_PGXC_COORDINATOR && exec_on_coord) || IS_PGXC_DATANODE) + { +#endif + /* * Create unnamed portal to run the query or queries in. If there * already is one, silently drop it. 
@@ -999,6 +1088,33 @@ exec_simple_query(const char *query_string) PortalDrop(portal, false); +#ifdef PGXC + } + + /* PGXC_COORD */ + /* If the coordinator ran ok, now run on the data nodes if planned */ + if (IS_PGXC_COORDINATOR) + { + if (query_plan && (query_plan->exec_loc_type & EXEC_ON_DATA_NODES)) + { + query_step = linitial(query_plan->query_step_list); + + DataNodeExec(query_step->sql_statement, + query_step->nodelist, + dest, + snapshot_set ? GetActiveSnapshot() : GetTransactionSnapshot(), + query_plan->force_autocommit, + query_step->simple_aggregates, + IsA(parsetree, SelectStmt)); + } + + FreeQueryPlan(query_plan); + } + + if (snapshot_set) + PopActiveSnapshot(); +#endif /* PGXC_COORD */ + if (IsA(parsetree, TransactionStmt)) { /* @@ -1029,6 +1145,11 @@ exec_simple_query(const char *query_string) */ CommandCounterIncrement(); } +#ifdef PGXC /* PGXC_COORD */ + /* In case of PGXC handling client already received a response */ + if ((IS_PGXC_COORDINATOR && exec_on_coord) || IS_PGXC_DATANODE) + { +#endif /* * Tell client that we're done with this query. Note we emit exactly @@ -1037,6 +1158,9 @@ exec_simple_query(const char *query_string) * aborted by error will not send an EndCommand report at all.) */ EndCommand(completionTag, dest); +#ifdef PGXC /* PGXC_COORD */ + } +#endif } /* end loop over parsetrees */ /* @@ -2868,6 +2992,14 @@ PostgresMain(int argc, char *argv[], const char *username) sigjmp_buf local_sigjmp_buf; volatile bool send_ready_for_query = true; +#ifdef PGXC /* PGXC_DATANODE */ + /* Snapshot info */ + int xmin; + int xmax; + int xcnt; + int *xip; +#endif + #define PendingConfigOption(name,val) \ (guc_names = lappend(guc_names, pstrdup(name)), \ guc_values = lappend(guc_values, pstrdup(val))) @@ -2948,7 +3080,11 @@ PostgresMain(int argc, char *argv[], const char *username) * postmaster/postmaster.c (the option sets should not conflict) and with * the common help() function in main/main.c. 
*/ +#ifdef PGXC + while ((flag = getopt(argc, argv, "A:B:Cc:D:d:EeFf:h:ijk:lN:nOo:Pp:r:S:sTt:v:W:Xy:-:")) != -1) +#else while ((flag = getopt(argc, argv, "A:B:c:D:d:EeFf:h:ijk:lN:nOo:Pp:r:S:sTt:v:W:y:-:")) != -1) +#endif { switch (flag) { @@ -2960,6 +3096,12 @@ PostgresMain(int argc, char *argv[], const char *username) SetConfigOption("shared_buffers", optarg, ctx, gucsource); break; +#ifdef PGXC + case 'C': + isPGXCCoordinator = true; + break; +#endif + case 'D': if (secure) userDoption = optarg; @@ -3082,7 +3224,11 @@ PostgresMain(int argc, char *argv[], const char *username) SetConfigOption("post_auth_delay", optarg, ctx, gucsource); break; - +#ifdef PGXC + case 'X': + isPGXCDataNode = true; + break; +#endif case 'y': /* @@ -3140,6 +3286,24 @@ PostgresMain(int argc, char *argv[], const char *username) } } +#ifdef PGXC + /* + * Make sure we specified the mode if Coordinator or Data Node. + * Allow for the exception of initdb by checking config option + */ + if (!IS_PGXC_COORDINATOR && !IS_PGXC_DATANODE && IsUnderPostmaster) + { + ereport(FATAL, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("PG-XC: must start as either a Coordinator (-C) or Data Node (-X)\n"))); + } + if (!IsPostmasterEnvironment) + { + /* Treat it as a data node for initdb to work properly */ + isPGXCDataNode = true; + } +#endif + /* * Process any additional GUC variable settings passed in startup packet. * These are handled exactly like command-line variables. @@ -3511,6 +3675,19 @@ PostgresMain(int argc, char *argv[], const char *username) if (!ignore_till_sync) send_ready_for_query = true; /* initially, or after error */ +#ifdef PGXC /* PGXC_COORD */ + if (IS_PGXC_COORDINATOR) + { + InitMultinodeExecutor(); + /* If we exit, first try and clean connections and send to pool */ + on_proc_exit (DataNodeCleanAndRelease, 0); + } + if (IS_PGXC_DATANODE) + { + /* If we exit, first try and clean connection to GTM */ + on_proc_exit (DataNodeShutdown, 0); + } +#endif /* * Non-error queries loop here. 
*/ @@ -3560,6 +3737,15 @@ PostgresMain(int argc, char *argv[], const char *username) } ReadyForQuery(whereToSendOutput); +#ifdef PGXC + /* + * Helps us catch any problems where we did not send down a snapshot + * when it was expected. + */ + if (IS_PGXC_DATANODE) + UnsetGlobalSnapshotData(); +#endif + send_ready_for_query = false; } @@ -3832,6 +4018,42 @@ PostgresMain(int argc, char *argv[], const char *username) * is still sending data. */ break; +#ifdef PGXC /* PGXC_DATANODE */ + case 'g': /* gxid */ + { + /* Set the GXID we were passed down */ + TransactionId gxid = (TransactionId) pq_getmsgint(&input_message, 4); + elog(DEBUG1, "Received new gxid %u", gxid); + SetNextTransactionId(gxid); + pq_getmsgend(&input_message); + } + break; + + case 's': /* snapshot */ + /* Set the snapshot we were passed down */ + xmin = pq_getmsgint(&input_message, 4); + xmax = pq_getmsgint(&input_message, 4); + RecentGlobalXmin = pq_getmsgint(&input_message, 4); + xcnt = pq_getmsgint(&input_message, 4); + if (xcnt > 0) + { + int i; + xip = malloc(xcnt * 4); + if (xip == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + for (i = 0; i < xcnt; i++) + xip[i] = pq_getmsgint(&input_message, 4); + } + else + xip = NULL; + pq_getmsgend(&input_message); + SetGlobalSnapshotData(xmin, xmax, xcnt, xip); + break; +#endif /* PGXC */ default: ereport(FATAL, @@ -4023,3 +4245,117 @@ log_disconnections(int code, Datum arg) port->user_name, port->database_name, port->remote_host, port->remote_port[0] ? 
" port=" : "", port->remote_port))); } + + +#ifdef PGXC +/* + * Handle transaction statements in PG-XC + */ +void +pgxc_transaction_stmt (Node *parsetree) +{ + Assert(IS_PGXC_COORDINATOR); + + + /* Handle transaction statements specially */ + if (IsA(parsetree, TransactionStmt)) + { + TransactionStmt *stmt = (TransactionStmt *) parsetree; + + switch (stmt->kind) + { + case TRANS_STMT_BEGIN: + /* + * This does not yet send down a BEGIN, + * we do that "on demand" as data nodes are added + */ + DataNodeBegin(); + break; + + case TRANS_STMT_COMMIT: + DataNodeCommit(DestNone); + break; + + case TRANS_STMT_ROLLBACK: + DataNodeRollback(DestNone); + break; + + default: + /* Ignore others for prototype */ + break; + } + } +} + + +/* + * Handle EXECUTE DIRECT + */ +List * +pgxc_execute_direct (Node *parsetree, List *querytree_list, CommandDest dest, bool snapshot_set, bool *exec_on_coord) +{ + List *node_list = NIL; + List *parsetree_list; + ListCell *node_cell; + ExecDirectStmt *execdirect = (ExecDirectStmt *) parsetree; + bool on_coord = execdirect->coordinator; + + + Assert(IS_PGXC_COORDINATOR); + Assert(IsA(parsetree, ExecDirectStmt)); + + foreach (node_cell, execdirect->nodes) + { + int node_int = intVal(lfirst(node_cell)); + node_list = lappend_int(node_list, node_int); + } + if (node_list) + if (DataNodeExec(execdirect->query, + node_list, + dest, + snapshot_set ? 
GetActiveSnapshot() : GetTransactionSnapshot(), + FALSE, + FALSE, + FALSE) != 0) + on_coord = false; + + if (on_coord) + { + /* + * Parse inner statement, like at the begiining of the function + * We do not have to release wrapper trees, the message context + * will be deleted later + * Also, no need to switch context - current is already + * the MessageContext + */ + parsetree_list = pg_parse_query(execdirect->query); + + /* We do not want to log or display the inner command */ + + /* + * we do not support complex commands (expanded to multiple + * parse trees) within EXEC DIRECT + */ + if (list_length(parsetree_list) != 1) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Can not execute %s with EXECUTE DIRECT", + execdirect->query))); + } + + /* + * Get parse tree from the list + */ + parsetree = (Node *) lfirst(list_head(parsetree_list)); + + /* + * Build new query tree */ + querytree_list = pg_analyze_and_rewrite(parsetree, + execdirect->query, NULL, 0); + } + *exec_on_coord = on_coord; + + return querytree_list; +} +#endif /* PGXC */ diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index f51f90f86b..28041c6305 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -2069,6 +2069,10 @@ CreateCommandTag(Node *parsetree) } } break; + + case T_ExecDirectStmt: + tag = "EXECUTE DIRECT"; + break; default: elog(WARNING, "unrecognized node type: %d", diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 775865d569..47ee10e682 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -5,6 +5,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * * IDENTIFICATION @@ -33,6 +34,9 @@ #include "access/heapam.h" #include "access/reloptions.h" #include "access/sysattr.h" 
+#ifdef PGXC +#include "access/transam.h" +#endif #include "access/xact.h" #include "catalog/catalog.h" #include "catalog/index.h" @@ -54,6 +58,9 @@ #include "optimizer/planmain.h" #include "optimizer/prep.h" #include "optimizer/var.h" +#ifdef PGXC +#include "pgxc/pgxc.h" +#endif #include "rewrite/rewriteDefine.h" #include "storage/fd.h" #include "storage/lmgr.h" @@ -856,6 +863,10 @@ RelationBuildDesc(Oid targetRelId, Relation oldrelation) else relation->trigdesc = NULL; +#ifdef PGXC + if (IS_PGXC_COORDINATOR && relation->rd_id >= FirstNormalObjectId) + RelationBuildLocator(relation); +#endif /* * if it's an index, initialize index-related information */ diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c index 922c4a626f..5b70df1924 100644 --- a/src/backend/utils/cache/syscache.c +++ b/src/backend/utils/cache/syscache.c @@ -5,6 +5,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * * IDENTIFICATION @@ -49,6 +50,9 @@ #include "catalog/pg_ts_template.h" #include "catalog/pg_type.h" #include "catalog/pg_user_mapping.h" +#ifdef PGXC +#include "catalog/pgxc_class.h" +#endif #include "utils/rel.h" #include "utils/syscache.h" @@ -524,6 +528,20 @@ static const struct cachedesc cacheinfo[] = { }, 64 }, +#ifdef PGXC + {PgxcClassRelationId, /* PGXCCLASSRELID */ + PgxcClassPgxcRelIdIndexId, + Anum_pgxc_class_pcrelid, + 1, + { + ObjectIdAttributeNumber, + 0, + 0, + 0 + }, + 1024 + }, +#endif {ProcedureRelationId, /* PROCNAMEARGSNSP */ ProcedureNameArgsNspIndexId, 0, diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index 7bdfb67204..7063f6f5f6 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -50,7 +50,6 @@ ProcessingMode Mode = InitProcessing; /* Note: we rely on this to initialize as zeroes */ 
static char socketLockFile[MAXPGPATH]; - /* ---------------------------------------------------------------- * ignoring system indexes support stuff * diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 210bd6ba6a..c9f0a63418 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -7,6 +7,7 @@ * * * Copyright (c) 2000-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * Written by Peter Eisentraut <[email protected]>. * * IDENTIFICATION @@ -27,6 +28,9 @@ #endif #include "access/gin.h" +#ifdef PGXC +#include "access/gtm.h" +#endif #include "access/transam.h" #include "access/twophase.h" #include "access/xact.h" @@ -50,6 +54,11 @@ #include "parser/parse_type.h" #include "parser/scansup.h" #include "pgstat.h" +#ifdef PGXC +#include "pgxc/locator.h" +#include "pgxc/planner.h" +#include "pgxc/poolmgr.h" +#endif #include "postmaster/autovacuum.h" #include "postmaster/bgwriter.h" #include "postmaster/postmaster.h" @@ -532,6 +541,12 @@ const char *const config_group_names[] = gettext_noop("Customized Options"), /* DEVELOPER_OPTIONS */ gettext_noop("Developer Options"), +#ifdef PGXC + /* DATA_NODES */ + gettext_noop("Data Nodes and Connection Pooling"), + /* GTM */ + gettext_noop("GTM Connection"), +#endif /* help_config wants this array to be null-terminated */ NULL }; @@ -1220,7 +1235,38 @@ static struct config_bool ConfigureNamesBool[] = &IgnoreSystemIndexes, false, NULL, NULL }, - +#ifdef PGXC + { + {"persistent_datanode_connections", PGC_BACKEND, DEVELOPER_OPTIONS, + gettext_noop("Session never releases acquired connections."), + NULL, + GUC_NOT_IN_SAMPLE + }, + &PersistentConnections, + false, NULL, NULL + }, + { + {"strict_statement_checking", PGC_USERSET, DEVELOPER_OPTIONS, + gettext_noop("Forbid statements that are not safe for the cluster"), + NULL + }, + &StrictStatementChecking, + true, NULL, NULL + }, + { + /* + * This is temporary work-around 
until we allow for a merge-sort of + * ORDER BY. + */ + {"strict_select_checking", PGC_USERSET, DEVELOPER_OPTIONS, + gettext_noop("Forbid if SELECT has ORDER BY"), + gettext_noop("and is not safe for the cluster"), + GUC_NOT_IN_SAMPLE + }, + &StrictSelectChecking, + false, NULL, NULL + }, +#endif /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL @@ -1255,7 +1301,7 @@ static struct config_int ConfigureNamesInt[] = gettext_noop("This applies to table columns that have not had a " "column-specific target set via ALTER TABLE SET STATISTICS.") }, - &default_statistics_target, + &default_statistics_target, 100, 1, 10000, NULL, NULL }, { @@ -1504,7 +1550,11 @@ static struct config_int ConfigureNamesInt[] = NULL }, &max_prepared_xacts, +#ifdef PGXC + 10, 0, INT_MAX / 4, NULL, NULL +#else 0, 0, INT_MAX / 4, NULL, NULL +#endif }, #ifdef LOCK_DEBUG @@ -1951,7 +2001,63 @@ static struct config_int ConfigureNamesInt[] = &pgstat_track_activity_query_size, 1024, 100, 102400, NULL, NULL }, +#ifdef PGXC + { + {"num_data_nodes", PGC_POSTMASTER, DATA_NODES, + gettext_noop("Number of data nodes."), + NULL + }, + &NumDataNodes, + 2, 1, 65535, NULL, NULL + }, + { + {"min_pool_size", PGC_POSTMASTER, DATA_NODES, + gettext_noop("Initial pool size."), + gettext_noop("If number of active connections decreased below this value, " + "new connections are established") + }, + &MinPoolSize, + 1, 1, 65535, NULL, NULL + }, + + { + {"max_pool_size", PGC_POSTMASTER, DATA_NODES, + gettext_noop("Max pool size."), + gettext_noop("If number of active connections reaches this value, " + "other connection requests will be refused") + }, + &MaxPoolSize, + 100, 1, 65535, NULL, NULL + }, + + { + {"pooler_port", PGC_POSTMASTER, DATA_NODES, + gettext_noop("Port of the Pool Manager."), + NULL + }, + &PoolerPort, + 6667, 1, 65535, NULL, NULL + }, + + { + {"gtm_port", PGC_POSTMASTER, GTM, + gettext_noop("Port of GTM."), + NULL + }, + &GtmPort, + 6666, 1, 65535, NULL, NULL + }, + + { + 
{"gtm_coordinator_id", PGC_POSTMASTER, GTM, + gettext_noop("The Coordinator Identifier."), + NULL + }, + &GtmCoordinatorId, + 1, 1, INT_MAX, NULL, NULL + }, +#endif /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL @@ -2502,6 +2608,65 @@ static struct config_string ConfigureNamesString[] = "pg_catalog.simple", assignTSCurrentConfig, NULL }, +#ifdef PGXC + { + {"preferred_data_nodes", PGC_POSTMASTER, DATA_NODES, + gettext_noop("Preferred data nodes."), + gettext_noop("A list of data nodes to read from replicated tables") + }, + &PreferredDataNodes, + "", NULL, NULL + }, + + { + {"data_node_hosts", PGC_POSTMASTER, DATA_NODES, + gettext_noop("Host names or addresses of data nodes."), + gettext_noop("Comma separated list or single value, " + "if all data nodes on the same host") + }, + &DataNodeHosts, + "localhost", NULL, NULL + }, + + { + {"data_node_ports", PGC_POSTMASTER, DATA_NODES, + gettext_noop("Port numbers of data nodes."), + gettext_noop("Comma separated list or single value, " + "if all data nodes listen on the same port") + }, + &DataNodePorts, + "15432,25432", NULL, NULL + }, + + { + {"data_node_users", PGC_POSTMASTER, DATA_NODES, + gettext_noop("User names or addresses of data nodes."), + gettext_noop("Comma separated list or single value, " + "if user names are the same on all data nodes") + }, + &DataNodeUsers, + "postgres", NULL, NULL + }, + + { + {"data_node_passwords", PGC_POSTMASTER, DATA_NODES, + gettext_noop("Passwords of data nodes."), + gettext_noop("Comma separated list or single value, " + "if passwords are the same on all data nodes") + }, + &DataNodePwds, + "postgres", NULL, NULL + }, + + { + {"gtm_host", PGC_POSTMASTER, GTM, + gettext_noop("Host name or address of GTM"), + NULL + }, + &GtmHost, + "localhost", NULL, NULL + }, +#endif #ifdef USE_SSL { {"ssl_ciphers", PGC_POSTMASTER, CONN_AUTH_SECURITY, diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 
3f7b43f0cc..e46670cd91 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -106,7 +106,7 @@ #shared_buffers = 32MB # min 128kB # (change requires restart) #temp_buffers = 8MB # min 800kB -#max_prepared_transactions = 0 # zero disables the feature +#max_prepared_transactions = 10 # zero disables the feature # (change requires restart) # Note: Increasing max_prepared_transactions costs ~600 bytes of shared memory # per transaction slot, plus lock space (see max_locks_per_transaction). @@ -490,9 +490,61 @@ #transform_null_equals = off +#------------------------------------------------------------------------------ +# DATA NODES AND CONNECTION POOLING +#------------------------------------------------------------------------------ + +#pooler_port = 6667 # Pool Manager TCP port + # (change requires restart) +#num_data_nodes = 2 # Number of Data Nodes + # (change requires restart) +#preferred_data_nodes = '' # List of preferred Data Nodes to read from + # replicated tables. If empty, use all the data nodes + # (change requires restart) +#min_pool_size = 1 # Initial pool size + # (change requires restart) +#max_pool_size = 100 # Maximum pool size + # (change requires restart) +#persistent_datanode_connections = off # Set persistent connection mode for pooler + # if set to on, connections taken for coordinator + # are not put back to pool +#data_node_hosts = 'localhost' # Host names or addresses of data nodes + # (change requires restart) +#data_node_ports = '15432,25432' # Port numbers of data nodes + # (change requires restart) +#data_node_users = 'postgres' # User names of data nodes + # (change requires restart) +#data_node_passwords = 'postgres' # Passwords of data nodes + # (change requires restart) +# Note: each data_node_... 
value should be either a single value if respective +# parameter is the same on all nodes or a comma-separated list, with number of +# entries not less than number of nodes and each entry is a value for node with +# respective number between 1 and num_data_nodes. If list is longer than +# num_data_nodes extra values are ignored. #------------------------------------------------------------------------------ +# GTM CONNECTION +#------------------------------------------------------------------------------ + +#gtm_host = 'localhost' # Host name or address of GTM + # (change requires restart) +#gtm_port = 6666 # Port of GTM + # (change requires restart) +#gtm_coordinator_id = 1 # Coordinator identifier + # (change requires restart) + +##------------------------------------------------------------------------------ +# OTHER PG-XC OPTIONS +#------------------------------------------------------------------------------ +#strict_statement_checking = on # Forbid PG-XC-unsafe SQL + # Enabling is useful for development +#strict_select_checking = off # Temporary; be strict about allowing + # multi-node ORDER BY + + +##------------------------------------------------------------------------------ # CUSTOMIZED OPTIONS #------------------------------------------------------------------------------ #custom_variable_classes = '' # list of custom variable class names + diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 3fc9b3880e..0677b09660 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -40,6 +40,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * Portions taken from FreeBSD. 
* * $PostgreSQL: pgsql/src/bin/initdb/initdb.c,v 1.172 2009/06/11 14:49:07 momjian Exp $ @@ -62,6 +63,7 @@ #include "getopt_long.h" #include "miscadmin.h" +#include "postgres.h" /* * these values are passed in by makefile defines @@ -3179,14 +3181,34 @@ main(int argc, char *argv[]) strcpy(bin_dir, argv[0]); get_parent_directory(bin_dir); - printf(_("\nSuccess. You can now start the database server using:\n\n" - " %s%s%spostgres%s -D %s%s%s\n" + +#ifdef PGXC + printf(_("\nSuccess.\n You can now start the database server of the Postgres-XC coordinator using:\n\n" + " %s%s%spostgres%s -C -D %s%s%s\n" "or\n" - " %s%s%spg_ctl%s -D %s%s%s -l logfile start\n\n"), + " %s%s%spg_ctl%s start -D %s%s%s -S coordinator -l logfile\n\n" + " You can now start the database server of the Postgres-XC datanode using:\n\n" + " %s%s%spostgres%s -X -D %s%s%s\n" + "or \n" + " %s%s%spg_ctl%s start -D %s%s%s -S datanode -l logfile\n\n"), + QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH, + QUOTE_PATH, pg_data_native, QUOTE_PATH, + QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH, + QUOTE_PATH, pg_data_native, QUOTE_PATH, QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH, QUOTE_PATH, pg_data_native, QUOTE_PATH, QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH, QUOTE_PATH, pg_data_native, QUOTE_PATH); +#else + printf(_("\nSuccess. You can now start the database server of datanode using:\n\n" + " %s%s%spostgres%s -D %s%s%s\n" + "or\n" + " %s%s%spg_ctl%s -D %s%s%s -l logfile start\n\n"), + QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH, + QUOTE_PATH, pg_data_native, QUOTE_PATH, + QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? 
DIR_SEP : "", QUOTE_PATH, + QUOTE_PATH, pg_data_native, QUOTE_PATH); +#endif return 0; } diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c index 40ede2c1a8..3e06bd4132 100644 --- a/src/bin/pg_ctl/pg_ctl.c +++ b/src/bin/pg_ctl/pg_ctl.c @@ -3,6 +3,7 @@ * pg_ctl --- start/stops/restarts the PostgreSQL server * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * $PostgreSQL: pgsql/src/bin/pg_ctl/pg_ctl.c,v 1.111 2009/06/11 14:49:07 momjian Exp $ * @@ -58,8 +59,8 @@ typedef enum { NO_COMMAND = 0, START_COMMAND, - STOP_COMMAND, RESTART_COMMAND, + STOP_COMMAND, RELOAD_COMMAND, STATUS_COMMAND, KILL_COMMAND, @@ -88,6 +89,9 @@ static char *register_username = NULL; static char *register_password = NULL; static char *argv0 = NULL; static bool allow_core_files = false; +#ifdef PGXC +static char *pgxcCommand = NULL; +#endif static void write_stderr(const char *fmt,...) @@ -357,12 +361,23 @@ start_postmaster(void) * everything to a shell to process them. 
*/ if (log_file != NULL) +#ifdef PGXC + snprintf(cmd, MAXPGPATH, SYSTEMQUOTE "\"%s\" %s %s%s < \"%s\" >> \"%s\" 2>&1 &" SYSTEMQUOTE, + postgres_path, pgxcCommand, pgdata_opt, post_opts, + DEVNULL, log_file); +#else snprintf(cmd, MAXPGPATH, SYSTEMQUOTE "\"%s\" %s%s < \"%s\" >> \"%s\" 2>&1 &" SYSTEMQUOTE, postgres_path, pgdata_opt, post_opts, DEVNULL, log_file); +#endif else +#ifdef PGXC + snprintf(cmd, MAXPGPATH, SYSTEMQUOTE "\"%s\" %s %s%s < \"%s\" 2>&1 &" SYSTEMQUOTE, + postgres_path, pgxcCommand, pgdata_opt, post_opts, DEVNULL); +#else snprintf(cmd, MAXPGPATH, SYSTEMQUOTE "\"%s\" %s%s < \"%s\" 2>&1 &" SYSTEMQUOTE, postgres_path, pgdata_opt, post_opts, DEVNULL); +#endif return system(cmd); #else /* WIN32 */ @@ -1520,16 +1535,22 @@ do_help(void) printf(_("%s is a utility to start, stop, restart, reload configuration files,\n" "report the status of a PostgreSQL server, or signal a PostgreSQL process.\n\n"), progname); printf(_("Usage:\n")); +#ifdef PGXC + printf(_(" %s start [-w] [-t SECS] [-S NODE-TYPE] [-D DATADIR] [-s] [-l FILENAME] [-o \"OPTIONS\"]\n"), progname); + printf(_(" %s restart [-w] [-t SECS] [-S NODE-TYPE] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n" + " [-o \"OPTIONS\"]\n"), progname); +#else printf(_(" %s start [-w] [-t SECS] [-D DATADIR] [-s] [-l FILENAME] [-o \"OPTIONS\"]\n"), progname); - printf(_(" %s stop [-W] [-t SECS] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n"), progname); printf(_(" %s restart [-w] [-t SECS] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n" - " [-o \"OPTIONS\"]\n"), progname); + " [-o \"OPTIONS\"]\n"), progname); +#endif + printf(_(" %s stop [-W] [-t SECS] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n"), progname); printf(_(" %s reload [-D DATADIR] [-s]\n"), progname); printf(_(" %s status [-D DATADIR]\n"), progname); printf(_(" %s kill SIGNALNAME PID\n"), progname); #if defined(WIN32) || defined(__CYGWIN__) printf(_(" %s register [-N SERVICENAME] [-U USERNAME] [-P PASSWORD] [-D DATADIR]\n" - " [-w] [-t SECS] [-o \"OPTIONS\"]\n"), progname); + " 
[-w] [-t SECS] [-o \"OPTIONS\"]\n"), progname); printf(_(" %s unregister [-N SERVICENAME]\n"), progname); #endif @@ -1537,6 +1558,9 @@ do_help(void) printf(_(" -D, --pgdata DATADIR location of the database storage area\n")); printf(_(" -s, --silent only print errors, no informational messages\n")); printf(_(" -t SECS seconds to wait when using -w option\n")); +#ifdef PGXC + printf(_(" -S NODE-TYPE can be \"coordinator\" or \"datanode\" (Postgres-XC)\n")); +#endif printf(_(" -w wait until operation completes\n")); printf(_(" -W do not wait until operation completes\n")); printf(_(" --help show this help, then exit\n")); @@ -1715,7 +1739,11 @@ main(int argc, char **argv) /* process command-line options */ while (optind < argc) { +#ifdef PGXC + while ((c = getopt_long(argc, argv, "cD:l:m:N:o:p:P:S:st:U:wW", long_options, &option_index)) != -1) +#else while ((c = getopt_long(argc, argv, "cD:l:m:N:o:p:P:st:U:wW", long_options, &option_index)) != -1) +#endif { switch (c) { @@ -1759,6 +1787,13 @@ main(int argc, char **argv) case 'P': register_password = xstrdup(optarg); break; +#ifdef PGXC + case 'S': + if (strcmp(optarg, "coordinator") == 0) + pgxcCommand = strdup("-C"); + else if (strcmp(optarg, "datanode") == 0) + pgxcCommand = strdup("-X"); +#endif case 's': silent_mode = true; break; @@ -1808,13 +1843,12 @@ main(int argc, char **argv) do_advice(); exit(1); } - if (strcmp(argv[optind], "start") == 0) ctl_command = START_COMMAND; - else if (strcmp(argv[optind], "stop") == 0) - ctl_command = STOP_COMMAND; else if (strcmp(argv[optind], "restart") == 0) ctl_command = RESTART_COMMAND; + else if (strcmp(argv[optind], "stop") == 0) + ctl_command = STOP_COMMAND; else if (strcmp(argv[optind], "reload") == 0) ctl_command = RELOAD_COMMAND; else if (strcmp(argv[optind], "status") == 0) @@ -1856,6 +1890,18 @@ main(int argc, char **argv) exit(1); } +#ifdef PGXC + /* stop command does not need to have coordinator or datanode options */ + if ((ctl_command == START_COMMAND || 
ctl_command == RESTART_COMMAND) + && !pgxcCommand) + { + write_stderr(_("%s: coordinator or datanode option not specified (-S)\n"), + progname); + do_advice(); + exit(1); + } +#endif + /* Note we put any -D switch into the env var above */ pg_data = getenv("PGDATA"); if (pg_data) @@ -1912,12 +1958,12 @@ main(int argc, char **argv) case START_COMMAND: do_start(); break; - case STOP_COMMAND: - do_stop(); - break; case RESTART_COMMAND: do_restart(); break; + case STOP_COMMAND: + do_stop(); + break; case RELOAD_COMMAND: do_reload(); break; diff --git a/src/gtm/Makefile b/src/gtm/Makefile new file mode 100644 index 0000000000..51c55e0dd5 --- /dev/null +++ b/src/gtm/Makefile @@ -0,0 +1,43 @@ +#------------------------------------------------------------------------- +# +# Makefile for src/gtm +# GTM and GTM proxy +# +# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation +#------------------------------------------------------------------------- + +PGFILEDESC = "gtm - Global Transaction Manager for Postgres-XC" +subdir = src/gtm +top_builddir = ../.. 
+include $(top_builddir)/src/Makefile.global + +WANTED_DIRS=common path libpq client main proxy gtm_ctl + +all: + @for dir in $(WANTED_DIRS); do \ + $(MAKE) -C $$dir $@ || exit; \ + done + +clobber: + @for dir in $(WANTED_DIRS); do \ + $(MAKE) -C $$dir $@ || exit; \ + done + +clean: + @for dir in $(WANTED_DIRS); do \ + $(MAKE) -C $$dir $@ || exit; \ + done + +distclean: clean + +maintainer-clean: distclean + +install: all + $(INSTALL_PROGRAM) ./main/gtm$(X) '$(DESTDIR)$(bindir)/gtm$(X)' + $(INSTALL_PROGRAM) ./gtm_ctl/gtm_ctl$(X) '$(DESTDIR)$(bindir)/gtm_ctl$(X)' + $(INSTALL_PROGRAM) ./proxy/gtm_proxy$(X) '$(DESTDIR)$(bindir)/gtm_proxy$(X)' + +uninstall: + rm -f $(DESTDIR)$(bindir)/gtm$(X) + rm -f $(DESTDIR)$(bindir)/gtm_ctl$(X) + rm -f $(DESTDIR)$(bindir)/gtm_proxy$(X) diff --git a/src/gtm/Makefile.global b/src/gtm/Makefile.global new file mode 100644 index 0000000000..f130bdbd7f --- /dev/null +++ b/src/gtm/Makefile.global @@ -0,0 +1,116 @@ + +########################################################################## +# +# Meta configuration +# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + +.PHONY: all install install-strip installdirs uninstall clean distclean check installcheck +.SILENT: installdirs + +# make `all' the default target +all: + +# Delete target files if the command fails after it has +# started to update the file. 
+.DELETE_ON_ERROR: + +# PostgreSQL version number +VERSION = 1.0Beta +MAJORVERSION = 1.0 + +top_srcdir=$(top_build_dir) + +enable_shared = yes + +########################################################################## +# +# Programs and flags + +# Compilers + +CPP = gcc -E +CPPFLAGS = -D_GNU_SOURCE + +override CPPFLAGS := -I$(top_srcdir)/include $(CPPFLAGS) + +CC = gcc +GCC = yes +SUN_STUDIO_CC = no +CFLAGS = $(DEBUGFLAGS) -Wall -Wmissing-prototypes -Wpointer-arith -Wdeclaration-after-statement -Wendif-labels -fno-strict-aliasing -fwrapv + +# Kind-of compilers + +BISON = bison +BISONFLAGS = $(YFLAGS) +FLEX = /usr/bin/flex +FLEXFLAGS = $(LFLAGS) +DTRACE = +DTRACEFLAGS = +ZIC = + +# Linking + +AR = ar +DLLTOOL = +DLLWRAP = +LIBS = -lz -lreadline -lcrypt -ldl -lm -lpthread +LDAP_LIBS_FE = +LDAP_LIBS_BE = +OSSP_UUID_LIBS = +LD = /usr/bin/ld +with_gnu_ld = yes +ld_R_works = +LDFLAGS = -Wl,--as-needed +LDFLAGS_SL = +LDREL = -r +LDOUT = -o +RANLIB = ranlib +WINDRES = +X = + +# Perl + +# quoted for pathname with spaces +PERL = "/usr/bin/perl" +perl_archlibexp = +perl_privlibexp = +perl_useshrplib = +perl_embed_ldflags = + +# Miscellaneous + +AWK = gawk +LN_S = ln -s +MSGFMT = +MSGMERGE = +PYTHON = +TAR = /bin/tar +XGETTEXT = + +GZIP = gzip +BZIP2 = bzip2 + +PL_TESTDB = pl_regression +CONTRIB_TESTDB = contrib_regression + + + +########################################################################## +# +# Additional platform-specific settings +# + +# Name of the "template" +PORTNAME= linux + + +# Set up rpath if enabled. By default it will point to our libdir, +# but individual Makefiles can force other rpath paths if needed. 
+rpathdir = $(libdir) + +ifeq ($(enable_rpath), yes) +LDFLAGS += $(rpath) +endif + +include $(top_build_dir)/gtm/Makefile.port + diff --git a/src/gtm/Makefile.port b/src/gtm/Makefile.port new file mode 100644 index 0000000000..611c8b7766 --- /dev/null +++ b/src/gtm/Makefile.port @@ -0,0 +1,16 @@ +# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + +AROPT = crs +export_dynamic = -Wl,-E +rpath = -Wl,-rpath,'$(rpathdir)' +allow_nonpic_in_shlib = yes +DLSUFFIX = .so + +ifeq "$(findstring sparc,$(host_cpu))" "sparc" +CFLAGS_SL = -fPIC +else +CFLAGS_SL = -fpic +endif + +%.so: %.o + $(CC) $(CFLAGS) -shared -o $@ $< diff --git a/src/gtm/Makefile.shlib b/src/gtm/Makefile.shlib new file mode 100644 index 0000000000..83aca3896b --- /dev/null +++ b/src/gtm/Makefile.shlib @@ -0,0 +1,556 @@ +#------------------------------------------------------------------------- +# +# Makefile.shlib +# Common rules for building shared libraries +# +# Copyright (c) 1998, Regents of the University of California +# Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation +# +# IDENTIFICATION +# $PostgreSQL: pgsql/src/Makefile.shlib,v 1.119 2008/12/11 07:34:07 petere Exp $ +# +#------------------------------------------------------------------------- + +# This file should be included by any Postgres module Makefile that +# wants to build a shared library (if possible for the current +# platform). A static library is also built from the same object +# files. Only one library can be built per makefile. 
+# +# Before including this file, the module Makefile must define these +# variables: +# +# NAME Name of library to build (no suffix nor "lib" prefix) +# OBJS List of object files to include in library +# SHLIB_LINK If shared library relies on other libraries, +# additional stuff to put in its link command +# SHLIB_EXPORTS (optional) Name of file containing list of symbols to +# export +# +# When building a shared library, the following version information +# must also be set. It should be omitted when building a dynamically +# loadable module. +# +# SO_MAJOR_VERSION Major version number to use for shared library +# SO_MINOR_VERSION Minor version number to use for shared library +# (If you want a patchlevel, include it in SO_MINOR_VERSION, e.g., "6.2".) +# +# Optional flags when building DLL's (only applicable to win32 and cygwin +# platforms). +# DLLTOOL_DEFFLAGS Additional flags when creating the dll .def file +# DLLTOOL_LIBFLAGS Additional flags when creating the lib<module>.a file +# DLLWRAP_FLAGS Additional flags to dllwrap +# +# The module Makefile must also include +# $(top_builddir)/src/Makefile.global before including this file. +# (Makefile.global sets PORTNAME and other needed symbols.) +# +# This makefile provides the following (phony) targets: +# +# all-lib build the static and shared (if applicable) libraries +# install-lib install the libraries into $(libdir) +# installdirs-lib create installation directory $(libdir) +# uninstall-lib remove the libraries from $(libdir) +# clean-lib delete the static and shared libraries from the build dir +# maintainer-clean-lib delete .def files built for win32 +# +# Since `all-lib' is the first rule in this file you probably want to +# have the `all' target before including this file. In the most simple +# case it would look like this: +# +# all: all-lib +# +# Similarly, the install rule might look like +# +# install: install-lib +# +# plus any additional things you want to install. Et cetera. +# +# Got that? 
Look at src/interfaces/libpq/Makefile for an example. +# +# While the linker allows creation of most shared libraries, +# -Bsymbolic requires resolution of all symbols, making the +# compiler a better choice for shared library creation on ELF platforms. +# With the linker, -Bsymbolic requires the crt1.o startup object file. +# bjm 2001-02-10 + + +COMPILER = $(CC) $(CFLAGS) +LINK.static = $(AR) $(AROPT) + + + +# Insert -L from LDFLAGS after any -L already present in SHLIB_LINK +SHLIB_LINK := $(filter -L%, $(SHLIB_LINK)) $(filter -L%, $(LDFLAGS)) $(filter-out -L%, $(SHLIB_LINK)) + +# Need a -L-free version of LDFLAGS to use in combination with SHLIB_LINK +LDFLAGS_NO_L = $(filter-out -L%, $(LDFLAGS)) + +ifdef SO_MAJOR_VERSION +# Default library naming convention used by the majority of platforms +ifeq ($(enable_shared), yes) +shlib = lib$(NAME)$(DLSUFFIX).$(SO_MAJOR_VERSION).$(SO_MINOR_VERSION) +shlib_major = lib$(NAME)$(DLSUFFIX).$(SO_MAJOR_VERSION) +shlib_bare = lib$(NAME)$(DLSUFFIX) +endif +# Testing the soname variable is a reliable way to determine whether a +# linkable library is being built. +soname = $(shlib_major) +else +# Naming convention for dynamically loadable modules +ifeq ($(enable_shared), yes) +shlib = $(NAME)$(DLSUFFIX) +endif +endif +stlib = lib$(NAME).a + +ifndef soname +# additional flags for backend modules +SHLIB_LINK := $(BE_DLLLIBS) $(SHLIB_LINK) +endif + +# For each platform we support shared libraries on, set shlib to the +# name of the library (if default above is not right), set +# LINK.shared to the command to link the library, +# and adjust SHLIB_LINK if necessary. + +# Try to keep the sections in some kind of order, folks... 
+ +override CFLAGS += $(CFLAGS_SL) +ifdef SO_MAJOR_VERSION +# libraries ought to use this to refer to versioned gettext domain names +override CPPFLAGS += -DSO_MAJOR_VERSION=$(SO_MAJOR_VERSION) +endif + +ifeq ($(PORTNAME), aix) + ifdef SO_MAJOR_VERSION + shlib = lib$(NAME)$(DLSUFFIX).$(SO_MAJOR_VERSION) + endif + haslibarule = yes + exports_file = lib$(NAME).exp +endif + +ifeq ($(PORTNAME), darwin) + ifdef soname + # linkable library + DLSUFFIX = .dylib + ifneq ($(SO_MAJOR_VERSION), 0) + version_link = -compatibility_version $(SO_MAJOR_VERSION) -current_version $(SO_MAJOR_VERSION).$(SO_MINOR_VERSION) + endif + LINK.shared = $(COMPILER) -dynamiclib -install_name $(libdir)/lib$(NAME).$(SO_MAJOR_VERSION)$(DLSUFFIX) $(version_link) $(exported_symbols_list) -multiply_defined suppress + shlib = lib$(NAME).$(SO_MAJOR_VERSION).$(SO_MINOR_VERSION)$(DLSUFFIX) + shlib_major = lib$(NAME).$(SO_MAJOR_VERSION)$(DLSUFFIX) + else + # loadable module + DLSUFFIX = .so + LINK.shared = $(COMPILER) -bundle -multiply_defined suppress + endif + BUILD.exports = $(AWK) '/^[^\#]/ {printf "_%s\n",$$1}' $< >$@ + exports_file = $(SHLIB_EXPORTS:%.txt=%.list) + ifneq (,$(exports_file)) + exported_symbols_list = -exported_symbols_list $(exports_file) + endif +endif + +ifeq ($(PORTNAME), openbsd) + ifdef ELF_SYSTEM + LINK.shared = $(COMPILER) -shared + ifdef soname + LINK.shared += -Wl,-x,-soname,$(soname) + endif + SHLIB_LINK += -lc + else + LINK.shared = $(LD) -x -Bshareable -Bforcearchive + endif +endif + +ifeq ($(PORTNAME), bsdi) + ifeq ($(DLSUFFIX), .so) + LINK.shared = $(COMPILER) -shared + ifdef soname + LINK.shared += -Wl,-x,-soname,$(soname) + endif + SHLIB_LINK += -lc + endif + ifeq ($(DLSUFFIX), .o) + LINK.shared = shlicc -O $(LDREL) + endif +endif + +ifeq ($(PORTNAME), freebsd) + ifdef ELF_SYSTEM + ifdef SO_MAJOR_VERSION + shlib = lib$(NAME)$(DLSUFFIX).$(SO_MAJOR_VERSION) + endif + LINK.shared = $(COMPILER) -shared + ifdef soname + LINK.shared += -Wl,-x,-soname,$(soname) + endif + else 
+ ifdef SO_MAJOR_VERSION + shlib = lib$(NAME)$(DLSUFFIX).$(SO_MAJOR_VERSION).$(SO_MINOR_VERSION) + endif + LINK.shared = $(LD) -x -Bshareable -Bforcearchive + endif +endif + +ifeq ($(PORTNAME), netbsd) + ifdef ELF_SYSTEM + LINK.shared = $(COMPILER) -shared + ifdef soname + LINK.shared += -Wl,-x,-soname,$(soname) + endif + else + LINK.shared = $(LD) -x -Bshareable -Bforcearchive + endif +endif + +ifeq ($(PORTNAME), hpux) + ifdef SO_MAJOR_VERSION + shlib = lib$(NAME)$(DLSUFFIX).$(SO_MAJOR_VERSION) + endif + ifeq ($(with_gnu_ld), yes) + LINK.shared = $(CC) $(LDFLAGS_NO_L) -shared + ifdef soname + LINK.shared += -Wl,-h -Wl,$(soname) + endif + else + # can't use the CC-syntax rpath pattern here + rpath = + LINK.shared = $(LD) -b + ifdef soname + LINK.shared += +h $(soname) + endif + ifeq ($(enable_rpath), yes) + LINK.shared += +b '$(rpathdir)' + endif + # On HPUX platforms, gcc is usually configured to search for libraries + # in /usr/local/lib, but ld won't do so. Add an explicit -L switch so + # ld can find the same libraries gcc does. Make sure it goes after any + # -L switches provided explicitly. 
+ ifeq ($(GCC), yes) + SHLIB_LINK := $(filter -L%, $(SHLIB_LINK)) -L/usr/local/lib $(filter-out -L%, $(SHLIB_LINK)) + endif + endif + # do this last so above filtering doesn't pull out -L switches in LDFLAGS + ifeq ($(GCC), yes) + SHLIB_LINK += `$(CC) $(LDFLAGS) -print-libgcc-file-name` + endif +endif + +ifeq ($(PORTNAME), irix) + ifdef SO_MAJOR_VERSION + shlib = lib$(NAME)$(DLSUFFIX).$(SO_MAJOR_VERSION) + endif + LINK.shared = $(COMPILER) -shared + ifdef soname + LINK.shared += -Wl,-set_version,sgi$(SO_MAJOR_VERSION).$(SO_MINOR_VERSION) + endif +endif + +ifeq ($(PORTNAME), linux) + LINK.shared = $(COMPILER) -shared + ifdef soname + LINK.shared += -Wl,-soname,$(soname) + endif + BUILD.exports = ( echo '{ global:'; $(AWK) '/^[^\#]/ {printf "%s;\n",$$1}' $<; echo ' local: *; };' ) >$@ + exports_file = $(SHLIB_EXPORTS:%.txt=%.list) + ifneq (,$(exports_file)) + LINK.shared += -Wl,--version-script=$(exports_file) + endif +endif + +ifeq ($(PORTNAME), solaris) + ifeq ($(GCC), yes) + LINK.shared = $(COMPILER) -shared + else + LINK.shared = $(COMPILER) -G + endif + ifdef soname + ifeq ($(with_gnu_ld), yes) + LINK.shared += -Wl,-soname,$(soname) + else + LINK.shared += -h $(soname) + endif + endif +endif + +ifeq ($(PORTNAME), sunos4) + LINK.shared = $(LD) -assert pure-text -Bdynamic +endif + +ifeq ($(PORTNAME), osf) + LINK.shared = $(LD) -shared -expect_unresolved '*' +endif + +ifeq ($(PORTNAME), sco) + ifeq ($(GCC), yes) + LINK.shared = $(CC) -shared + else + LINK.shared = $(CC) -G + endif + LINK.shared += -Wl,-z,text + ifdef soname + LINK.shared += -Wl,-h,$(soname) + endif +endif + +ifeq ($(PORTNAME), svr4) + LINK.shared = $(LD) -G +endif + +ifeq ($(PORTNAME), univel) + LINK.shared = $(LD) -G -z text +endif + +ifeq ($(PORTNAME), unixware) + ifeq ($(GCC), yes) + LINK.shared = $(CC) -shared + else + LINK.shared = $(CC) -G + endif + LINK.shared += -Wl,-z,text + ifdef soname + LINK.shared += -Wl,-h,$(soname) + endif +endif + +ifeq ($(PORTNAME), cygwin) + ifdef SO_MAJOR_VERSION 
+ shlib = cyg$(NAME)$(DLSUFFIX) + endif + haslibarule = yes +endif + +ifeq ($(PORTNAME), win32) + ifdef SO_MAJOR_VERSION + shlib = lib$(NAME)$(DLSUFFIX) + endif + haslibarule = yes +endif + +ifeq ($(enable_rpath), yes) +SHLIB_LINK += $(rpath) +endif + + + +## +## BUILD +## + +.PHONY: all-lib all-static-lib all-shared-lib + +all-lib: all-shared-lib +ifdef soname +# no static library when building a dynamically loadable module +all-lib: all-static-lib +endif + +all-static-lib: $(stlib) + +all-shared-lib: $(shlib) + +ifndef haslibarule +$(stlib): $(OBJS) + $(LINK.static) $@ $^ + $(RANLIB) $@ +endif #haslibarule + +ifeq ($(enable_shared), yes) + +ifeq (,$(filter cygwin win32,$(PORTNAME))) +ifneq ($(PORTNAME), aix) + +# Normal case +$(shlib): $(OBJS) + $(LINK.shared) $(LDFLAGS_SL) $(OBJS) $(SHLIB_LINK) -o $@ +ifdef shlib_major +# If we're using major and minor versions, then make a symlink to major-version-only. +ifneq ($(shlib), $(shlib_major)) + rm -f $(shlib_major) + $(LN_S) $(shlib) $(shlib_major) +endif +# Make sure we have a link to a name without any version numbers +ifneq ($(shlib), $(shlib_bare)) + rm -f $(shlib_bare) + $(LN_S) $(shlib) $(shlib_bare) +endif +endif # shlib_major + +# Where possible, restrict the symbols exported by the library to just the +# official list, so as to avoid unintentional ABI changes. On recent Darwin +# this also quiets multiply-defined-symbol warnings in programs that use +# libpgport along with libpq. 
+ifneq (,$(SHLIB_EXPORTS)) +ifdef BUILD.exports +$(shlib): $(SHLIB_EXPORTS:%.txt=%.list) + +$(SHLIB_EXPORTS:%.txt=%.list): %.list: %.txt + $(BUILD.exports) +endif +endif + +else # PORTNAME == aix + +# AIX case +$(shlib) $(stlib): $(OBJS) + $(LINK.static) $(stlib) $^ + $(RANLIB) $(stlib) + $(MKLDEXPORT) $(stlib) >$(exports_file) + $(COMPILER) $(LDFLAGS_NO_L) $(LDFLAGS_SL) -o $(shlib) $(stlib) -Wl,-bE:$(exports_file) $(SHLIB_LINK) + rm -f $(stlib) + $(AR) $(AROPT) $(stlib) $(shlib) + +endif # PORTNAME == aix + +else # PORTNAME == cygwin || PORTNAME == win32 + +# Cygwin or Win32 case + +# If SHLIB_EXPORTS is set, the rules below will build a .def file from +# that. Else we build a temporary one here. +ifeq (,$(SHLIB_EXPORTS)) +DLL_DEFFILE = lib$(NAME)dll.def +exports_file = $(DLL_DEFFILE) + +$(exports_file): $(OBJS) + $(DLLTOOL) --export-all $(DLLTOOL_DEFFLAGS) --output-def $@ $^ +else +DLL_DEFFILE = $(srcdir)/lib$(NAME)dll.def +endif + +$(shlib): $(OBJS) $(DLL_DEFFILE) + $(DLLWRAP) $(LDFLAGS_SL) -o $@ --dllname $(shlib) $(DLLWRAP_FLAGS) --def $(DLL_DEFFILE) $(OBJS) $(SHLIB_LINK) + +$(stlib): $(shlib) $(DLL_DEFFILE) + $(DLLTOOL) --dllname $(shlib) $(DLLTOOL_LIBFLAGS) --def $(DLL_DEFFILE) --output-lib $@ + +endif # PORTNAME == cygwin || PORTNAME == win32 + +endif # enable_shared + + +# We need several not-quite-identical variants of .DEF files to build +# DLLs for Windows. These are made from the single source file +# exports.txt. Since we can't assume that Windows boxes will have +# sed, the .DEF files are always built and included in distribution +# tarballs. 
+ +ifneq (,$(SHLIB_EXPORTS)) +distprep: $(srcdir)/lib$(NAME)dll.def $(srcdir)/lib$(NAME)ddll.def $(srcdir)/blib$(NAME)dll.def + +UC_NAME = $(shell echo $(NAME) | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ') + +$(srcdir)/lib$(NAME)dll.def: $(SHLIB_EXPORTS) + echo '; DEF file for MS VC++' >$@ + echo 'LIBRARY LIB$(UC_NAME)' >>$@ + echo 'EXPORTS' >>$@ + sed -e '/^#/d' -e 's/^\(.* \)\([0-9][0-9]*\)/ \1@ \2/' $< >>$@ + +$(srcdir)/lib$(NAME)ddll.def: $(SHLIB_EXPORTS) + echo '; DEF file for MS VC++' >$@ + echo 'LIBRARY LIB$(UC_NAME)D' >>$@ + echo 'EXPORTS' >>$@ + sed -e '/^#/d' -e 's/^\(.* \)\([0-9][0-9]*\)/ \1@ \2/' $< >>$@ + +$(srcdir)/blib$(NAME)dll.def: $(SHLIB_EXPORTS) + echo '; DEF file for Borland C++ Builder' >$@ + echo 'LIBRARY BLIB$(UC_NAME)' >>$@ + echo 'EXPORTS' >>$@ + sed -e '/^#/d' -e 's/^\(.* \)\([0-9][0-9]*\)/ _\1@ \2/' $< >>$@ + echo >>$@ + echo '; Aliases for MS compatible names' >> $@ + sed -e '/^#/d' -e 's/^\(.* \)\([0-9][0-9]*\)/ \1= _\1/' $< | sed 's/ *$$//' >>$@ +endif # SHLIB_EXPORTS + + +## +## INSTALL +## + +.PHONY: install-lib install-lib-static install-lib-shared installdirs-lib +install-lib: install-lib-shared +ifdef soname +install-lib: install-lib-static +endif + +install-lib-static: $(stlib) installdirs-lib + $(INSTALL_STLIB) $< '$(DESTDIR)$(libdir)/$(stlib)' +ifeq ($(PORTNAME), darwin) + cd '$(DESTDIR)$(libdir)' && \ + ranlib $(stlib) +endif + +ifeq ($(enable_shared), yes) +install-lib-shared: $(shlib) installdirs-lib +ifdef soname +# we don't install $(shlib) on AIX +# (see http://archives.postgresql.org/message-id/52EF20B2E3209443BC37736D00C3C1380A6E79FE@EXADV1.host.magwien.gv.at) +ifneq ($(PORTNAME), aix) + $(INSTALL_SHLIB) $< '$(DESTDIR)$(libdir)/$(shlib)' +ifneq ($(PORTNAME), cygwin) +ifneq ($(PORTNAME), win32) +ifneq ($(shlib), $(shlib_major)) + cd '$(DESTDIR)$(libdir)' && \ + rm -f $(shlib_major) && \ + $(LN_S) $(shlib) $(shlib_major) +endif +ifneq ($(shlib), $(shlib_bare)) + cd 
'$(DESTDIR)$(libdir)' && \ + rm -f $(shlib_bare) && \ + $(LN_S) $(shlib) $(shlib_bare) +endif +endif # not win32 +endif # not cygwin +endif # not aix +else # no soname + $(INSTALL_SHLIB) $< '$(DESTDIR)$(pkglibdir)/$(shlib)' +endif +else # not enable_shared +ifndef soname +install-lib-shared: + @echo "*****"; \ + echo "* Module $(NAME) was not installed due to lack of shared library support."; \ + echo "*****" +endif +endif # enable_shared + + +installdirs-lib: +ifdef soname + $(mkinstalldirs) '$(DESTDIR)$(libdir)' +else + $(mkinstalldirs) '$(DESTDIR)$(pkglibdir)' +endif + + +## +## UNINSTALL +## + +.PHONY: uninstall-lib +uninstall-lib: +ifdef soname + rm -f '$(DESTDIR)$(libdir)/$(stlib)' +ifeq ($(enable_shared), yes) + rm -f '$(DESTDIR)$(libdir)/$(shlib_bare)' \ + '$(DESTDIR)$(libdir)/$(shlib_major)' \ + '$(DESTDIR)$(libdir)/$(shlib)' +endif # enable_shared +else # no soname + rm -f '$(DESTDIR)$(pkglibdir)/$(shlib)' +endif # no soname + + +## +## CLEAN +## + +.PHONY: clean-lib +clean-lib: + rm -f $(shlib) $(shlib_bare) $(shlib_major) $(stlib) $(exports_file) + +ifneq (,$(SHLIB_EXPORTS)) +maintainer-clean-lib: + rm -f $(srcdir)/lib$(NAME)dll.def $(srcdir)/lib$(NAME)ddll.def $(srcdir)/blib$(NAME)dll.def +endif diff --git a/src/gtm/README b/src/gtm/README new file mode 100644 index 0000000000..77cff3695b --- /dev/null +++ b/src/gtm/README @@ -0,0 +1,61 @@ + +Global Transaction Manager (GTM) +-------------------------------- + +1. Source code layout: +---------------------- + +The server side code is located in the "include", "common" and +"main" directories. The "include" directory hosts all the header +files some of which are also shared by the client. + +The "common" directory contains the infrastructure pieces for the +server such as error reporting, memory management, locking etc. +Most of the server side logic including message processing, +transaction management, thread and connection management is hosted +in the "main" directory. 
+ +The client side code is put in the "client" directory including all +client side infrastructure and test programs. + + +2. Building GTM Server and Clients: +----------------------------------- + +Go to the top level directory (where this README is located) and run +the make command to build the sources. + +$ make + +This would build the GTM server in the "main" directory and client +libraries in the "client" directory. + +You may want to change the following two defines in main/main.c + +#define GTM_DEFAULT_HOSTNAME "localhost" +#define GTM_DEFAULT_PORT 6666 + + +3. Running the GTM Server: +--------------------------- + +You can run the GTM server by running the following command from the +top level directory. + +$ ./main/gtm + +The server will start listening on port 6666 for incoming connections. + + +4. Building test clients: +------------------------- + +Go to the "client/test" directory and run make to build the test clients. + +$ cd client/test +$ make + +This would build various test clients, statically linking to the libgtmclient.a +library in the client directory. You may need to change the connect string +appropriately to connect to the GTM server. + diff --git a/src/gtm/client/Makefile b/src/gtm/client/Makefile new file mode 100644 index 0000000000..216adf2207 --- /dev/null +++ b/src/gtm/client/Makefile @@ -0,0 +1,26 @@ +# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + +top_build_dir=../.. 
+include $(top_build_dir)/gtm/Makefile.global + + +NAME=gtmclient +SO_MAJOR_VERSION= 1 +SO_MINOR_VERSION= 0 + +OBJS=fe-misc.o fe-connect.o pqexpbuffer.o ip.o strlcpy.o gtm_client.o fe-protocol.o +LDFLAGS=-L$(top_build_dir)/common -L$(top_build_dir)/libpq + +LIBS=-lpthread + +all:all-lib + +include $(top_build_dir)/Makefile.shlib + +clean: + rm -f $(OBJS) + rm -f libgtmclient.a libgtmclient.so libgtmclient.so.1 libgtmclient.so.1.0 + +distclean: clean + +maintainer-clean: distclean diff --git a/src/gtm/client/fe-connect.c b/src/gtm/client/fe-connect.c new file mode 100644 index 0000000000..29d8fe4cc5 --- /dev/null +++ b/src/gtm/client/fe-connect.c @@ -0,0 +1,1287 @@ +/*------------------------------------------------------------------------- + * + * fe-connect.c + * functions related to setting up a connection to the backend + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/interfaces/libpq/fe-connect.c,v 1.371 2008/12/15 10:28:21 mha Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "gtm/gtm_c.h" + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <ctype.h> +#include <time.h> +#include <unistd.h> + +#include "gtm/libpq-fe.h" +#include "gtm/libpq-int.h" + +#include <sys/socket.h> +#include <netdb.h> +#include <netinet/in.h> +#include <netinet/tcp.h> +#include <arpa/inet.h> + +#include "gtm/gtm_ip.h" +#include "gtm/gtm_msg.h" + +/* fall back options if they are not specified by arguments or defined + by environment variables */ +#define DefaultHost "localhost" + +/* ---------- + * Definition of the conninfo parameters and their fallback resources. 
+ * + * GTMPQconninfoOptions[] is a constant static array that we use to initialize + * a dynamically allocated working copy. All the "val" fields in + * GTMPQconninfoOptions[] *must* be NULL. In a working copy, non-null "val" + * fields point to malloc'd strings that should be freed when the working + * array is freed (see GTMPQconninfoFree). + * ---------- + */ +static const GTMPQconninfoOption GTMPQconninfoOptions[] = { + {"connect_timeout", NULL}, + {"host", NULL}, + {"hostaddr", NULL}, + {"port", NULL}, + {"coordinator_id", NULL}, + {"proxy", NULL}, + /* Terminating entry --- MUST BE LAST */ + {NULL, NULL} +}; + +static bool connectOptions1(GTM_Conn *conn, const char *conninfo); +static int connectGTMStart(GTM_Conn *conn); +static int connectGTMComplete(GTM_Conn *conn); +static GTM_Conn *makeEmptyGTM_Conn(void); +static void freeGTM_Conn(GTM_Conn *conn); +static void closeGTM_Conn(GTM_Conn *conn); +static GTMPQconninfoOption *conninfo_parse(const char *conninfo, + PQExpBuffer errorMessage, bool use_defaults); +static char *conninfo_getval(GTMPQconninfoOption *connOptions, + const char *keyword); + +static int pqPacketSend(GTM_Conn *conn, char packet_type, + const void *buf, size_t buf_len); + +GTM_Conn * +PQconnectGTM(const char *conninfo) +{ + GTM_Conn *conn = PQconnectGTMStart(conninfo); + + if (conn && conn->status != CONNECTION_BAD) + (void) connectGTMComplete(conn); + + return conn; +} + +/* + * PQconnectGTMStart + * + * Returns a GTM_Conn*. If NULL is returned, a malloc error has occurred, and + * you should not attempt to proceed with this connection. If the status + * field of the connection returned is CONNECTION_BAD, an error has + * occurred. In this case you should call GTMPQfinish on the result, (perhaps + * inspecting the error message first). Other fields of the structure may not + * be valid if that occurs. 
If the status field is not CONNECTION_BAD, then + * this stage has succeeded - call GTMPQconnectPoll, using select(2) to see when + * this is necessary. + * + * See GTMPQconnectPoll for more info. + */ +GTM_Conn * +PQconnectGTMStart(const char *conninfo) +{ + GTM_Conn *conn; + + /* + * Allocate memory for the conn structure + */ + conn = makeEmptyGTM_Conn(); + if (conn == NULL) + return NULL; + + /* + * Parse the conninfo string + */ + if (!connectOptions1(conn, conninfo)) + return conn; + + /* + * Connect to the database + */ + if (!connectGTMStart(conn)) + { + /* Just in case we failed to set it in connectGTMStart */ + conn->status = CONNECTION_BAD; + } + + return conn; +} + +/* + * connectOptions1 + * + * Internal subroutine to set up connection parameters given an already- + * created GTM_Conn and a conninfo string. + * + * Returns true if OK, false if trouble (in which case errorMessage is set + * and so is conn->status). + */ +static bool +connectOptions1(GTM_Conn *conn, const char *conninfo) +{ + GTMPQconninfoOption *connOptions; + char *tmp; + + /* + * Parse the conninfo string + */ + connOptions = conninfo_parse(conninfo, &conn->errorMessage, true); + if (connOptions == NULL) + { + conn->status = CONNECTION_BAD; + /* errorMessage is already set */ + return false; + } + + /* + * Move option values into conn structure + * + * XXX: probably worth checking strdup() return value here... + */ + tmp = conninfo_getval(connOptions, "hostaddr"); + conn->pghostaddr = tmp ? strdup(tmp) : NULL; + tmp = conninfo_getval(connOptions, "host"); + conn->pghost = tmp ? strdup(tmp) : NULL; + tmp = conninfo_getval(connOptions, "port"); + conn->pgport = tmp ? strdup(tmp) : NULL; + tmp = conninfo_getval(connOptions, "connect_timeout"); + conn->connect_timeout = tmp ? strdup(tmp) : NULL; + tmp = conninfo_getval(connOptions, "coordinator_id"); + conn->coordinator_id = tmp ? strdup(tmp) : NULL; + tmp = conninfo_getval(connOptions, "proxy"); + conn->is_proxy = tmp ? 
atoi(tmp) : 0; + + /* + * Free the option info - all is in conn now + */ + GTMPQconninfoFree(connOptions); + + return true; +} + + +/* ---------- + * connectNoDelay - + * Sets the TCP_NODELAY socket option. + * Returns 1 if successful, 0 if not. + * ---------- + */ +static int +connectNoDelay(GTM_Conn *conn) +{ +#ifdef TCP_NODELAY + int on = 1; + + if (setsockopt(conn->sock, IPPROTO_TCP, TCP_NODELAY, + (char *) &on, + sizeof(on)) < 0) + { + appendGTMPQExpBuffer(&conn->errorMessage, + "could not set socket to TCP no delay mode: \n"); + return 0; + } +#endif + + return 1; +} + + +/* ---------- + * connectFailureMessage - + * create a friendly error message on connection failure. + * ---------- + */ +static void +connectFailureMessage(GTM_Conn *conn, int errorno) +{ + { + appendGTMPQExpBuffer(&conn->errorMessage, + "could not connect to server: \n" + "\tIs the server running on host \"%s\" and accepting\n" + "\tTCP/IP connections on port %s?\n", + conn->pghostaddr + ? conn->pghostaddr + : (conn->pghost + ? conn->pghost + : "???"), + conn->pgport); + } +} + + +/* ---------- + * connectGTMStart - + * Begin the process of making a connection to the backend. + * + * Returns 1 if successful, 0 if not. + * ---------- + */ +static int +connectGTMStart(GTM_Conn *conn) +{ + int portnum; + char portstr[128]; + struct addrinfo *addrs = NULL; + struct addrinfo hint; + const char *node; + int ret; + + if (!conn) + return 0; + + /* Ensure our buffers are empty */ + conn->inStart = conn->inCursor = conn->inEnd = 0; + conn->outCount = 0; + + /* + * Determine the parameters to pass to gtm_getaddrinfo_all. 
+ */ + + /* Initialize hint structure */ + MemSet(&hint, 0, sizeof(hint)); + hint.ai_socktype = SOCK_STREAM; + hint.ai_family = AF_UNSPEC; + + /* Set up port number as a string */ + if (conn->pgport != NULL && conn->pgport[0] != '\0') + portnum = atoi(conn->pgport); + else + { + /* + * This file provides a fallback host (DefaultHost) but no fallback + * port, so a missing port cannot be recovered from. Fail cleanly + * instead of formatting an uninitialized portnum. No socket has been + * opened by this function yet, so there is nothing to close. + */ + appendGTMPQExpBuffer(&conn->errorMessage, + "no port specified for the GTM connection\n"); + conn->status = CONNECTION_BAD; + return 0; + } + snprintf(portstr, sizeof(portstr), "%d", portnum); + + if (conn->pghostaddr != NULL && conn->pghostaddr[0] != '\0') + { + /* Using pghostaddr avoids a hostname lookup */ + node = conn->pghostaddr; + hint.ai_family = AF_UNSPEC; + hint.ai_flags = AI_NUMERICHOST; + } + else if (conn->pghost != NULL && conn->pghost[0] != '\0') + { + /* Using pghost, so we have to look-up the hostname */ + node = conn->pghost; + hint.ai_family = AF_UNSPEC; + } + else + { + /* Without Unix sockets, default to DefaultHost ("localhost") instead */ + node = DefaultHost; + hint.ai_family = AF_UNSPEC; + } + + /* Use gtm_getaddrinfo_all() to resolve the address */ + ret = gtm_getaddrinfo_all(node, portstr, &hint, &addrs); + if (ret || !addrs) + { + if (node) + appendGTMPQExpBuffer(&conn->errorMessage, + "could not translate host name \"%s\" to address: %s\n", + node, gai_strerror(ret)); + else + appendGTMPQExpBuffer(&conn->errorMessage, + "could not translate Unix-domain socket path \"%s\" to address: %s\n", + portstr, gai_strerror(ret)); + if (addrs) + gtm_freeaddrinfo_all(hint.ai_family, addrs); + goto connect_errReturn; + } + + /* + * Set up to try to connect, with protocol 3.0 as the first attempt. + */ + conn->addrlist = addrs; + conn->addr_cur = addrs; + conn->addrlist_family = hint.ai_family; + conn->status = CONNECTION_NEEDED; + + /* + * The code for processing CONNECTION_NEEDED state is in GTMPQconnectPoll(), + * so that it can easily be re-executed if needed again during the + * asynchronous startup process. However, we must run it once here, + * because callers expect a success return from this routine to mean that + * we are in PGRES_POLLING_WRITING connection state. 
+ */ + if (GTMPQconnectPoll(conn) == PGRES_POLLING_WRITING) + return 1; + +connect_errReturn: + if (conn->sock >= 0) + { + close(conn->sock); + conn->sock = -1; + } + conn->status = CONNECTION_BAD; + return 0; +} + + +/* + * connectGTMComplete + * + * Block and complete a connection. + * + * Returns 1 on success, 0 on failure. + */ +static int +connectGTMComplete(GTM_Conn *conn) +{ + GTMClientPollingStatusType flag = PGRES_POLLING_WRITING; + time_t finish_time = ((time_t) -1); + + if (conn == NULL || conn->status == CONNECTION_BAD) + return 0; + + /* + * Set up a time limit, if connect_timeout isn't zero. + */ + if (conn->connect_timeout != NULL) + { + int timeout = atoi(conn->connect_timeout); + + if (timeout > 0) + { + /* + * Rounding could cause connection to fail; need at least 2 secs + */ + if (timeout < 2) + timeout = 2; + /* calculate the finish time based on start + timeout */ + finish_time = time(NULL) + timeout; + } + } + + for (;;) + { + /* + * Wait, if necessary. Note that the initial state (just after + * PQconnectGTMStart) is to wait for the socket to select for writing. + */ + switch (flag) + { + case PGRES_POLLING_OK: + /* Reset stored error messages since we now have a working connection */ + resetGTMPQExpBuffer(&conn->errorMessage); + return 1; /* success! */ + + case PGRES_POLLING_READING: + if (gtmpqWaitTimed(1, 0, conn, finish_time)) + { + conn->status = CONNECTION_BAD; + return 0; + } + break; + + case PGRES_POLLING_WRITING: + if (gtmpqWaitTimed(0, 1, conn, finish_time)) + { + conn->status = CONNECTION_BAD; + return 0; + } + break; + + default: + /* Just in case we failed to set it in GTMPQconnectPoll */ + conn->status = CONNECTION_BAD; + return 0; + } + + /* + * Now try to advance the state machine. + */ + flag = GTMPQconnectPoll(conn); + } +} + +/* ---------------- + * GTMPQconnectPoll + * + * Poll an asynchronous connection. + * + * Returns a GTMClientPollingStatusType. 
+ * Before calling this function, use select(2) to determine when data + * has arrived.. + * + * You must call GTMPQfinish whether or not this fails. + */ +GTMClientPollingStatusType +GTMPQconnectPoll(GTM_Conn *conn) +{ + if (conn == NULL) + return PGRES_POLLING_FAILED; + + /* Get the new data */ + switch (conn->status) + { + /* + * We really shouldn't have been polled in these two cases, but we + * can handle it. + */ + case CONNECTION_BAD: + return PGRES_POLLING_FAILED; + case CONNECTION_OK: + return PGRES_POLLING_OK; + + /* These are reading states */ + case CONNECTION_AWAITING_RESPONSE: + case CONNECTION_AUTH_OK: + { + /* Load waiting data */ + int n = gtmpqReadData(conn); + + if (n < 0) + goto error_return; + if (n == 0) + return PGRES_POLLING_READING; + + break; + } + + /* These are writing states, so we just proceed. */ + case CONNECTION_STARTED: + case CONNECTION_MADE: + break; + + case CONNECTION_NEEDED: + break; + + default: + appendGTMPQExpBuffer(&conn->errorMessage, + "invalid connection state, " + "probably indicative of memory corruption\n" + ); + goto error_return; + } + + +keep_going: /* We will come back to here until there is + * nothing left to do. */ + switch (conn->status) + { + case CONNECTION_NEEDED: + { + /* + * Try to initiate a connection to one of the addresses + * returned by gtm_getaddrinfo_all(). conn->addr_cur is the + * next one to try. We fail when we run out of addresses + * (reporting the error returned for the *last* alternative, + * which may not be what users expect :-(). 
+ */ + while (conn->addr_cur != NULL) + { + struct addrinfo *addr_cur = conn->addr_cur; + + /* Remember current address for possible error msg */ + memcpy(&conn->raddr.addr, addr_cur->ai_addr, + addr_cur->ai_addrlen); + conn->raddr.salen = addr_cur->ai_addrlen; + + /* Open a socket */ + conn->sock = socket(addr_cur->ai_family, SOCK_STREAM, 0); + if (conn->sock < 0) + { + /* + * ignore socket() failure if we have more addresses + * to try + */ + if (addr_cur->ai_next != NULL) + { + conn->addr_cur = addr_cur->ai_next; + continue; + } + appendGTMPQExpBuffer(&conn->errorMessage, + "could not create socket: \n"); + break; + } + + /* + * Select socket options: no delay of outgoing data for + * TCP sockets, nonblock mode, close-on-exec. Fail if any + * of this fails. + */ + if (!IS_AF_UNIX(addr_cur->ai_family)) + { + if (!connectNoDelay(conn)) + { + close(conn->sock); + conn->sock = -1; + conn->addr_cur = addr_cur->ai_next; + continue; + } + } + + /* + * Start/make connection. This should not block, since we + * are in nonblock mode. If it does, well, too bad. + */ + if (connect(conn->sock, addr_cur->ai_addr, + addr_cur->ai_addrlen) < 0) + { + if (SOCK_ERRNO == EINPROGRESS || + SOCK_ERRNO == EWOULDBLOCK || + SOCK_ERRNO == EINTR || + SOCK_ERRNO == 0) + { + /* + * This is fine - we're in non-blocking mode, and + * the connection is in progress. Tell caller to + * wait for write-ready on socket. + */ + conn->status = CONNECTION_STARTED; + return PGRES_POLLING_WRITING; + } + /* otherwise, trouble */ + } + else + { + /* + * Hm, we're connected already --- seems the "nonblock + * connection" wasn't. Advance the state machine and + * go do the next stuff. + */ + conn->status = CONNECTION_STARTED; + goto keep_going; + } + + /* + * This connection failed --- set up error report, then + * close socket (do it this way in case close() affects + * the value of errno...). We will ignore the connect() + * failure and keep going if there are more addresses. 
+ */ + connectFailureMessage(conn, SOCK_ERRNO); + if (conn->sock >= 0) + { + close(conn->sock); + conn->sock = -1; + } + + /* + * Try the next address, if any. + */ + conn->addr_cur = addr_cur->ai_next; + } /* loop over addresses */ + + /* + * Ooops, no more addresses. An appropriate error message is + * already set up, so just set the right status. + */ + goto error_return; + } + + case CONNECTION_STARTED: + { + int optval; + size_t optlen = sizeof(optval); + + /* + * Write ready, since we've made it here, so the connection + * has been made ... or has failed. + */ + + /* + * Now check (using getsockopt) that there is not an error + * state waiting for us on the socket. + */ + + if (getsockopt(conn->sock, SOL_SOCKET, SO_ERROR, + (char *) &optval, &optlen) == -1) + { + appendGTMPQExpBuffer(&conn->errorMessage, + libpq_gettext("could not get socket error status: \n")); + goto error_return; + } + else if (optval != 0) + { + /* + * When using a nonblocking connect, we will typically see + * connect failures at this point, so provide a friendly + * error message. + */ + connectFailureMessage(conn, optval); + + /* + * If more addresses remain, keep trying, just as in the + * case where connect() returned failure immediately. + */ + if (conn->addr_cur->ai_next != NULL) + { + if (conn->sock >= 0) + { + close(conn->sock); + conn->sock = -1; + } + conn->addr_cur = conn->addr_cur->ai_next; + conn->status = CONNECTION_NEEDED; + goto keep_going; + } + goto error_return; + } + + /* Fill in the client address */ + conn->laddr.salen = sizeof(conn->laddr.addr); + if (getsockname(conn->sock, + (struct sockaddr *) & conn->laddr.addr, + &conn->laddr.salen) < 0) + { + appendGTMPQExpBuffer(&conn->errorMessage, + "could not get client address from socket:\n"); + goto error_return; + } + + /* + * Make sure we can write before advancing to next step. 
+ */ + conn->status = CONNECTION_MADE; + return PGRES_POLLING_WRITING; + } + + case CONNECTION_MADE: + { + GTM_StartupPacket sp; + + /* + * Build a startup packet. We tell the GTM server/proxy our + * coordinator ID and whether we are a proxy or not. + * + * When the connection is made from the proxy, we let the GTM + * server know about it so that some special headers are + * handled correctly by the server. + */ + sp.sp_cid = atoi(conn->coordinator_id); + sp.sp_isproxy = conn->is_proxy; + + /* + * Send the startup packet. + * + * Theoretically, this could block, but it really shouldn't + * since we only got here if the socket is write-ready. + */ + if (pqPacketSend(conn, 'A', &sp, + sizeof (GTM_StartupPacket)) != STATUS_OK) + { + appendGTMPQExpBuffer(&conn->errorMessage, + "could not send startup packet: \n"); + goto error_return; + } + + conn->status = CONNECTION_AWAITING_RESPONSE; + return PGRES_POLLING_READING; + } + + /* + * Handle authentication exchange: wait for postmaster messages + * and respond as necessary. + */ + case CONNECTION_AWAITING_RESPONSE: + { + char beresp; + + /* + * Scan the message from current point (note that if we find + * the message is incomplete, we will return without advancing + * inStart, and resume here next time). + */ + conn->inCursor = conn->inStart; + + /* Read type byte */ + if (gtmpqGetc(&beresp, conn)) + { + /* We'll come back when there is more data */ + return PGRES_POLLING_READING; + } + + /* + * Validate message type: we expect only an authentication + * request or an error here. Anything else probably means + * it's not GTM on the other end at all. + */ + if (!(beresp == 'R' || beresp == 'E')) + { + appendGTMPQExpBuffer(&conn->errorMessage, + "expected authentication request from " + "server, but received %c\n", + beresp); + goto error_return; + } + + + /* Handle errors. 
*/ + if (beresp == 'E') + { + if (gtmpqGets_append(&conn->errorMessage, conn)) + { + /* We'll come back when there is more data */ + return PGRES_POLLING_READING; + } + /* OK, we read the message; mark data consumed */ + conn->inStart = conn->inCursor; + goto error_return; + } + + { + /* + * Server sends a dummy message body of size 4 bytes + */ + int tmp_int; + gtmpqGetInt(&tmp_int, 4, conn); + } + + /* + * OK, we successfully read the message; mark data consumed + */ + conn->inStart = conn->inCursor; + + /* We are done with authentication exchange */ + conn->status = CONNECTION_AUTH_OK; + + /* Look to see if we have more data yet. */ + goto keep_going; + } + + case CONNECTION_AUTH_OK: + { + /* We can release the address list now. */ + gtm_freeaddrinfo_all(conn->addrlist_family, conn->addrlist); + conn->addrlist = NULL; + conn->addr_cur = NULL; + + /* Otherwise, we are open for business! */ + conn->status = CONNECTION_OK; + return PGRES_POLLING_OK; + } + + + default: + appendGTMPQExpBuffer(&conn->errorMessage, + "invalid connection state %c, " + "probably indicative of memory corruption\n" + , + conn->status); + goto error_return; + } + + /* Unreachable */ + +error_return: + + /* + * We used to close the socket at this point, but that makes it awkward + * for those above us if they wish to remove this socket from their own + * records (an fd_set for example). We'll just have this socket closed + * when GTMPQfinish is called (which is compulsory even after an error, since + * the connection structure must be freed). 
+ */ + conn->status = CONNECTION_BAD; + return PGRES_POLLING_FAILED; +} + + +/* + * makeEmptyGTM_Conn + * - create a GTM_Conn data structure with (as yet) no interesting data + * + * Returns a malloc'd, zero-initialized GTM_Conn with its I/O buffers and + * message buffers set up, status CONNECTION_BAD and no open socket, or NULL + * if any allocation fails. + */ +static GTM_Conn * +makeEmptyGTM_Conn(void) +{ + GTM_Conn *conn; + + conn = (GTM_Conn *) malloc(sizeof(GTM_Conn)); + if (conn == NULL) + return conn; + + /* Zero all pointers and booleans */ + MemSet(conn, 0, sizeof(GTM_Conn)); + + conn->status = CONNECTION_BAD; + + /* + * MemSet leaves sock at 0, which is a valid descriptor (normally stdin). + * Mark the socket as not open so that the "sock >= 0" checks in + * closeGTM_Conn() and connectGTMStart() do not close descriptor 0 when + * the connection fails before socket() was ever called. + */ + conn->sock = -1; + + /* + * We try to send at least 8K at a time, which is the usual size of pipe + * buffers on Unix systems. That way, when we are sending a large amount + * of data, we avoid incurring extra kernel context swaps for partial + * bufferloads. The output buffer is initially made 16K in size, and we + * try to dump it after accumulating 8K. + * + * With the same goal of minimizing context swaps, the input buffer will + * be enlarged anytime it has less than 8K free, so we initially allocate + * twice that. + */ + conn->inBufSize = 16 * 1024; + conn->inBuffer = (char *) malloc(conn->inBufSize); + conn->outBufSize = 16 * 1024; + conn->outBuffer = (char *) malloc(conn->outBufSize); + initGTMPQExpBuffer(&conn->errorMessage); + initGTMPQExpBuffer(&conn->workBuffer); + + if (conn->inBuffer == NULL || + conn->outBuffer == NULL || + PQExpBufferBroken(&conn->errorMessage) || + PQExpBufferBroken(&conn->workBuffer)) + { + /* out of memory already :-( */ + freeGTM_Conn(conn); + conn = NULL; + } + + return conn; +} + +/* + * freeGTM_Conn + * - free an idle (closed) GTM_Conn data structure + * + * NOTE: this should not overlap any functionality with closeGTM_Conn(). + * Clearing/resetting of transient state belongs there; what we do here is + * release data that is to be held for the life of the GTM_Conn structure. + * If a value ought to be cleared/freed during PQreset(), do it there not here. 
+ */ +static void +freeGTM_Conn(GTM_Conn *conn) +{ + if (conn->pghost) + free(conn->pghost); + if (conn->pghostaddr) + free(conn->pghostaddr); + if (conn->pgport) + free(conn->pgport); + if (conn->connect_timeout) + free(conn->connect_timeout); + if (conn->inBuffer) + free(conn->inBuffer); + if (conn->outBuffer) + free(conn->outBuffer); + termGTMPQExpBuffer(&conn->errorMessage); + termGTMPQExpBuffer(&conn->workBuffer); + + free(conn); +} + +/* + * closeGTM_Conn + * - properly close a connection to the backend + * + * This should reset or release all transient state, but NOT the connection + * parameters. On exit, the GTM_Conn should be in condition to start a fresh + * connection with the same parameters (see PQreset()). + */ +static void +closeGTM_Conn(GTM_Conn *conn) +{ + /* + * Note that the protocol doesn't allow us to send Terminate messages + * during the startup phase. + */ + if (conn->sock >= 0 && conn->status == CONNECTION_OK) + { + /* + * Try to send "close connection" message to backend. Ignore any + * error. + * + * Force length word for backends may try to read that in a generic + * code + */ + gtmpqPutMsgStart('X', true, conn); + gtmpqPutMsgEnd(conn); + gtmpqFlush(conn); + } + + /* + * Close the connection, reset all transient state, flush I/O buffers. + */ + if (conn->sock >= 0) + close(conn->sock); + conn->sock = -1; + conn->status = CONNECTION_BAD; /* Well, not really _bad_ - just + * absent */ + gtm_freeaddrinfo_all(conn->addrlist_family, conn->addrlist); + conn->addrlist = NULL; + conn->addr_cur = NULL; + conn->inStart = conn->inCursor = conn->inEnd = 0; + conn->outCount = 0; +} + +/* + * GTMPQfinish: properly close a connection to the backend. Also frees + * the GTM_Conn data structure so it shouldn't be re-used after this. + */ +void +GTMPQfinish(GTM_Conn *conn) +{ + if (conn) + { + closeGTM_Conn(conn); + freeGTM_Conn(conn); + } +} + +/* + * pqPacketSend() -- convenience routine to send a message to server. 
+ * + * pack_type: the single-byte message type code. (Pass zero for startup + * packets, which have no message type code.) + * + * buf, buf_len: contents of message. The given length includes only what + * is in buf; the message type and message length fields are added here. + * + * RETURNS: STATUS_ERROR if the write fails, STATUS_OK otherwise. + * SIDE_EFFECTS: may block. + * + * Note: all messages sent with this routine have a length word, whether + * it's protocol 2.0 or 3.0. + */ +static int +pqPacketSend(GTM_Conn *conn, char pack_type, + const void *buf, size_t buf_len) +{ + /* Start the message. */ + if (gtmpqPutMsgStart(pack_type, true, conn)) + return STATUS_ERROR; + + /* Send the message body. */ + if (gtmpqPutnchar(buf, buf_len, conn)) + return STATUS_ERROR; + + /* Finish the message. */ + if (gtmpqPutMsgEnd(conn)) + return STATUS_ERROR; + + /* Flush to ensure backend gets it. */ + if (gtmpqFlush(conn)) + return STATUS_ERROR; + + return STATUS_OK; +} + + +/* + * GTMPQconninfoParse + * + * Parse a string like PQconnectGTM() would do and return the + * resulting connection options array. NULL is returned on failure. + * The result contains only options specified directly in the string, + * not any possible default values. + * + * If errmsg isn't NULL, *errmsg is set to NULL on success, or a malloc'd + * string on failure (use PQfreemem to free it). In out-of-memory conditions + * both *errmsg and the result could be NULL. + * + * NOTE: the returned array is dynamically allocated and should + * be freed when no longer needed via GTMPQconninfoFree(). 
+ */ +GTMPQconninfoOption * +GTMPQconninfoParse(const char *conninfo, char **errmsg) +{ + PQExpBufferData errorBuf; + GTMPQconninfoOption *connOptions; + + if (errmsg) + *errmsg = NULL; /* default */ + initGTMPQExpBuffer(&errorBuf); + if (PQExpBufferBroken(&errorBuf)) + return NULL; /* out of memory already :-( */ + connOptions = conninfo_parse(conninfo, &errorBuf, false); + if (connOptions == NULL && errmsg) + *errmsg = errorBuf.data; + else + termGTMPQExpBuffer(&errorBuf); + return connOptions; +} + +/* + * Conninfo parser routine + * + * If successful, a malloc'd GTMPQconninfoOption array is returned. + * If not successful, NULL is returned and an error message is + * left in errorMessage. + * Defaults are supplied (from a service file, environment variables, etc) + * for unspecified options, but only if use_defaults is TRUE. + */ +static GTMPQconninfoOption * +conninfo_parse(const char *conninfo, PQExpBuffer errorMessage, + bool use_defaults) +{ + char *pname; + char *pval; + char *buf; + char *cp; + char *cp2; + GTMPQconninfoOption *options; + GTMPQconninfoOption *option; + + /* Make a working copy of GTMPQconninfoOptions */ + options = malloc(sizeof(GTMPQconninfoOptions)); + if (options == NULL) + { + printfGTMPQExpBuffer(errorMessage, + libpq_gettext("out of memory\n")); + return NULL; + } + memcpy(options, GTMPQconninfoOptions, sizeof(GTMPQconninfoOptions)); + + /* Need a modifiable copy of the input string */ + if ((buf = strdup(conninfo)) == NULL) + { + printfGTMPQExpBuffer(errorMessage, + libpq_gettext("out of memory\n")); + GTMPQconninfoFree(options); + return NULL; + } + cp = buf; + + while (*cp) + { + /* Skip blanks before the parameter name */ + if (isspace((unsigned char) *cp)) + { + cp++; + continue; + } + + /* Get the parameter name */ + pname = cp; + while (*cp) + { + if (*cp == '=') + break; + if (isspace((unsigned char) *cp)) + { + *cp++ = '\0'; + while (*cp) + { + if (!isspace((unsigned char) *cp)) + break; + cp++; + } + break; + } + cp++; + } 
+ + /* Check that there is a following '=' */ + if (*cp != '=') + { + printfGTMPQExpBuffer(errorMessage, + libpq_gettext("missing \"=\" after \"%s\" in connection info string\n"), + pname); + GTMPQconninfoFree(options); + free(buf); + return NULL; + } + *cp++ = '\0'; + + /* Skip blanks after the '=' */ + while (*cp) + { + if (!isspace((unsigned char) *cp)) + break; + cp++; + } + + /* Get the parameter value */ + pval = cp; + + if (*cp != '\'') + { + cp2 = pval; + while (*cp) + { + if (isspace((unsigned char) *cp)) + { + *cp++ = '\0'; + break; + } + if (*cp == '\\') + { + cp++; + if (*cp != '\0') + *cp2++ = *cp++; + } + else + *cp2++ = *cp++; + } + *cp2 = '\0'; + } + else + { + cp2 = pval; + cp++; + for (;;) + { + if (*cp == '\0') + { + printfGTMPQExpBuffer(errorMessage, + libpq_gettext("unterminated quoted string in connection info string\n")); + GTMPQconninfoFree(options); + free(buf); + return NULL; + } + if (*cp == '\\') + { + cp++; + if (*cp != '\0') + *cp2++ = *cp++; + continue; + } + if (*cp == '\'') + { + *cp2 = '\0'; + cp++; + break; + } + *cp2++ = *cp++; + } + } + + /* + * Now we have the name and the value. Search for the param record. 
+ */ + for (option = options; option->keyword != NULL; option++) + { + if (strcmp(option->keyword, pname) == 0) + break; + } + if (option->keyword == NULL) + { + printfGTMPQExpBuffer(errorMessage, + libpq_gettext("invalid connection option \"%s\"\n"), + pname); + GTMPQconninfoFree(options); + free(buf); + return NULL; + } + + /* + * Store the value + */ + if (option->val) + free(option->val); + option->val = strdup(pval); + if (!option->val) + { + printfGTMPQExpBuffer(errorMessage, + libpq_gettext("out of memory\n")); + GTMPQconninfoFree(options); + free(buf); + return NULL; + } + } + + /* Done with the modifiable input string */ + free(buf); + + return options; +} + + +static char * +conninfo_getval(GTMPQconninfoOption *connOptions, + const char *keyword) +{ + GTMPQconninfoOption *option; + + for (option = connOptions; option->keyword != NULL; option++) + { + if (strcmp(option->keyword, keyword) == 0) + return option->val; + } + + return NULL; +} + + +void +GTMPQconninfoFree(GTMPQconninfoOption *connOptions) +{ + GTMPQconninfoOption *option; + + if (connOptions == NULL) + return; + + for (option = connOptions; option->keyword != NULL; option++) + { + if (option->val != NULL) + free(option->val); + } + free(connOptions); +} + +char * +GTMPQhost(const GTM_Conn *conn) +{ + if (!conn) + return NULL; + return conn->pghost; +} + +char * +GTMPQport(const GTM_Conn *conn) +{ + if (!conn) + return NULL; + return conn->pgport; +} + +ConnStatusType +GTMPQstatus(const GTM_Conn *conn) +{ + if (!conn) + return CONNECTION_BAD; + return conn->status; +} + +char * +GTMPQerrorMessage(const GTM_Conn *conn) +{ + if (!conn) + return libpq_gettext("connection pointer is NULL\n"); + + return conn->errorMessage.data; +} + +int +GTMPQsocket(const GTM_Conn *conn) +{ + if (!conn) + return -1; + return conn->sock; +} + +void +GTMPQtrace(GTM_Conn *conn, FILE *debug_port) +{ + if (conn == NULL) + return; + GTMPQuntrace(conn); + conn->Pfdebug = debug_port; +} + +void +GTMPQuntrace(GTM_Conn 
*conn) +{ + if (conn == NULL) + return; + if (conn->Pfdebug) + { + fflush(conn->Pfdebug); + conn->Pfdebug = NULL; + } +} diff --git a/src/gtm/client/fe-misc.c b/src/gtm/client/fe-misc.c new file mode 100644 index 0000000000..66172400a5 --- /dev/null +++ b/src/gtm/client/fe-misc.c @@ -0,0 +1,1035 @@ +/*------------------------------------------------------------------------- + * + * FILE + * fe-misc.c + * + * DESCRIPTION + * miscellaneous useful functions + * + * The communication routines here are analogous to the ones in + * backend/libpq/pqcomm.c and backend/libpq/pqcomprim.c, but operate + * in the considerably different environment of the frontend libpq. + * In particular, we work with a bare nonblock-mode socket, rather than + * a stdio stream, so that we can avoid unwanted blocking of the application. + * + * XXX: MOVE DEBUG PRINTOUT TO HIGHER LEVEL. As is, block and restart + * will cause repeat printouts. + * + * We must speak the same transmitted data representations as the backend + * routines. 
+ * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/interfaces/libpq/fe-misc.c,v 1.137 2008/12/11 07:34:09 petere Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "gtm/gtm_c.h" + +#include <signal.h> +#include <time.h> + +#include <netinet/in.h> +#include <arpa/inet.h> + +#include <unistd.h> +#include <sys/time.h> + +#include <poll.h> +#include <sys/poll.h> +#include <sys/select.h> + +#include "gtm/libpq-fe.h" +#include "gtm/libpq-int.h" + + +static int gtmpqPutMsgBytes(const void *buf, size_t len, GTM_Conn *conn); +static int gtmpqSendSome(GTM_Conn *conn, int len); +static int gtmpqSocketCheck(GTM_Conn *conn, int forRead, int forWrite, + time_t end_time); +static int gtmpqSocketPoll(int sock, int forRead, int forWrite, time_t end_time); + + +/* + * gtmpqGetc: get 1 character from the connection + * + * All these routines return 0 on success, EOF on error. + * Note that for the Get routines, EOF only means there is not enough + * data in the buffer, not that there is necessarily a hard error. + */ +int +gtmpqGetc(char *result, GTM_Conn *conn) +{ + if (conn->inCursor >= conn->inEnd) + return EOF; + + *result = conn->inBuffer[conn->inCursor++]; + + if (conn->Pfdebug) + fprintf(conn->Pfdebug, "From backend> %c\n", *result); + + return 0; +} + + +/* + * gtmpqPutc: write 1 char to the current message + */ +int +gtmpqPutc(char c, GTM_Conn *conn) +{ + if (gtmpqPutMsgBytes(&c, 1, conn)) + return EOF; + + if (conn->Pfdebug) + fprintf(conn->Pfdebug, "To backend> %c\n", c); + + return 0; +} + + +/* + * gtmpqGets[_append]: + * get a null-terminated string from the connection, + * and store it in an expansible PQExpBuffer. 
+ * If we run out of memory, all of the string is still read, + * but the excess characters are silently discarded. + */ +static int +gtmpqGets_internal(PQExpBuffer buf, GTM_Conn *conn, bool resetbuffer) +{ + /* Copy conn data to locals for faster search loop */ + char *inBuffer = conn->inBuffer; + int inCursor = conn->inCursor; + int inEnd = conn->inEnd; + int slen; + + while (inCursor < inEnd && inBuffer[inCursor]) + inCursor++; + + if (inCursor >= inEnd) + return EOF; + + slen = inCursor - conn->inCursor; + + if (resetbuffer) + resetGTMPQExpBuffer(buf); + + appendBinaryGTMPQExpBuffer(buf, inBuffer + conn->inCursor, slen); + + conn->inCursor = ++inCursor; + + if (conn->Pfdebug) + fprintf(conn->Pfdebug, "From backend> \"%s\"\n", + buf->data); + + return 0; +} + +int +gtmpqGets(PQExpBuffer buf, GTM_Conn *conn) +{ + return gtmpqGets_internal(buf, conn, true); +} + +int +gtmpqGets_append(PQExpBuffer buf, GTM_Conn *conn) +{ + return gtmpqGets_internal(buf, conn, false); +} + + +/* + * gtmpqPuts: write a null-terminated string to the current message + */ +int +gtmpqPuts(const char *s, GTM_Conn *conn) +{ + if (gtmpqPutMsgBytes(s, strlen(s) + 1, conn)) + return EOF; + + if (conn->Pfdebug) + fprintf(conn->Pfdebug, "To backend> \"%s\"\n", s); + + return 0; +} + +/* + * gtmpqGetnchar: + * get a string of exactly len bytes in buffer s, no null termination + * + * Returns 0 on success, EOF if fewer than len bytes are buffered. + */ +int +gtmpqGetnchar(char *s, size_t len, GTM_Conn *conn) +{ + /* len is unsigned, so only the upper bound needs checking */ + if (len > (size_t) (conn->inEnd - conn->inCursor)) + return EOF; + + memcpy(s, conn->inBuffer + conn->inCursor, len); + /* no terminating null */ + + conn->inCursor += len; + + if (conn->Pfdebug) + fprintf(conn->Pfdebug, "From backend (%lu)> %.*s\n", + (unsigned long) len, (int) len, s); + + return 0; +} + +/* + * gtmpqPutnchar: + * write exactly len bytes to the current message + */ +int +gtmpqPutnchar(const char *s, size_t len, GTM_Conn *conn) +{ + if (gtmpqPutMsgBytes(s, len, conn)) + return EOF; + + if (conn->Pfdebug) + fprintf(conn->Pfdebug, 
"To backend> %.*s\n", (int) len, s); + + return 0; +} + +/* + * gtmpqGetInt + * read a 2 or 4 byte integer and convert from network byte order + * to local byte order + * + * Returns 0 on success. Returns EOF, without advancing inCursor, if the + * buffer does not yet hold enough bytes or if the size is unsupported. + */ +int +gtmpqGetInt(int *result, size_t bytes, GTM_Conn *conn) +{ + uint16 tmp2; + uint32 tmp4; + + switch (bytes) + { + case 2: + if (conn->inCursor + 2 > conn->inEnd) + return EOF; + memcpy(&tmp2, conn->inBuffer + conn->inCursor, 2); + conn->inCursor += 2; + *result = (int) ntohs(tmp2); + break; + case 4: + if (conn->inCursor + 4 > conn->inEnd) + return EOF; + memcpy(&tmp4, conn->inBuffer + conn->inCursor, 4); + conn->inCursor += 4; + *result = (int) ntohl(tmp4); + break; + default: + /* Pfdebug is NULL unless tracing is on; bytes is size_t, not int */ + if (conn->Pfdebug) + fprintf(conn->Pfdebug, "Integer size of (%lu) bytes not supported", (unsigned long) bytes); + return EOF; + } + + if (conn->Pfdebug) + fprintf(conn->Pfdebug, "From backend (#%lu)> %d\n", (unsigned long) bytes, *result); + + return 0; +} + +/* + * gtmpqPutInt + * write an integer of 2 or 4 bytes, converting from host byte order + * to network byte order. + * + * Returns 0 on success, EOF if the bytes could not be queued or if the + * size is unsupported. + */ +int +gtmpqPutInt(int value, size_t bytes, GTM_Conn *conn) +{ + uint16 tmp2; + uint32 tmp4; + + switch (bytes) + { + case 2: + tmp2 = htons((uint16) value); + if (gtmpqPutMsgBytes((const char *) &tmp2, 2, conn)) + return EOF; + break; + case 4: + tmp4 = htonl((uint32) value); + if (gtmpqPutMsgBytes((const char *) &tmp4, 4, conn)) + return EOF; + break; + default: + /* Pfdebug is NULL unless tracing is on; bytes is size_t, not int */ + if (conn->Pfdebug) + fprintf(conn->Pfdebug, "Integer size of (%lu) bytes not supported", (unsigned long) bytes); + return EOF; + } + + if (conn->Pfdebug) + fprintf(conn->Pfdebug, "To backend (%lu#)> %d\n", (unsigned long) bytes, value); + + return 0; +} + +/* + * Make sure conn's output buffer can hold bytes_needed bytes (caller must + * include already-stored data into the value!) 
+ * + * Returns 0 on success, EOF if failed to enlarge buffer + */ +int +gtmpqCheckOutBufferSpace(size_t bytes_needed, GTM_Conn *conn) +{ + int newsize = conn->outBufSize; + char *newbuf; + + if (bytes_needed <= (size_t) newsize) + return 0; + + /* + * If we need to enlarge the buffer, we first try to double it in size; if + * that doesn't work, enlarge in multiples of 8K. This avoids thrashing + * the malloc pool by repeated small enlargements. + * + * Note: tests for newsize > 0 are to catch integer overflow. + */ + do + { + newsize *= 2; + } while (newsize > 0 && bytes_needed > (size_t) newsize); + + if (newsize > 0 && bytes_needed <= (size_t) newsize) + { + newbuf = realloc(conn->outBuffer, newsize); + if (newbuf) + { + /* realloc succeeded */ + conn->outBuffer = newbuf; + conn->outBufSize = newsize; + return 0; + } + } + + newsize = conn->outBufSize; + do + { + newsize += 8192; + } while (newsize > 0 && bytes_needed > (size_t) newsize); + + if (newsize > 0 && bytes_needed <= (size_t) newsize) + { + newbuf = realloc(conn->outBuffer, newsize); + if (newbuf) + { + /* realloc succeeded */ + conn->outBuffer = newbuf; + conn->outBufSize = newsize; + return 0; + } + } + + /* realloc failed. Probably out of memory */ + printfGTMPQExpBuffer(&conn->errorMessage, + "cannot allocate memory for output buffer\n"); + return EOF; +} + +/* + * Make sure conn's input buffer can hold bytes_needed bytes (caller must + * include already-stored data into the value!) + * + * Returns 0 on success, EOF if failed to enlarge buffer + */ +int +gtmpqCheckInBufferSpace(size_t bytes_needed, GTM_Conn *conn) +{ + int newsize = conn->inBufSize; + char *newbuf; + + if (bytes_needed <= (size_t) newsize) + return 0; + + /* + * If we need to enlarge the buffer, we first try to double it in size; if + * that doesn't work, enlarge in multiples of 8K. This avoids thrashing + * the malloc pool by repeated small enlargements. + * + * Note: tests for newsize > 0 are to catch integer overflow. 
+ */ + do + { + newsize *= 2; + } while (newsize > 0 && bytes_needed > (size_t) newsize); + + if (newsize > 0 && bytes_needed <= (size_t) newsize) + { + newbuf = realloc(conn->inBuffer, newsize); + if (newbuf) + { + /* realloc succeeded */ + conn->inBuffer = newbuf; + conn->inBufSize = newsize; + return 0; + } + } + + newsize = conn->inBufSize; + do + { + newsize += 8192; + } while (newsize > 0 && bytes_needed > (size_t) newsize); + + if (newsize > 0 && bytes_needed <= (size_t) newsize) + { + newbuf = realloc(conn->inBuffer, newsize); + if (newbuf) + { + /* realloc succeeded */ + conn->inBuffer = newbuf; + conn->inBufSize = newsize; + return 0; + } + } + + /* realloc failed. Probably out of memory */ + printfGTMPQExpBuffer(&conn->errorMessage, + "cannot allocate memory for input buffer\n"); + return EOF; +} + +/* + * gtmpqPutMsgStart: begin construction of a message to the server + * + * msg_type is the message type byte, or 0 for a message without type byte + * (only startup messages have no type byte) + * + * force_len forces the message to have a length word; otherwise, we add + * a length word if protocol 3. + * + * Returns 0 on success, EOF on error + * + * The idea here is that we construct the message in conn->outBuffer, + * beginning just past any data already in outBuffer (ie, at + * outBuffer+outCount). We enlarge the buffer as needed to hold the message. + * When the message is complete, we fill in the length word (if needed) and + * then advance outCount past the message, making it eligible to send. + * + * The state variable conn->outMsgStart points to the incomplete message's + * length word: it is either outCount or outCount+1 depending on whether + * there is a type byte. If we are sending a message without length word + * (pre protocol 3.0 only), then outMsgStart is -1. The state variable + * conn->outMsgEnd is the end of the data collected so far. 
+ */ +int +gtmpqPutMsgStart(char msg_type, bool force_len, GTM_Conn *conn) +{ + int lenPos; + int endPos; + + /* allow room for message type byte */ + if (msg_type) + endPos = conn->outCount + 1; + else + endPos = conn->outCount; + + /* do we want a length word? */ + if (force_len) + { + lenPos = endPos; + /* allow room for message length */ + endPos += 4; + } + else + lenPos = -1; + + /* make sure there is room for message header */ + if (gtmpqCheckOutBufferSpace(endPos, conn)) + return EOF; + /* okay, save the message type byte if any */ + if (msg_type) + conn->outBuffer[conn->outCount] = msg_type; + /* set up the message pointers */ + conn->outMsgStart = lenPos; + conn->outMsgEnd = endPos; + /* length word, if needed, will be filled in by gtmpqPutMsgEnd */ + + if (conn->Pfdebug) + fprintf(conn->Pfdebug, "To backend> Msg %c\n", + msg_type ? msg_type : ' '); + + return 0; +} + +/* + * gtmpqPutMsgBytes: add bytes to a partially-constructed message + * + * Returns 0 on success, EOF on error + */ +static int +gtmpqPutMsgBytes(const void *buf, size_t len, GTM_Conn *conn) +{ + /* make sure there is room for it */ + if (gtmpqCheckOutBufferSpace(conn->outMsgEnd + len, conn)) + return EOF; + /* okay, save the data */ + memcpy(conn->outBuffer + conn->outMsgEnd, buf, len); + conn->outMsgEnd += len; + /* no Pfdebug call here, caller should do it */ + return 0; +} + +/* + * gtmpqPutMsgEnd: finish constructing a message and possibly send it + * + * Returns 0 on success, EOF on error + * + * We don't actually send anything here unless we've accumulated at least + * 8K worth of data (the typical size of a pipe buffer on Unix systems). + * This avoids sending small partial packets. The caller must use gtmpqFlush + * when it's important to flush all the data out to the server. 
+ */ +int +gtmpqPutMsgEnd(GTM_Conn *conn) +{ + if (conn->Pfdebug) + fprintf(conn->Pfdebug, "To backend> Msg complete, length %u\n", + conn->outMsgEnd - conn->outCount); + + /* Fill in length word if needed */ + if (conn->outMsgStart >= 0) + { + uint32 msgLen = conn->outMsgEnd - conn->outMsgStart; + + msgLen = htonl(msgLen); + memcpy(conn->outBuffer + conn->outMsgStart, &msgLen, 4); + } + + /* Make message eligible to send */ + conn->outCount = conn->outMsgEnd; + + if (conn->outCount >= 8192) + { + int toSend = conn->outCount - (conn->outCount % 8192); + + if (gtmpqSendSome(conn, toSend) < 0) + return EOF; + /* in nonblock mode, don't complain if unable to send it all */ + } + + return 0; +} + +/* ---------- + * gtmpqReadData: read more data, if any is available + * Possible return values: + * 1: successfully loaded at least one more byte + * 0: no data is presently available, but no error detected + * -1: error detected (including EOF = connection closure); + * conn->errorMessage set + * NOTE: callers must not assume that pointers or indexes into conn->inBuffer + * remain valid across this call! + * ---------- + */ +int +gtmpqReadData(GTM_Conn *conn) +{ + int someread = 0; + int nread; + + if (conn->sock < 0) + { + printfGTMPQExpBuffer(&conn->errorMessage, + "connection not open\n"); + return -1; + } + + /* Left-justify any data in the buffer to make room */ + if (conn->inStart < conn->inEnd) + { + if (conn->inStart > 0) + { + memmove(conn->inBuffer, conn->inBuffer + conn->inStart, + conn->inEnd - conn->inStart); + conn->inEnd -= conn->inStart; + conn->inCursor -= conn->inStart; + conn->inStart = 0; + } + } + else + { + /* buffer is logically empty, reset it */ + conn->inStart = conn->inCursor = conn->inEnd = 0; + } + + /* + * If the buffer is fairly full, enlarge it. We need to be able to enlarge + * the buffer in case a single message exceeds the initial buffer size. 
We + * enlarge before filling the buffer entirely so as to avoid asking the + * kernel for a partial packet. The magic constant here should be large + * enough for a TCP packet or Unix pipe bufferload. 8K is the usual pipe + * buffer size, so... + */ + if (conn->inBufSize - conn->inEnd < 8192) + { + if (gtmpqCheckInBufferSpace(conn->inEnd + (size_t) 8192, conn)) + { + /* + * We don't insist that the enlarge worked, but we need some room + */ + if (conn->inBufSize - conn->inEnd < 100) + return -1; /* errorMessage already set */ + } + } + + /* OK, try to read some data */ +retry3: + nread = recv(conn->sock, conn->inBuffer + conn->inEnd, + conn->inBufSize - conn->inEnd, 0); + if (nread < 0) + { + if (SOCK_ERRNO == EINTR) + goto retry3; + /* Some systems return EAGAIN/EWOULDBLOCK for no data */ +#ifdef EAGAIN + if (SOCK_ERRNO == EAGAIN) + return someread; +#endif +#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN)) + if (SOCK_ERRNO == EWOULDBLOCK) + return someread; +#endif + /* We might get ECONNRESET here if using TCP and backend died */ +#ifdef ECONNRESET + if (SOCK_ERRNO == ECONNRESET) + goto definitelyFailed; +#endif + printfGTMPQExpBuffer(&conn->errorMessage, + "could not receive data from server:\n"); + return -1; + } + if (nread > 0) + { + conn->inEnd += nread; + + /* + * Hack to deal with the fact that some kernels will only give us back + * 1 packet per recv() call, even if we asked for more and there is + * more available. If it looks like we are reading a long message, + * loop back to recv() again immediately, until we run out of data or + * buffer space. Without this, the block-and-restart behavior of + * libpq's higher levels leads to O(N^2) performance on long messages. + * + * Since we left-justified the data above, conn->inEnd gives the + * amount of data already read in the current message. We consider + * the message "long" once we have acquired 32k ... 
+ */ + if (conn->inEnd > 32768 && + (conn->inBufSize - conn->inEnd) >= 8192) + { + someread = 1; + goto retry3; + } + return 1; + } + + if (someread) + return 1; /* got a zero read after successful tries */ + + /* + * A return value of 0 could mean just that no data is now available, or + * it could mean EOF --- that is, the server has closed the connection. + * Since we have the socket in nonblock mode, the only way to tell the + * difference is to see if select() is saying that the file is ready. + * Grumble. Fortunately, we don't expect this path to be taken much, + * since in normal practice we should not be trying to read data unless + * the file selected for reading already. + * + * In SSL mode it's even worse: SSL_read() could say WANT_READ and then + * data could arrive before we make the gtmpqReadReady() test. So we must + * play dumb and assume there is more data, relying on the SSL layer to + * detect true EOF. + */ + +#ifdef USE_SSL + if (conn->ssl) + return 0; +#endif + + switch (gtmpqReadReady(conn)) + { + case 0: + /* definitely no data available */ + return 0; + case 1: + /* ready for read */ + break; + default: + goto definitelyFailed; + } + + /* + * Still not sure that it's EOF, because some data could have just + * arrived. 
+ */ +retry4: + nread = recv(conn->sock, conn->inBuffer + conn->inEnd, + conn->inBufSize - conn->inEnd, 0); + if (nread < 0) + { + if (SOCK_ERRNO == EINTR) + goto retry4; + /* Some systems return EAGAIN/EWOULDBLOCK for no data */ +#ifdef EAGAIN + if (SOCK_ERRNO == EAGAIN) + return 0; +#endif +#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN)) + if (SOCK_ERRNO == EWOULDBLOCK) + return 0; +#endif + /* We might get ECONNRESET here if using TCP and backend died */ +#ifdef ECONNRESET + if (SOCK_ERRNO == ECONNRESET) + goto definitelyFailed; +#endif + printfGTMPQExpBuffer(&conn->errorMessage, + "could not receive data from server: \n"); + return -1; + } + if (nread > 0) + { + conn->inEnd += nread; + return 1; + } + + /* + * OK, we are getting a zero read even though select() says ready. This + * means the connection has been closed. Cope. + */ +definitelyFailed: + printfGTMPQExpBuffer(&conn->errorMessage, + "server closed the connection unexpectedly\n" + "\tThis probably means the server terminated abnormally\n" + "\tbefore or while processing the request.\n"); + conn->status = CONNECTION_BAD; /* No more connection to backend */ + close(conn->sock); + conn->sock = -1; + + return -1; +} + +/* + * gtmpqSendSome: send data waiting in the output buffer. + * + * len is how much to try to send (typically equal to outCount, but may + * be less). + * + * Return 0 on success, -1 on failure and 1 when not all data could be sent + * because the socket would block and the connection is non-blocking. + */ +static int +gtmpqSendSome(GTM_Conn *conn, int len) +{ + char *ptr = conn->outBuffer; + int remaining = conn->outCount; + int result = 0; + + if (conn->sock < 0) + { + printfGTMPQExpBuffer(&conn->errorMessage, + "connection not open\n"); + return -1; + } + + /* while there's still data to send */ + while (len > 0) + { + int sent; + + sent = send(conn->sock, ptr, len, 0); + + if (sent < 0) + { + /* + * Anything except EAGAIN/EWOULDBLOCK/EINTR is trouble. 
If it's + * EPIPE or ECONNRESET, assume we've lost the backend connection + * permanently. + */ + switch (SOCK_ERRNO) + { +#ifdef EAGAIN + case EAGAIN: + break; +#endif +#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN)) + case EWOULDBLOCK: + break; +#endif + case EINTR: + continue; + + case EPIPE: +#ifdef ECONNRESET + case ECONNRESET: +#endif + printfGTMPQExpBuffer(&conn->errorMessage, + "server closed the connection unexpectedly\n" + "\tThis probably means the server terminated abnormally\n" + "\tbefore or while processing the request.\n"); + + /* + * We used to close the socket here, but that's a bad idea + * since there might be unread data waiting (typically, a + * NOTICE message from the backend telling us it's + * committing hara-kiri...). Leave the socket open until + * gtmpqReadData finds no more data can be read. But abandon + * attempt to send data. + */ + conn->outCount = 0; + return -1; + + default: + printfGTMPQExpBuffer(&conn->errorMessage, + "could not send data to server: \n"); + /* We don't assume it's a fatal error... */ + conn->outCount = 0; + return -1; + } + } + else + { + ptr += sent; + len -= sent; + remaining -= sent; + } + + if (len > 0) + { + /* + * We didn't send it all, wait till we can send more. + * + * If the connection is in non-blocking mode we don't wait, but + * return 1 to indicate that data is still pending. + */ + result = 1; + break; + } + } + + /* shift the remaining contents of the buffer */ + if (remaining > 0) + memmove(conn->outBuffer, ptr, remaining); + conn->outCount = remaining; + + return result; +} + + +/* + * gtmpqFlush: send any data waiting in the output buffer + * + * Return 0 on success, -1 on failure and 1 when not all data could be sent + * because the socket would block and the connection is non-blocking. 
+ */ +int +gtmpqFlush(GTM_Conn *conn) +{ + if (conn->Pfdebug) + fflush(conn->Pfdebug); + + if (conn->outCount > 0) + return gtmpqSendSome(conn, conn->outCount); + + return 0; +} + + +/* + * gtmpqWait: wait until we can read or write the connection socket + * + * JAB: If SSL enabled and used and forRead, buffered bytes short-circuit the + * call to select(). + * + * We also stop waiting and return if the kernel flags an exception condition + * on the socket. The actual error condition will be detected and reported + * when the caller tries to read or write the socket. + */ +int +gtmpqWait(int forRead, int forWrite, GTM_Conn *conn) +{ + return gtmpqWaitTimed(forRead, forWrite, conn, (time_t) -1); +} + +/* + * gtmpqWaitTimed: wait, but not past finish_time. + * + * If finish_time is exceeded then we return failure (EOF). This is like + * the response for a kernel exception because we don't want the caller + * to try to read/write in that case. + * + * finish_time = ((time_t) -1) disables the wait limit. + */ +int +gtmpqWaitTimed(int forRead, int forWrite, GTM_Conn *conn, time_t finish_time) +{ + int result; + + result = gtmpqSocketCheck(conn, forRead, forWrite, finish_time); + + if (result < 0) + return EOF; /* errorMessage is already set */ + + if (result == 0) + { + printfGTMPQExpBuffer(&conn->errorMessage, + "timeout expired\n"); + return EOF; + } + + return 0; +} + +/* + * gtmpqReadReady: is select() saying the file is ready to read? + * Returns -1 on failure, 0 if not ready, 1 if ready. + */ +int +gtmpqReadReady(GTM_Conn *conn) +{ + return gtmpqSocketCheck(conn, 1, 0, (time_t) 0); +} + +/* + * gtmpqWriteReady: is select() saying the file is ready to write? + * Returns -1 on failure, 0 if not ready, 1 if ready. + */ +int +gtmpqWriteReady(GTM_Conn *conn) +{ + return gtmpqSocketCheck(conn, 0, 1, (time_t) 0); +} + +/* + * Checks a socket, using poll or select, for data to be read, written, + * or both. 
Returns >0 if one or more conditions are met, 0 if it timed + * out, -1 if an error occurred. + * + * If SSL is in use, the SSL buffer is checked prior to checking the socket + * for read data directly. + */ +static int +gtmpqSocketCheck(GTM_Conn *conn, int forRead, int forWrite, time_t end_time) +{ + int result; + + if (!conn) + return -1; + if (conn->sock < 0) + { + printfGTMPQExpBuffer(&conn->errorMessage, + "socket not open\n"); + return -1; + } + +#ifdef USE_SSL + /* Check for SSL library buffering read bytes */ + if (forRead && conn->ssl && SSL_pending(conn->ssl) > 0) + { + /* short-circuit the select */ + return 1; + } +#endif + + /* We will retry as long as we get EINTR */ + do + result = gtmpqSocketPoll(conn->sock, forRead, forWrite, end_time); + while (result < 0 && SOCK_ERRNO == EINTR); + + if (result < 0) + printfGTMPQExpBuffer(&conn->errorMessage, + "select() failed: \n"); + + return result; +} + + +/* + * Check a file descriptor for read and/or write data, possibly waiting. + * If neither forRead nor forWrite are set, immediately return a timeout + * condition (without waiting). Return >0 if condition is met, 0 + * if a timeout occurred, -1 if an error or interrupt occurred. + * + * Timeout is infinite if end_time is -1. Timeout is immediate (no blocking) + * if end_time is 0 (or indeed, any time before now). 
+ */ +static int +gtmpqSocketPoll(int sock, int forRead, int forWrite, time_t end_time) +{ + /* We use poll(2) if available, otherwise select(2) */ +#ifdef HAVE_POLL + struct pollfd input_fd; + int timeout_ms; + + if (!forRead && !forWrite) + return 0; + + input_fd.fd = sock; + input_fd.events = POLLERR; + input_fd.revents = 0; + + if (forRead) + input_fd.events |= POLLIN; + if (forWrite) + input_fd.events |= POLLOUT; + + /* Compute appropriate timeout interval */ + if (end_time == ((time_t) -1)) + timeout_ms = -1; + else + { + time_t now = time(NULL); + + if (end_time > now) + timeout_ms = (end_time - now) * 1000; + else + timeout_ms = 0; + } + + return poll(&input_fd, 1, timeout_ms); +#else /* !HAVE_POLL */ + + fd_set input_mask; + fd_set output_mask; + fd_set except_mask; + struct timeval timeout; + struct timeval *ptr_timeout; + + if (!forRead && !forWrite) + return 0; + + FD_ZERO(&input_mask); + FD_ZERO(&output_mask); + FD_ZERO(&except_mask); + if (forRead) + FD_SET(sock, &input_mask); + if (forWrite) + FD_SET(sock, &output_mask); + FD_SET(sock, &except_mask); + + /* Compute appropriate timeout interval */ + if (end_time == ((time_t) -1)) + ptr_timeout = NULL; + else + { + time_t now = time(NULL); + + if (end_time > now) + timeout.tv_sec = end_time - now; + else + timeout.tv_sec = 0; + timeout.tv_usec = 0; + ptr_timeout = &timeout; + } + + return select(sock + 1, &input_mask, &output_mask, + &except_mask, ptr_timeout); +#endif /* HAVE_POLL */ +} diff --git a/src/gtm/client/fe-protocol.c b/src/gtm/client/fe-protocol.c new file mode 100644 index 0000000000..f3960daeaa --- /dev/null +++ b/src/gtm/client/fe-protocol.c @@ -0,0 +1,598 @@ +/*------------------------------------------------------------------------- + * + * fe-protocol3.c + * functions that are specific to frontend/backend protocol version 3 + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions 
Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * + * IDENTIFICATION + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ +#include "gtm/gtm_c.h" + +#include <ctype.h> +#include <fcntl.h> + +#include "gtm/libpq-fe.h" +#include "gtm/libpq-int.h" +#include "gtm/gtm_client.h" + +#include <unistd.h> +#include <netinet/in.h> + + +/* + * This macro lists the backend message types that could be "long" (more + * than a couple of kilobytes). + */ +#define VALID_LONG_MESSAGE_TYPE(id) \ + ((id) == 'S' || (id) == 'E') + +static void handleSyncLoss(GTM_Conn *conn, char id, int msgLength); +static GTM_Result *pqParseInput(GTM_Conn *conn); +static int gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result); +static int gtmpqReadSeqKey(GTM_SequenceKey seqkey, GTM_Conn *conn); + +/* + * parseInput: if appropriate, parse input data from backend + * until input is exhausted or a stopping state is reached. + * Note that this function will NOT attempt to read more data from the backend. + */ +static GTM_Result * +pqParseInput(GTM_Conn *conn) +{ + char id; + int msgLength; + int avail; + GTM_Result *result = NULL; + + if (conn->result == NULL) + { + conn->result = (GTM_Result *) malloc(sizeof (GTM_Result)); + memset(conn->result, 0, sizeof (GTM_Result)); + } + else + gtmpqFreeResultData(conn->result, conn->is_proxy); + + result = conn->result; + + /* + * Try to read a message. First get the type code and length. Return + * if not enough data. + */ + conn->inCursor = conn->inStart; + if (gtmpqGetc(&id, conn)) + return NULL; + if (gtmpqGetInt(&msgLength, 4, conn)) + return NULL; + + /* + * Try to validate message type/length here. A length less than 4 is + * definitely broken. Large lengths should only be believed for a few + * message types. 
+ */ + if (msgLength < 4) + { + handleSyncLoss(conn, id, msgLength); + return NULL; + } + if (msgLength > 30000 && !VALID_LONG_MESSAGE_TYPE(id)) + { + handleSyncLoss(conn, id, msgLength); + return NULL; + } + + /* + * Can't process if message body isn't all here yet. + */ + conn->result->gr_msglen = msgLength -= 4; + avail = conn->inEnd - conn->inCursor; + if (avail < msgLength) + { + /* + * Before returning, enlarge the input buffer if needed to hold + * the whole message. This is better than leaving it to + * gtmpqReadData because we can avoid multiple cycles of realloc() + * when the message is large; also, we can implement a reasonable + * recovery strategy if we are unable to make the buffer big + * enough. + */ + if (gtmpqCheckInBufferSpace(conn->inCursor + (size_t) msgLength, + conn)) + { + /* + * XXX add some better recovery code... plan is to skip over + * the message using its length, then report an error. For the + * moment, just treat this like loss of sync (which indeed it + * might be!) 
+ */ + handleSyncLoss(conn, id, msgLength); + } + return NULL; + } + + switch (id) + { + case 'S': /* command complete */ + if (gtmpqParseSuccess(conn, result)) + return NULL; + break; + + case 'E': /* error return */ + if (gtmpqGetError(conn, result)) + return NULL; + result->gr_status = -1; + break; + default: + printfGTMPQExpBuffer(&conn->errorMessage, + "unexpected response from server; first received character was \"%c\"\n", + id); + conn->inCursor += msgLength; + break; + } /* switch on protocol character */ + /* Successfully consumed this message */ + if (conn->inCursor == conn->inStart + 5 + msgLength) + { + /* Normal case: parsing agrees with specified length */ + conn->inStart = conn->inCursor; + } + else + { + /* Trouble --- report it */ + printfGTMPQExpBuffer(&conn->errorMessage, + "message contents do not agree with length in message type \"%c\"\n", + id); + /* trust the specified message length as what to skip */ + conn->inStart += 5 + msgLength; + } + + return result; +} + +/* + * handleSyncLoss: clean up after loss of message-boundary sync + * + * There isn't really a lot we can do here except abandon the connection. + */ +static void +handleSyncLoss(GTM_Conn *conn, char id, int msgLength) +{ + printfGTMPQExpBuffer(&conn->errorMessage, + "lost synchronization with server: got message type \"%c\", length %d\n", + id, msgLength); + close(conn->sock); + conn->sock = -1; + conn->status = CONNECTION_BAD; /* No more connection to backend */ +} + +/* + * Attempt to read an Error or Notice response message. + * This is possible in several places, so we break it out as a subroutine. + * Entry: 'E' message type and length have already been consumed. + * Exit: returns 0 if successfully consumed message. + * returns EOF if not enough data. + */ +int +gtmpqGetError(GTM_Conn *conn, GTM_Result *result) +{ + char id; + + /* + * If we are a GTM proxy, expect an additional proxy header in the incoming + * message. 
+ */ + if (conn->is_proxy) + { + if (gtmpqGetnchar((char *)&result->gr_proxyhdr, + sizeof (GTM_ProxyMsgHeader), conn)) + return 1; + result->gr_msglen -= sizeof (GTM_ProxyMsgHeader); + + /* + * If the allocated buffer is not large enough to hold the proxied + * data, realloc the buffer. + * + * Since the client side code is shared between the proxy and the + * backend, we don't want any memory context management etc here. So + * just use plain realloc. Anyways, we don't indent to free the memory. + */ + if (result->gr_proxy_datalen < result->gr_msglen) + { + result->gr_proxy_data = (char *)realloc( + result->gr_proxy_data, result->gr_msglen); + result->gr_proxy_datalen = result->gr_msglen; + } + + if (gtmpqGetnchar((char *)result->gr_proxy_data, + result->gr_msglen, conn)) + { + result->gr_status = 1; + return 1; + } + + return 0; + } + else + result->gr_proxyhdr.ph_conid = InvalidGTMProxyConnID; + + /* + * Read the fields and save into res. + */ + for (;;) + { + if (gtmpqGetc(&id, conn)) + goto fail; + if (id == '\0') + break; + if (gtmpqGets(&conn->errorMessage, conn)) + goto fail; + } + return 0; + +fail: + return EOF; +} + +/* + * GTMPQgetResult + * Get the next GTM_Result produced. Returns NULL if no + * query work remains or an error has occurred (e.g. out of + * memory). + */ + +GTM_Result * +GTMPQgetResult(GTM_Conn *conn) +{ + GTM_Result *res; + + if (!conn) + return NULL; + + /* Parse any available data, if our state permits. */ + while ((res = pqParseInput(conn)) == NULL) + { + int flushResult; + + /* + * If data remains unsent, send it. Else we might be waiting for the + * result of a command the backend hasn't even got yet. + */ + while ((flushResult = gtmpqFlush(conn)) > 0) + { + if (gtmpqWait(false, true, conn)) + { + flushResult = -1; + break; + } + } + + /* Wait for some more data, and load it. 
*/ + if (flushResult || + gtmpqWait(true, false, conn) || + gtmpqReadData(conn) < 0) + { + /* + * conn->errorMessage has been set by gtmpqWait or gtmpqReadData. + */ + return NULL; + } + } + + return res; +} + +static int +gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result) +{ + int xcnt, xsize; + GlobalTransactionId *xip = NULL; + + result->gr_status = 0; + + if (gtmpqGetInt((int *)&result->gr_type, 4, conn)) + return 1; + result->gr_msglen -= 4; + + if (conn->is_proxy) + { + if (gtmpqGetnchar((char *)&result->gr_proxyhdr, + sizeof (GTM_ProxyMsgHeader), conn)) + return 1; + result->gr_msglen -= sizeof (GTM_ProxyMsgHeader); + } + else + result->gr_proxyhdr.ph_conid = InvalidGTMProxyConnID; + + /* + * If we are dealing with a proxied message, just read the remaining binary + * data which can then be forwarded to the right backend. + */ + if (result->gr_proxyhdr.ph_conid != InvalidGTMProxyConnID) + { + /* + * If the allocated buffer is not large enough to hold the proxied + * data, realloc the buffer. + * + * Since the client side code is shared between the proxy and the + * backend, we don't want any memory context management etc here. So + * just use plain realloc. Anyways, we don't indent to free the memory. 
+ */ + if (result->gr_proxy_datalen < result->gr_msglen) + { + result->gr_proxy_data = (char *)realloc( + result->gr_proxy_data, result->gr_msglen); + result->gr_proxy_datalen = result->gr_msglen; + } + + if (gtmpqGetnchar((char *)result->gr_proxy_data, + result->gr_msglen, conn)) + { + result->gr_status = 1; + return 1; + } + + return result->gr_status; + } + + result->gr_status = 0; + + switch (result->gr_type) + { + case TXN_BEGIN_RESULT: + if (gtmpqGetnchar((char *)&result->gr_resdata.grd_txnhandle, + sizeof (GTM_TransactionHandle), conn)) + result->gr_status = -1; + break; + + case TXN_BEGIN_GETGXID_RESULT: + case TXN_BEGIN_GETGXID_AUTOVACUUM_RESULT: + case TXN_PREPARE_RESULT: + if (gtmpqGetnchar((char *)&result->gr_resdata.grd_gxid, + sizeof (GlobalTransactionId), conn)) + result->gr_status = -1; + break; + + case TXN_COMMIT_RESULT: + case TXN_ROLLBACK_RESULT: + if (gtmpqGetnchar((char *)&result->gr_resdata.grd_gxid, + sizeof (GlobalTransactionId), conn)) + { + result->gr_status = -1; + break; + } + break; + + case TXN_GET_GXID_RESULT: + if (gtmpqGetnchar((char *)&result->gr_resdata.grd_txn.txnhandle, + sizeof (GTM_TransactionHandle), conn)) + { + result->gr_status = -1; + break; + } + + if (gtmpqGetnchar((char *)&result->gr_resdata.grd_txn.gxid, + sizeof (GlobalTransactionId), conn)) + result->gr_status = -1; + break; + + case TXN_BEGIN_GETGXID_MULTI_RESULT: + if (gtmpqGetnchar((char *)&result->gr_resdata.grd_txn_get_multi.txn_count, + sizeof (int), conn)) + { + result->gr_status = -1; + break; + } + if (gtmpqGetnchar((char *)&result->gr_resdata.grd_txn_get_multi.start_gxid, + sizeof (GlobalTransactionId), conn)) + { + result->gr_status = -1; + break; + } + break; + + + case TXN_COMMIT_MULTI_RESULT: + case TXN_ROLLBACK_MULTI_RESULT: + if (gtmpqGetnchar((char *)&result->gr_resdata.grd_txn_rc_multi.txn_count, + sizeof (int), conn)) + { + result->gr_status = -1; + break; + } + if (gtmpqGetnchar((char *)result->gr_resdata.grd_txn_rc_multi.status, + sizeof (int) 
* result->gr_resdata.grd_txn_rc_multi.txn_count, conn)) + { + result->gr_status = -1; + break; + } + break; + + case SNAPSHOT_GXID_GET_RESULT: + if (gtmpqGetnchar((char *)&result->gr_resdata.grd_txn.txnhandle, + sizeof (GTM_TransactionHandle), conn)) + { + result->gr_status = -1; + break; + } + /* Fall through */ + case SNAPSHOT_GET_RESULT: + if (gtmpqGetnchar((char *)&result->gr_resdata.grd_txn.gxid, + sizeof (GlobalTransactionId), conn)) + { + result->gr_status = -1; + break; + } + /* Fall through */ + case SNAPSHOT_GET_MULTI_RESULT: + if (gtmpqGetnchar((char *)&result->gr_resdata.grd_txn_snap_multi.txn_count, + sizeof (int), conn)) + { + result->gr_status = -1; + break; + } + if (gtmpqGetnchar((char *)result->gr_resdata.grd_txn_snap_multi.status, + sizeof (int) * result->gr_resdata.grd_txn_snap_multi.txn_count, conn)) + { + result->gr_status = -1; + break; + } + + if (gtmpqGetnchar((char *)&result->gr_snapshot.sn_xmin, + sizeof (GlobalTransactionId), conn)) + { + result->gr_status = -1; + break; + } + + if (gtmpqGetnchar((char *)&result->gr_snapshot.sn_xmax, + sizeof (GlobalTransactionId), conn)) + { + result->gr_status = -1; + break; + } + + if (gtmpqGetnchar((char *)&result->gr_snapshot.sn_recent_global_xmin, + sizeof (GlobalTransactionId), conn)) + { + result->gr_status = -1; + break; + } + + + if (gtmpqGetInt(&result->gr_snapshot.sn_xcnt, + sizeof (GlobalTransactionId), conn)) + { + result->gr_status = -1; + break; + } + + xsize = result->gr_xip_size; + xcnt = result->gr_snapshot.sn_xcnt; + xip = result->gr_snapshot.sn_xip; + + if ((xip == NULL) || (xcnt > xsize)) + { + xip = (GlobalTransactionId *) realloc(xip, sizeof (GlobalTransactionId) * xcnt); + result->gr_snapshot.sn_xip = xip; + result->gr_xip_size = xcnt; + } + + if (gtmpqGetnchar((char *)xip, sizeof (GlobalTransactionId) * xcnt, conn)) + { + result->gr_status = -1; + break; + } + + break; + + case SEQUENCE_INIT_RESULT: + case SEQUENCE_RESET_RESULT: + case SEQUENCE_CLOSE_RESULT: + if 
(gtmpqReadSeqKey(&result->gr_resdata.grd_seqkey, conn))
+				result->gr_status = -1;
+			break;
+
+		case SEQUENCE_GET_CURRENT_RESULT:
+		case SEQUENCE_GET_NEXT_RESULT:
+			if (gtmpqReadSeqKey(&result->gr_resdata.grd_seq.seqkey, conn))
+			{
+				result->gr_status = -1;
+				break;
+			}
+			if (gtmpqGetnchar((char *)&result->gr_resdata.grd_seq.seqval,
+						sizeof (GTM_Sequence), conn))
+				result->gr_status = -1;
+			break;
+
+		case TXN_GET_STATUS_RESULT:
+			break;
+
+		case TXN_GET_ALL_PREPARED_RESULT:
+			break;
+
+		default:
+			printfGTMPQExpBuffer(&conn->errorMessage,
+							  "unexpected result type from server; result typr was \"%d\"\n",
+							  result->gr_type);
+			result->gr_status = -1;
+			break;
+	}
+
+	return (result->gr_status);
+}
+
+/*
+ * gtmpqReadSeqKey
+ *
+ * Read a length-prefixed sequence key from the connection into *seqkey.
+ * The key bytes are stored in a freshly malloc'd buffer (seqkey->gsk_key).
+ *
+ * Returns 0 on success, EINVAL if the length or the key bytes cannot be
+ * read or the length is out of range, ENOMEM if the buffer cannot be
+ * allocated.  On any failure after the allocation the buffer is released
+ * and gsk_key is left NULL, so the caller never owns a half-filled key.
+ */
+static int
+gtmpqReadSeqKey(GTM_SequenceKey seqkey, GTM_Conn *conn)
+{
+	/*
+	 * Read keylength
+	 */
+	if (gtmpqGetInt(&seqkey->gsk_keylen, 4, conn))
+		return EINVAL;
+
+	/*
+	 * Do some sanity checks on the keylength
+	 */
+	if (seqkey->gsk_keylen <= 0 || seqkey->gsk_keylen > GTM_MAX_SEQKEY_LENGTH)
+		return EINVAL;
+
+	if ((seqkey->gsk_key = (char *) malloc(seqkey->gsk_keylen)) == NULL)
+		return ENOMEM;
+
+	if (gtmpqGetnchar(seqkey->gsk_key, seqkey->gsk_keylen, conn))
+	{
+		/* Don't leak the buffer (or hand back a partial key) on failure */
+		free(seqkey->gsk_key);
+		seqkey->gsk_key = NULL;
+		return EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * gtmpqFreeResultData
+ *
+ * Release the per-message heap data hanging off *result, selected by
+ * result->gr_type.  Only the sequence results own such data (the key
+ * buffer allocated by gtmpqReadSeqKey).  The GTM_Result itself is not
+ * freed here.
+ */
+void
+gtmpqFreeResultData(GTM_Result *result, bool is_proxy)
+{
+	/*
+	 * If we are running as a GTM proxy, we don't have anything to do. This may
+	 * change though as we add more message types below and some of them may
+	 * need cleanup even at the proxy level
+	 */
+	if (is_proxy)
+		return;
+
+	switch (result->gr_type)
+	{
+		case SEQUENCE_INIT_RESULT:
+		case SEQUENCE_RESET_RESULT:
+		case SEQUENCE_CLOSE_RESULT:
+			if (result->gr_resdata.grd_seqkey.gsk_key != NULL)
+				free(result->gr_resdata.grd_seqkey.gsk_key);
+			result->gr_resdata.grd_seqkey.gsk_key = NULL;
+			break;
+
+		case SEQUENCE_GET_CURRENT_RESULT:
+		case SEQUENCE_GET_NEXT_RESULT:
+			if (result->gr_resdata.grd_seq.seqkey.gsk_key != NULL)
+				free(result->gr_resdata.grd_seq.seqkey.gsk_key);
+			/*
+			 * Reset the member that was just freed.  The previous code
+			 * cleared grd_seqkey.gsk_key instead, which is only equivalent
+			 * if the two members share storage (gr_resdata's declaration is
+			 * not visible here -- TODO confirm).
+			 */
+			result->gr_resdata.grd_seq.seqkey.gsk_key = NULL;
+			break;
+
+		case TXN_GET_STATUS_RESULT:
+			break;
+
+		case TXN_GET_ALL_PREPARED_RESULT:
+			break;
+
+		case SNAPSHOT_GET_RESULT:
+		case SNAPSHOT_GXID_GET_RESULT:
+			/*
+			 * Lets not free the xip array in the snapshot since we may need it
+			 * again shortly
+			 */
+			break;
+
+		default:
+			break;
+	}
+}
diff --git a/src/gtm/client/gtm_client.c b/src/gtm/client/gtm_client.c
new file mode 100644
index 0000000000..6b22a81c53
--- /dev/null
+++ b/src/gtm/client/gtm_client.c
@@ -0,0 +1,515 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm-client.c
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+/* Time in seconds to wait for a response from GTM */
+/* We should consider making this a GUC */
+#define CLIENT_GTM_TIMEOUT 20
+
+#include <time.h>
+
+#include "gtm/gtm_c.h"
+
+#include "gtm/libpq-fe.h"
+#include "gtm/libpq-int.h"
+
+#include "gtm/gtm_client.h"
+#include "gtm/gtm_msg.h"
+#include "gtm/assert.h"
+
+void GTM_FreeResult(GTM_Result *result, bool is_proxy);
+
+/*
+ * Connection
Management API + */ +GTM_Conn * +connect_gtm(const char *connect_string) +{ + return PQconnectGTM(connect_string); +} + +void +disconnect_gtm(GTM_Conn *conn) +{ + GTMPQfinish(conn); +} + +/* + * Transaction Management API + */ +GlobalTransactionId +begin_transaction(GTM_Conn *conn, GTM_IsolationLevel isolevel) +{ + bool txn_read_only = false; + GTM_Result *res = NULL; + time_t finish_time; + + /* Start the message. */ + if (gtmpqPutMsgStart('C', true, conn) || + gtmpqPutInt(MSG_TXN_BEGIN_GETGXID, sizeof (GTM_MessageType), conn) || + gtmpqPutInt(isolevel, sizeof (GTM_IsolationLevel), conn) || + gtmpqPutc(txn_read_only, conn)) + goto send_failed; + + /* Finish the message. */ + if (gtmpqPutMsgEnd(conn)) + goto send_failed; + + /* Flush to ensure backend gets it. */ + if (gtmpqFlush(conn)) + goto send_failed; + + finish_time = time(NULL) + CLIENT_GTM_TIMEOUT; + if (gtmpqWaitTimed(true, false, conn, finish_time) || + gtmpqReadData(conn) < 0) + goto receive_failed; + + if ((res = GTMPQgetResult(conn)) == NULL) + goto receive_failed; + + if (res->gr_status == 0) + return res->gr_resdata.grd_gxid; + else + return InvalidGlobalTransactionId; + +receive_failed: +send_failed: + return InvalidGlobalTransactionId; +} + +/* + * Transaction Management API + * Begin a transaction for an autovacuum worker process + */ +GlobalTransactionId +begin_transaction_autovacuum(GTM_Conn *conn, GTM_IsolationLevel isolevel) +{ + bool txn_read_only = false; + GTM_Result *res = NULL; + time_t finish_time; + + /* Start the message. */ + if (gtmpqPutMsgStart('C', true, conn) || + gtmpqPutInt(MSG_TXN_BEGIN_GETGXID_AUTOVACUUM, sizeof (GTM_MessageType), conn) || + gtmpqPutInt(isolevel, sizeof (GTM_IsolationLevel), conn) || + gtmpqPutc(txn_read_only, conn)) + goto send_failed; + + /* Finish the message. */ + if (gtmpqPutMsgEnd(conn)) + goto send_failed; + + /* Flush to ensure backend gets it. 
*/ + if (gtmpqFlush(conn)) + goto send_failed; + + finish_time = time(NULL) + CLIENT_GTM_TIMEOUT; + if (gtmpqWaitTimed(true, false, conn, finish_time) || + gtmpqReadData(conn) < 0) + goto receive_failed; + + if ((res = GTMPQgetResult(conn)) == NULL) + goto receive_failed; + + if (res->gr_status == 0) + return res->gr_resdata.grd_gxid; + else + return InvalidGlobalTransactionId; + +receive_failed: +send_failed: + return InvalidGlobalTransactionId; +} +int +commit_transaction(GTM_Conn *conn, GlobalTransactionId gxid) +{ + GTM_Result *res = NULL; + time_t finish_time; + + /* Start the message. */ + if (gtmpqPutMsgStart('C', true, conn) || + gtmpqPutInt(MSG_TXN_COMMIT, sizeof (GTM_MessageType), conn) || + gtmpqPutc(true, conn) || + gtmpqPutnchar((char *)&gxid, sizeof (GlobalTransactionId), conn)) + goto send_failed; + + /* Finish the message. */ + if (gtmpqPutMsgEnd(conn)) + goto send_failed; + + /* Flush to ensure backend gets it. */ + if (gtmpqFlush(conn)) + goto send_failed; + + finish_time = time(NULL) + CLIENT_GTM_TIMEOUT; + if (gtmpqWaitTimed(true, false, conn, finish_time) || + gtmpqReadData(conn) < 0) + goto receive_failed; + + if ((res = GTMPQgetResult(conn)) == NULL) + goto receive_failed; + + if (res->gr_status == 0) + { + Assert(res->gr_type == TXN_COMMIT_RESULT); + Assert(res->gr_resdata.grd_gxid == gxid); + } + + return res->gr_status; + +receive_failed: +send_failed: + return -1; + +} + +int +abort_transaction(GTM_Conn *conn, GlobalTransactionId gxid) +{ + GTM_Result *res = NULL; + time_t finish_time; + + /* Start the message. */ + if (gtmpqPutMsgStart('C', true, conn) || + gtmpqPutInt(MSG_TXN_ROLLBACK, sizeof (GTM_MessageType), conn) || + gtmpqPutc(true, conn) || + gtmpqPutnchar((char *)&gxid, sizeof (GlobalTransactionId), conn)) + goto send_failed; + + /* Finish the message. */ + if (gtmpqPutMsgEnd(conn)) + goto send_failed; + + /* Flush to ensure backend gets it. 
*/ + if (gtmpqFlush(conn)) + goto send_failed; + + finish_time = time(NULL) + CLIENT_GTM_TIMEOUT; + if (gtmpqWaitTimed(true, false, conn, finish_time) || + gtmpqReadData(conn) < 0) + goto receive_failed; + + if ((res = GTMPQgetResult(conn)) == NULL) + goto receive_failed; + + if (res->gr_status == 0) + { + Assert(res->gr_type == TXN_ROLLBACK_RESULT); + Assert(res->gr_resdata.grd_gxid == gxid); + } + + return res->gr_status; + +receive_failed: +send_failed: + return -1; + +} + +int +prepare_transaction(GTM_Conn *conn, GlobalTransactionId gxid, + int nodecnt, PGXC_NodeId nodes[]) +{ + GTM_Result *res = NULL; + time_t finish_time; + + /* Start the message. */ + if (gtmpqPutMsgStart('C', true, conn) || + gtmpqPutInt(MSG_TXN_PREPARE, sizeof (GTM_MessageType), conn) || + gtmpqPutc(true, conn) || + gtmpqPutnchar((char *)&gxid, sizeof (GlobalTransactionId), conn) || + gtmpqPutInt(nodecnt, sizeof (int), conn) || + gtmpqPutnchar((char *)nodes, sizeof (PGXC_NodeId) * nodecnt, conn)) + goto send_failed; + + /* Finish the message. */ + if (gtmpqPutMsgEnd(conn)) + goto send_failed; + + /* Flush to ensure backend gets it. */ + if (gtmpqFlush(conn)) + goto send_failed; + + finish_time = time(NULL) + CLIENT_GTM_TIMEOUT; + if (gtmpqWaitTimed(true, false, conn, finish_time) || + gtmpqReadData(conn) < 0) + goto receive_failed; + + if ((res = GTMPQgetResult(conn)) == NULL) + goto receive_failed; + + if (res->gr_status == 0) + { + Assert(res->gr_type == TXN_PREPARE_RESULT); + Assert(res->gr_resdata.grd_gxid == gxid); + } + + return res->gr_status; + +receive_failed: +send_failed: + return -1; +} + +/* + * Snapshot Management API + */ +GTM_SnapshotData * +get_snapshot(GTM_Conn *conn, GlobalTransactionId gxid, bool canbe_grouped) +{ + GTM_Result *res = NULL; + time_t finish_time; + + /* Start the message. 
*/ + if (gtmpqPutMsgStart('C', true, conn) || + gtmpqPutInt(MSG_SNAPSHOT_GET, sizeof (GTM_MessageType), conn) || + gtmpqPutc(canbe_grouped, conn) || + gtmpqPutc(true, conn) || + gtmpqPutnchar((char *)&gxid, sizeof (GlobalTransactionId), conn)) + goto send_failed; + + /* Finish the message. */ + if (gtmpqPutMsgEnd(conn)) + goto send_failed; + + /* Flush to ensure backend gets it. */ + if (gtmpqFlush(conn)) + goto send_failed; + + finish_time = time(NULL) + CLIENT_GTM_TIMEOUT; + if (gtmpqWaitTimed(true, false, conn, finish_time) || + gtmpqReadData(conn) < 0) + goto receive_failed; + + if ((res = GTMPQgetResult(conn)) == NULL) + goto receive_failed; + + if (res->gr_status == 0) + { + Assert(res->gr_type == SNAPSHOT_GET_RESULT); + Assert(res->gr_resdata.grd_txn.gxid == gxid); + return &(res->gr_snapshot); + } + else + return NULL; + + +receive_failed: +send_failed: + return NULL; +} + +/* + * Sequence Management API + */ +int +open_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence increment, + GTM_Sequence minval, GTM_Sequence maxval, + GTM_Sequence startval, bool cycle) +{ + GTM_Result *res = NULL; + time_t finish_time; + + /* Start the message. */ + if (gtmpqPutMsgStart('C', true, conn) || + gtmpqPutInt(MSG_SEQUENCE_INIT, sizeof (GTM_MessageType), conn) || + gtmpqPutInt(key->gsk_keylen, 4, conn) || + gtmpqPutnchar(key->gsk_key, key->gsk_keylen, conn) || + gtmpqPutnchar((char *)&increment, sizeof (GTM_Sequence), conn) || + gtmpqPutnchar((char *)&minval, sizeof (GTM_Sequence), conn) || + gtmpqPutnchar((char *)&maxval, sizeof (GTM_Sequence), conn) || + gtmpqPutnchar((char *)&startval, sizeof (GTM_Sequence), conn) || + gtmpqPutc(cycle, conn)) + goto send_failed; + + /* Finish the message. */ + if (gtmpqPutMsgEnd(conn)) + goto send_failed; + + /* Flush to ensure backend gets it. 
*/ + if (gtmpqFlush(conn)) + goto send_failed; + + finish_time = time(NULL) + CLIENT_GTM_TIMEOUT; + if (gtmpqWaitTimed(true, false, conn, finish_time) || + gtmpqReadData(conn) < 0) + goto receive_failed; + + if ((res = GTMPQgetResult(conn)) == NULL) + goto receive_failed; + + return res->gr_status; + +receive_failed: +send_failed: + return -1; +} + +int +close_sequence(GTM_Conn *conn, GTM_SequenceKey key) +{ + GTM_Result *res = NULL; + time_t finish_time; + + /* Start the message. */ + if (gtmpqPutMsgStart('C', true, conn) || + gtmpqPutInt(MSG_SEQUENCE_CLOSE, sizeof (GTM_MessageType), conn) || + gtmpqPutInt(key->gsk_keylen, 4, conn) || + gtmpqPutnchar(key->gsk_key, key->gsk_keylen, conn)) + goto send_failed; + + /* Finish the message. */ + if (gtmpqPutMsgEnd(conn)) + goto send_failed; + + /* Flush to ensure backend gets it. */ + if (gtmpqFlush(conn)) + goto send_failed; + + finish_time = time(NULL) + CLIENT_GTM_TIMEOUT; + if (gtmpqWaitTimed(true, false, conn, finish_time) || + gtmpqReadData(conn) < 0) + goto receive_failed; + + if ((res = GTMPQgetResult(conn)) == NULL) + goto receive_failed; + + return res->gr_status; + +receive_failed: +send_failed: + return -1; +} + +GTM_Sequence +get_current(GTM_Conn *conn, GTM_SequenceKey key) +{ + GTM_Result *res = NULL; + time_t finish_time; + + /* Start the message. */ + if (gtmpqPutMsgStart('C', true, conn) || + gtmpqPutInt(MSG_SEQUENCE_GET_CURRENT, sizeof (GTM_MessageType), conn) || + gtmpqPutInt(key->gsk_keylen, 4, conn) || + gtmpqPutnchar(key->gsk_key, key->gsk_keylen, conn)) + goto send_failed; + + /* Finish the message. */ + if (gtmpqPutMsgEnd(conn)) + goto send_failed; + + /* Flush to ensure backend gets it. 
*/ + if (gtmpqFlush(conn)) + goto send_failed; + + finish_time = time(NULL) + CLIENT_GTM_TIMEOUT; + if (gtmpqWaitTimed(true, false, conn, finish_time) || + gtmpqReadData(conn) < 0) + goto receive_failed; + + if ((res = GTMPQgetResult(conn)) == NULL) + goto receive_failed; + + if (res->gr_status == 0) + return res->gr_resdata.grd_seq.seqval; + else + return InvalidSequenceValue; + +receive_failed: +send_failed: + return -1; +} + +GTM_Sequence +get_next(GTM_Conn *conn, GTM_SequenceKey key) +{ + GTM_Result *res = NULL; + time_t finish_time; + + /* Start the message. */ + if (gtmpqPutMsgStart('C', true, conn) || + gtmpqPutInt(MSG_SEQUENCE_GET_NEXT, sizeof (GTM_MessageType), conn) || + gtmpqPutInt(key->gsk_keylen, 4, conn) || + gtmpqPutnchar(key->gsk_key, key->gsk_keylen, conn)) + goto send_failed; + + /* Finish the message. */ + if (gtmpqPutMsgEnd(conn)) + goto send_failed; + + /* Flush to ensure backend gets it. */ + if (gtmpqFlush(conn)) + goto send_failed; + + finish_time = time(NULL) + CLIENT_GTM_TIMEOUT; + if (gtmpqWaitTimed(true, false, conn, finish_time) || + gtmpqReadData(conn) < 0) + goto receive_failed; + + if ((res = GTMPQgetResult(conn)) == NULL) + goto receive_failed; + + if (res->gr_status == 0) + return res->gr_resdata.grd_seq.seqval; + else + return InvalidSequenceValue; + +receive_failed: +send_failed: + return -1; +} + +int +reset_sequence(GTM_Conn *conn, GTM_SequenceKey key) +{ + GTM_Result *res = NULL; + time_t finish_time; + + /* Start the message. */ + if (gtmpqPutMsgStart('C', true, conn) || + gtmpqPutInt(MSG_SEQUENCE_RESET, sizeof (GTM_MessageType), conn) || + gtmpqPutInt(key->gsk_keylen, 4, conn) || + gtmpqPutnchar(key->gsk_key, key->gsk_keylen, conn)) + goto send_failed; + + /* Finish the message. */ + if (gtmpqPutMsgEnd(conn)) + goto send_failed; + + /* Flush to ensure backend gets it. 
*/ + if (gtmpqFlush(conn)) + goto send_failed; + + finish_time = time(NULL) + CLIENT_GTM_TIMEOUT; + if (gtmpqWaitTimed(true, false, conn, finish_time) || + gtmpqReadData(conn) < 0) + goto receive_failed; + + if ((res = GTMPQgetResult(conn)) == NULL) + goto receive_failed; + + return res->gr_status; + +receive_failed: +send_failed: + return -1; +} + +void +GTM_FreeResult(GTM_Result *result, bool is_proxy) +{ + if (result == NULL) + return; + gtmpqFreeResultData(result, is_proxy); + free(result); +} diff --git a/src/gtm/client/ip.c b/src/gtm/client/ip.c new file mode 100644 index 0000000000..b210e201c5 --- /dev/null +++ b/src/gtm/client/ip.c @@ -0,0 +1,324 @@ +/*------------------------------------------------------------------------- + * + * ip.c + * IPv6-aware network access. + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/libpq/ip.c,v 1.43 2009/01/01 17:23:42 momjian Exp $ + * + * This file and the IPV6 implementation were initially provided by + * Nigel Kukard <[email protected]>, Linux Based Systems Design + * http://www.lbsd.net. 
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/* This is intended to be used in both frontend and backend, so use c.h */
+#include "gtm/gtm_c.h"
+
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#ifdef HAVE_NETINET_TCP_H
+#include <netinet/tcp.h>
+#endif
+#include <arpa/inet.h>
+#include <sys/file.h>
+
+#include "gtm/gtm_ip.h"
+
+
+static int range_sockaddr_AF_INET(const struct sockaddr_in * addr,
+					   const struct sockaddr_in * netaddr,
+					   const struct sockaddr_in * netmask);
+
+#ifdef HAVE_IPV6
+static int range_sockaddr_AF_INET6(const struct sockaddr_in6 * addr,
+						const struct sockaddr_in6 * netaddr,
+						const struct sockaddr_in6 * netmask);
+#endif
+
+
+/*
+ *	gtm_getaddrinfo_all - get address info for Unix, IPv4 and IPv6 sockets
+ *
+ * Thin wrapper around getaddrinfo(): *result is set to NULL before the
+ * call, and a NULL or empty hostname is passed on as NULL.  The return
+ * value is getaddrinfo()'s return code.
+ *
+ * NOTE(review): this copy has no separate AF_UNIX code path; every request
+ * goes straight to getaddrinfo().
+ */
+int
+gtm_getaddrinfo_all(const char *hostname, const char *servname,
+				   const struct addrinfo * hintp, struct addrinfo ** result)
+{
+	int			rc;
+
+	/* not all versions of getaddrinfo() zero *result on failure */
+	*result = NULL;
+
+	/* NULL has special meaning to getaddrinfo(). */
+	rc = getaddrinfo((!hostname || hostname[0] == '\0') ? NULL : hostname,
+					 servname, hintp, result);
+
+	return rc;
+}
+
+
+/*
+ *	gtm_freeaddrinfo_all - free addrinfo structures for IPv4, IPv6, or Unix
+ *
+ * Note: the ai_family field of the original hint structure must be passed
+ * so that we can tell whether the addrinfo struct was built by the system's
+ * getaddrinfo() routine or our own getaddrinfo_unix() routine.  Some versions
+ * of getaddrinfo() might be willing to return AF_UNIX addresses, so it's
+ * not safe to look at ai_family in the addrinfo itself.
+ *
+ * NOTE(review): in this file hint_ai_family is never referenced and no
+ * getaddrinfo_unix() exists; a non-NULL ai is always released with
+ * freeaddrinfo().  The paragraph above appears to be inherited from the
+ * PostgreSQL original -- confirm before relying on it.
+ */
+void
+gtm_freeaddrinfo_all(int hint_ai_family, struct addrinfo * ai)
+{
+	{
+		/* struct was built by getaddrinfo() */
+		if (ai != NULL)
+			freeaddrinfo(ai);
+	}
+}
+
+
+/*
+ *	gtm_getnameinfo_all - get name info for Unix, IPv4 and IPv6 sockets
+ *
+ * The API of this routine differs from the standard getnameinfo() definition
+ * in two ways: first, the addr parameter is declared as sockaddr_storage
+ * rather than struct sockaddr, and second, the node and service fields are
+ * guaranteed to be filled with something even on failure return.
+ *
+ * Returns getnameinfo()'s return code.  When that is non-zero, each of
+ * node and service that is non-NULL is overwritten with "???".
+ */
+int
+gtm_getnameinfo_all(const struct sockaddr_storage * addr, int salen,
+				   char *node, int nodelen,
+				   char *service, int servicelen,
+				   int flags)
+{
+	int			rc;
+
+	rc = getnameinfo((const struct sockaddr *) addr, salen,
+					 node, nodelen,
+					 service, servicelen,
+					 flags);
+
+	/* Lookup failed: substitute a placeholder in every buffer we were given */
+	if (rc != 0)
+	{
+		if (node)
+			strlcpy(node, "???", nodelen);
+		if (service)
+			strlcpy(service, "???", servicelen);
+	}
+
+	return rc;
+}
+
+/*
+ * gtm_range_sockaddr - is addr within the subnet specified by netaddr/netmask ?
+ *
+ * Note: caller must already have verified that all three addresses are
+ * in the same address family; and AF_UNIX addresses are not supported.
+ */
+int
+gtm_range_sockaddr(const struct sockaddr_storage * addr,
+				  const struct sockaddr_storage * netaddr,
+				  const struct sockaddr_storage * netmask)
+{
+	/*
+	 * Dispatch on addr's family only; netaddr and netmask are cast to the
+	 * same type without being checked.  Returns 1 for a match, 0 for no
+	 * match or for a family that is not handled here.
+	 */
+	if (addr->ss_family == AF_INET)
+		return range_sockaddr_AF_INET((struct sockaddr_in *) addr,
+									  (struct sockaddr_in *) netaddr,
+									  (struct sockaddr_in *) netmask);
+#ifdef HAVE_IPV6
+	else if (addr->ss_family == AF_INET6)
+		return range_sockaddr_AF_INET6((struct sockaddr_in6 *) addr,
+									   (struct sockaddr_in6 *) netaddr,
+									   (struct sockaddr_in6 *) netmask);
+#endif
+	else
+		return 0;
+}
+
+/*
+ * IPv4 case: returns 1 when addr and netaddr agree on every bit that is
+ * set in netmask, else 0.
+ */
+static int
+range_sockaddr_AF_INET(const struct sockaddr_in * addr,
+					   const struct sockaddr_in * netaddr,
+					   const struct sockaddr_in * netmask)
+{
+	/* XOR leaves the differing bits; the mask keeps only the ones that count */
+	if (((addr->sin_addr.s_addr ^ netaddr->sin_addr.s_addr) &
+		 netmask->sin_addr.s_addr) == 0)
+		return 1;
+	else
+		return 0;
+}
+
+
+#ifdef HAVE_IPV6
+
+/*
+ * IPv6 case: the same masked comparison, applied to each of the 16
+ * address bytes in turn.  Returns 0 at the first mismatch, else 1.
+ */
+static int
+range_sockaddr_AF_INET6(const struct sockaddr_in6 * addr,
+						const struct sockaddr_in6 * netaddr,
+						const struct sockaddr_in6 * netmask)
+{
+	int			i;
+
+	for (i = 0; i < 16; i++)
+	{
+		if (((addr->sin6_addr.s6_addr[i] ^ netaddr->sin6_addr.s6_addr[i]) &
+			 netmask->sin6_addr.s6_addr[i]) != 0)
+			return 0;
+	}
+
+	return 1;
+}
+#endif   /* HAVE_IPV6 */
+
+/*
+ *	gtm_sockaddr_cidr_mask - make a network mask of the appropriate family
+ *	  and required number of significant bits
+ *
+ * The resulting mask is placed in *mask, which had better be big enough.
+ *
+ * Return value is 0 if okay, -1 if not.
+ */
+int
+gtm_sockaddr_cidr_mask(struct sockaddr_storage * mask, char *numbits, int family)
+{
+	long		bits;
+	char	   *endptr;
+
+	/* A missing prefix length is an error, not a crash */
+	if (numbits == NULL)
+		return -1;
+
+	bits = strtol(numbits, &endptr, 10);
+
+	/* Reject an empty string and any trailing non-digit characters */
+	if (*numbits == '\0' || *endptr != '\0')
+		return -1;
+
+	switch (family)
+	{
+		case AF_INET:
+			{
+				struct sockaddr_in mask4;
+				long		maskl;
+
+				if (bits < 0 || bits > 32)
+					return -1;
+				/*
+				 * Zero the whole struct first: it is copied wholesale into
+				 * *mask below, so every field other than sin_addr would
+				 * otherwise carry indeterminate stack contents.
+				 */
+				memset(&mask4, 0, sizeof(mask4));
+				/* avoid "x << 32", which is not portable */
+				if (bits > 0)
+					maskl = (0xffffffffUL << (32 - (int) bits))
+						& 0xffffffffUL;
+				else
+					maskl = 0;
+				mask4.sin_addr.s_addr = htonl(maskl);
+				memcpy(mask, &mask4, sizeof(mask4));
+				break;
+			}
+
+#ifdef HAVE_IPV6
+		case AF_INET6:
+			{
+				struct sockaddr_in6 mask6;
+				int			i;
+
+				if (bits < 0 || bits > 128)
+					return -1;
+				/* Same reasoning as for mask4 above */
+				memset(&mask6, 0, sizeof(mask6));
+				/* Fill whole 0xff bytes, then one partial byte, then zeroes */
+				for (i = 0; i < 16; i++)
+				{
+					if (bits <= 0)
+						mask6.sin6_addr.s6_addr[i] = 0;
+					else if (bits >= 8)
+						mask6.sin6_addr.s6_addr[i] = 0xff;
+					else
+					{
+						mask6.sin6_addr.s6_addr[i] =
+							(0xff << (8 - (int) bits)) & 0xff;
+					}
+					bits -= 8;
+				}
+				memcpy(mask, &mask6, sizeof(mask6));
+				break;
+			}
+#endif
+		default:
+			return -1;
+	}
+
+	/* Set last, so the memcpy above cannot overwrite it */
+	mask->ss_family = family;
+	return 0;
+}
+
+
+#ifdef HAVE_IPV6
+
+/*
+ * gtm_promote_v4_to_v6_addr --- convert an AF_INET addr to AF_INET6, using
+ *		the standard convention for IPv4 addresses mapped into IPv6 world
+ *
+ * The passed addr is modified in place; be sure it is large enough to
+ * hold the result!  Note that we only worry about setting the fields
+ * that gtm_range_sockaddr will look at.
+ */
+void
+gtm_promote_v4_to_v6_addr(struct sockaddr_storage * addr)
+{
+	struct sockaddr_in addr4;
+	struct sockaddr_in6 addr6;
+	uint32		ip4addr;
+
+	/* Take a copy of the IPv4 view and get the address in host byte order */
+	memcpy(&addr4, addr, sizeof(addr4));
+	ip4addr = ntohl(addr4.sin_addr.s_addr);
+
+	memset(&addr6, 0, sizeof(addr6));
+
+	addr6.sin6_family = AF_INET6;
+
+	/*
+	 * Bytes 0-9 stay zero, bytes 10-11 are 0xff, and bytes 12-15 carry the
+	 * IPv4 address, most significant byte first.
+	 */
+	addr6.sin6_addr.s6_addr[10] = 0xff;
+	addr6.sin6_addr.s6_addr[11] = 0xff;
+	addr6.sin6_addr.s6_addr[12] = (ip4addr >> 24) & 0xFF;
+	addr6.sin6_addr.s6_addr[13] = (ip4addr >> 16) & 0xFF;
+	addr6.sin6_addr.s6_addr[14] = (ip4addr >> 8) & 0xFF;
+	addr6.sin6_addr.s6_addr[15] = (ip4addr) & 0xFF;
+
+	/* Overwrite the caller's storage with the IPv6 form */
+	memcpy(addr, &addr6, sizeof(addr6));
+}
+
+/*
+ * gtm_promote_v4_to_v6_mask --- convert an AF_INET netmask to AF_INET6, using
+ *		the standard convention for IPv4 addresses mapped into IPv6 world
+ *
+ * This must be different from gtm_promote_v4_to_v6_addr because we want to
+ * set the high-order bits to 1's not 0's.
+ *
+ * The passed addr is modified in place; be sure it is large enough to
+ * hold the result!  Note that we only worry about setting the fields
+ * that gtm_range_sockaddr will look at.
+ */
+void
+gtm_promote_v4_to_v6_mask(struct sockaddr_storage * addr)
+{
+	struct sockaddr_in addr4;
+	struct sockaddr_in6 addr6;
+	uint32		ip4addr;
+	int			i;
+
+	/* Take a copy of the IPv4 view and get the mask in host byte order */
+	memcpy(&addr4, addr, sizeof(addr4));
+	ip4addr = ntohl(addr4.sin_addr.s_addr);
+
+	memset(&addr6, 0, sizeof(addr6));
+
+	addr6.sin6_family = AF_INET6;
+
+	/* All 12 leading bytes are significant: set every bit in them */
+	for (i = 0; i < 12; i++)
+		addr6.sin6_addr.s6_addr[i] = 0xff;
+
+	/* The last four bytes carry the IPv4 mask, most significant byte first */
+	addr6.sin6_addr.s6_addr[12] = (ip4addr >> 24) & 0xFF;
+	addr6.sin6_addr.s6_addr[13] = (ip4addr >> 16) & 0xFF;
+	addr6.sin6_addr.s6_addr[14] = (ip4addr >> 8) & 0xFF;
+	addr6.sin6_addr.s6_addr[15] = (ip4addr) & 0xFF;
+
+	/* Overwrite the caller's storage with the IPv6 form */
+	memcpy(addr, &addr6, sizeof(addr6));
+}
+
+#endif   /* HAVE_IPV6 */
diff --git a/src/gtm/client/pqexpbuffer.c b/src/gtm/client/pqexpbuffer.c
new file mode 100644
index 0000000000..95c6ee09ee
--- /dev/null
+++ b/src/gtm/client/pqexpbuffer.c
@@ -0,0 +1,373 @@
+/*-------------------------------------------------------------------------
+ *
+ * pqexpbuffer.c
+ *
+ * PQExpBuffer provides an indefinitely-extensible string data type.
+ * It can be used to buffer either ordinary C strings (null-terminated text)
+ * or arbitrary binary data.  All storage is allocated with malloc().
+ *
+ * This module is essentially the same as the backend's StringInfo data type,
+ * but it is intended for use in frontend libpq and client applications.
+ * Thus, it does not rely on palloc() nor elog().
+ *
+ * It does rely on vsnprintf(); if configure finds that libc doesn't provide
+ * a usable vsnprintf(), then a copy of our own implementation of it will
+ * be linked into libpq.
+ * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL: pgsql/src/interfaces/libpq/pqexpbuffer.c,v 1.25 2008/11/26 00:26:23 tgl Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "gtm/gtm_c.h" + +#include <limits.h> + +#include "gtm/pqexpbuffer.h" + + +/* All "broken" PQExpBuffers point to this string. */ +static const char oom_buffer[1] = ""; + + +/* + * markPQExpBufferBroken + * + * Put a PQExpBuffer in "broken" state if it isn't already. + */ +static void +markPQExpBufferBroken(PQExpBuffer str) +{ + if (str->data != oom_buffer) + free(str->data); + /* + * Casting away const here is a bit ugly, but it seems preferable to + * not marking oom_buffer const. We want to do that to encourage the + * compiler to put oom_buffer in read-only storage, so that anyone who + * tries to scribble on a broken PQExpBuffer will get a failure. + */ + str->data = (char *) oom_buffer; + str->len = 0; + str->maxlen = 0; +} + +/* + * createGTMPQExpBuffer + * + * Create an empty 'PQExpBufferData' & return a pointer to it. + */ +PQExpBuffer +createGTMPQExpBuffer(void) +{ + PQExpBuffer res; + + res = (PQExpBuffer) malloc(sizeof(PQExpBufferData)); + if (res != NULL) + initGTMPQExpBuffer(res); + + return res; +} + +/* + * initGTMPQExpBuffer + * + * Initialize a PQExpBufferData struct (with previously undefined contents) + * to describe an empty string. 
+ */ +void +initGTMPQExpBuffer(PQExpBuffer str) +{ + str->data = (char *) malloc(INITIAL_EXPBUFFER_SIZE); + if (str->data == NULL) + { + str->data = (char *) oom_buffer; /* see comment above */ + str->maxlen = 0; + str->len = 0; + } + else + { + str->maxlen = INITIAL_EXPBUFFER_SIZE; + str->len = 0; + str->data[0] = '\0'; + } +} + +/* + * destroyGTMPQExpBuffer(str); + * + * free()s both the data buffer and the PQExpBufferData. + * This is the inverse of createGTMPQExpBuffer(). + */ +void +destroyGTMPQExpBuffer(PQExpBuffer str) +{ + if (str) + { + termGTMPQExpBuffer(str); + free(str); + } +} + +/* + * termGTMPQExpBuffer(str) + * free()s the data buffer but not the PQExpBufferData itself. + * This is the inverse of initGTMPQExpBuffer(). + */ +void +termGTMPQExpBuffer(PQExpBuffer str) +{ + if (str->data != oom_buffer) + free(str->data); + /* just for luck, make the buffer validly empty. */ + str->data = (char *) oom_buffer; /* see comment above */ + str->maxlen = 0; + str->len = 0; +} + +/* + * resetGTMPQExpBuffer + * Reset a PQExpBuffer to empty + * + * Note: if possible, a "broken" PQExpBuffer is returned to normal. + */ +void +resetGTMPQExpBuffer(PQExpBuffer str) +{ + if (str) + { + if (str->data != oom_buffer) + { + str->len = 0; + str->data[0] = '\0'; + } + else + { + /* try to reinitialize to valid state */ + initGTMPQExpBuffer(str); + } + } +} + +/* + * enlargeGTMPQExpBuffer + * Make sure there is enough space for 'needed' more bytes in the buffer + * ('needed' does not include the terminating null). + * + * Returns 1 if OK, 0 if failed to enlarge buffer. (In the latter case + * the buffer is left in "broken" state.) + */ +int +enlargeGTMPQExpBuffer(PQExpBuffer str, size_t needed) +{ + size_t newlen; + char *newdata; + + if (PQExpBufferBroken(str)) + return 0; /* already failed */ + + /* + * Guard against ridiculous "needed" values, which can occur if we're fed + * bogus data. Without this, we can get an overflow or infinite loop in + * the following. 
+ */ + if (needed >= ((size_t) INT_MAX - str->len)) + { + markPQExpBufferBroken(str); + return 0; + } + + needed += str->len + 1; /* total space required now */ + + /* Because of the above test, we now have needed <= INT_MAX */ + + if (needed <= str->maxlen) + return 1; /* got enough space already */ + + /* + * We don't want to allocate just a little more space with each append; + * for efficiency, double the buffer size each time it overflows. + * Actually, we might need to more than double it if 'needed' is big... + */ + newlen = (str->maxlen > 0) ? (2 * str->maxlen) : 64; + while (needed > newlen) + newlen = 2 * newlen; + + /* + * Clamp to INT_MAX in case we went past it. Note we are assuming here + * that INT_MAX <= UINT_MAX/2, else the above loop could overflow. We + * will still have newlen >= needed. + */ + if (newlen > (size_t) INT_MAX) + newlen = (size_t) INT_MAX; + + newdata = (char *) realloc(str->data, newlen); + if (newdata != NULL) + { + str->data = newdata; + str->maxlen = newlen; + return 1; + } + + markPQExpBufferBroken(str); + return 0; +} + +/* + * printfGTMPQExpBuffer + * Format text data under the control of fmt (an sprintf-like format string) + * and insert it into str. More space is allocated to str if necessary. + * This is a convenience routine that does the same thing as + * resetGTMPQExpBuffer() followed by appendGTMPQExpBuffer(). + */ +void +printfGTMPQExpBuffer(PQExpBuffer str, const char *fmt,...) +{ + va_list args; + size_t avail; + int nprinted; + + resetGTMPQExpBuffer(str); + + if (PQExpBufferBroken(str)) + return; /* already failed */ + + for (;;) + { + /* + * Try to format the given string into the available space; but if + * there's hardly any space, don't bother trying, just fall through to + * enlarge the buffer first. 
+ */ + if (str->maxlen > str->len + 16) + { + avail = str->maxlen - str->len - 1; + va_start(args, fmt); + nprinted = vsnprintf(str->data + str->len, avail, + fmt, args); + va_end(args); + + /* + * Note: some versions of vsnprintf return the number of chars + * actually stored, but at least one returns -1 on failure. Be + * conservative about believing whether the print worked. + */ + if (nprinted >= 0 && nprinted < (int) avail - 1) + { + /* Success. Note nprinted does not include trailing null. */ + str->len += nprinted; + break; + } + } + /* Double the buffer size and try again. */ + if (!enlargeGTMPQExpBuffer(str, str->maxlen)) + return; /* oops, out of memory */ + } +} + +/* + * appendGTMPQExpBuffer + * + * Format text data under the control of fmt (an sprintf-like format string) + * and append it to whatever is already in str. More space is allocated + * to str if necessary. This is sort of like a combination of sprintf and + * strcat. + */ +void +appendGTMPQExpBuffer(PQExpBuffer str, const char *fmt,...) +{ + va_list args; + size_t avail; + int nprinted; + + if (PQExpBufferBroken(str)) + return; /* already failed */ + + for (;;) + { + /* + * Try to format the given string into the available space; but if + * there's hardly any space, don't bother trying, just fall through to + * enlarge the buffer first. + */ + if (str->maxlen > str->len + 16) + { + avail = str->maxlen - str->len - 1; + va_start(args, fmt); + nprinted = vsnprintf(str->data + str->len, avail, + fmt, args); + va_end(args); + + /* + * Note: some versions of vsnprintf return the number of chars + * actually stored, but at least one returns -1 on failure. Be + * conservative about believing whether the print worked. + */ + if (nprinted >= 0 && nprinted < (int) avail - 1) + { + /* Success. Note nprinted does not include trailing null. */ + str->len += nprinted; + break; + } + } + /* Double the buffer size and try again. 
*/ + if (!enlargeGTMPQExpBuffer(str, str->maxlen)) + return; /* oops, out of memory */ + } +} + +/* + * appendGTMPQExpBufferStr + * Append the given string to a PQExpBuffer, allocating more space + * if necessary. + */ +void +appendGTMPQExpBufferStr(PQExpBuffer str, const char *data) +{ + appendBinaryGTMPQExpBuffer(str, data, strlen(data)); +} + +/* + * appendGTMPQExpBufferChar + * Append a single byte to str. + * Like appendGTMPQExpBuffer(str, "%c", ch) but much faster. + */ +void +appendGTMPQExpBufferChar(PQExpBuffer str, char ch) +{ + /* Make more room if needed */ + if (!enlargeGTMPQExpBuffer(str, 1)) + return; + + /* OK, append the character */ + str->data[str->len] = ch; + str->len++; + str->data[str->len] = '\0'; +} + +/* + * appendBinaryGTMPQExpBuffer + * + * Append arbitrary binary data to a PQExpBuffer, allocating more space + * if necessary. + */ +void +appendBinaryGTMPQExpBuffer(PQExpBuffer str, const char *data, size_t datalen) +{ + /* Make more room if needed */ + if (!enlargeGTMPQExpBuffer(str, datalen)) + return; + + /* OK, append the data */ + memcpy(str->data + str->len, data, datalen); + str->len += datalen; + + /* + * Keep a trailing null in place, even though it's probably useless for + * binary data... + */ + str->data[str->len] = '\0'; +} diff --git a/src/gtm/client/strlcpy.c b/src/gtm/client/strlcpy.c new file mode 100644 index 0000000000..ae031e244c --- /dev/null +++ b/src/gtm/client/strlcpy.c @@ -0,0 +1,72 @@ +/*------------------------------------------------------------------------- + * + * strlcpy.c + * strncpy done right + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/port/strlcpy.c,v 1.5 2008/01/01 19:46:00 momjian Exp $ + * + * This file was taken from OpenBSD and is used on platforms that don't + * provide strlcpy(). The OpenBSD copyright terms follow. 
+ *------------------------------------------------------------------------- + */ + +/* $OpenBSD: strlcpy.c,v 1.11 2006/05/05 15:27:38 millert Exp $ */ + +/* + * Copyright (c) 1998 Todd C. Miller <[email protected]> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "gtm/gtm_c.h" + + +/* + * Copy src to string dst of size siz. At most siz-1 characters + * will be copied. Always NUL terminates (unless siz == 0). + * Returns strlen(src); if retval >= siz, truncation occurred. 
+ * Function creation history: http://www.gratisoft.us/todd/papers/strlcpy.html + */ + size_t + strlcpy(char *dst, const char *src, size_t siz) + { + char *d = dst; + const char *s = src; + size_t n = siz; + + /* Copy as many bytes as will fit */ + if (n != 0) + { + while (--n != 0) + { + if ((*d++ = *s++) == '\0') + break; + } + } + + /* Not enough room in dst, add NUL and traverse rest of src */ + if (n == 0) + { + if (siz != 0) + *d = '\0'; /* NUL-terminate dst */ + while (*s++) + ; + } + + return (s - src - 1); /* count does not include NUL */ + } diff --git a/src/gtm/client/test/Makefile b/src/gtm/client/test/Makefile new file mode 100644 index 0000000000..46ddbe9a6a --- /dev/null +++ b/src/gtm/client/test/Makefile @@ -0,0 +1,31 @@ + # Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + + top_build_dir=../../../ + include $(top_build_dir)/gtm/Makefile.global + + override CPPFLAGS := -I$(top_build_dir)/gtm/client $(CPPFLAGS) + + OBJS=test_seq.o test_txn.o test_snap.o test_txnperf.o test_snapperf.o + LIBS =-lpthread + LOADLIBES=-lpthread + CFLAGS=-g -O0 + + # These targets are commands, not files: declare them phony so that a stray + # file named "all" or "clean" in this directory cannot mask them. + .PHONY: all clean distclean maintainer-clean + + all:test_txn test_seq test_snap test_txnperf test_snapperf + + # Each test program is linked by make's implicit rule from its object file + # and the GTM client archive. + test_txn:test_txn.o $(top_build_dir)/gtm/client/libgtmclient.a + + test_seq:test_seq.o $(top_build_dir)/gtm/client/libgtmclient.a + + test_snap:test_snap.o $(top_build_dir)/gtm/client/libgtmclient.a + + test_txnperf:test_txnperf.o $(top_build_dir)/gtm/client/libgtmclient.a + + test_snapperf:test_snapperf.o $(top_build_dir)/gtm/client/libgtmclient.a + + clean: + rm -f $(OBJS) + rm -f test_txn test_seq test_snap test_txnperf test_snapperf + + distclean: clean + + maintainer-clean: distclean diff --git a/src/gtm/client/test/test_proxy.sh b/src/gtm/client/test/test_proxy.sh new file mode 100644 index 0000000000..c0d3caec61 --- /dev/null +++ b/src/gtm/client/test/test_proxy.sh @@ -0,0 +1,196 @@ + #!/bin/bash + # Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + + GTM_SERVER_HOSTNAME=gtm + GTM_SERVER_PORT=16667 + 
+GTM_PROXY_HOSTNAMES=(coordinator1 coordinator2 coordinator3 coordinator4 coordinator5) + GTM_PROXY_PORTS=(16666 16666 16666 16666 16666) + GTM_PROXY_COUNT=${#GTM_PROXY_HOSTNAMES[*]} + + PGXC_BASE=$HOME/pgsql_pgxc + + GTM_SERVER_PROCESS=gtm + GTM_PROXY_PROCESS=gtm_proxy + GTM_TEST_CLIENT_PROCESS=test_txnperf + + GTM_SERVER=$PGXC_BASE/src/gtm/main/$GTM_SERVER_PROCESS + GTM_PROXY=$PGXC_BASE/src/gtm/proxy/$GTM_PROXY_PROCESS + GTM_TEST_CLIENT=$PGXC_BASE/src/gtm/client/test/$GTM_TEST_CLIENT_PROCESS + + GTM_SERVER_LOG_FILE=/tmp/gtmlog + GTM_SERVER_CONTROL_FILE=/tmp/gtmcontrol + GTM_PROXY_LOG_FILE=/tmp/gtmptoxylog + + + # Exactly five arguments are required; report misuse on stderr and exit non-zero + # so that callers can detect the failure. + if [ "$#" -ne "5" ]; + then + echo "Usage: test_proxy.sh <test_gtm_proxy> <num_clients> <num_xacts> <num_stmts> <num_worker_threads>" >&2 + exit 1; + fi + + TEST_GTM_PROXY=$1 + NUM_CLIENTS=$2 + NUM_XACTS=$3 + NUM_STMTS=$4 + NUM_THREADS=$5 + + + # Stop and kill any gtm server or proxy processes + # + ssh $GTM_SERVER_HOSTNAME "killall -9 $GTM_SERVER_PROCESS" + + for index in ${!GTM_PROXY_HOSTNAMES[*]} + do + ssh ${GTM_PROXY_HOSTNAMES[$index]} "killall -9 $GTM_PROXY_PROCESS" > /dev/null 2>&1 + done + + echo "Killed stale server and proxies - sleeping for 5 seconds" + sleep 5 + + # Remove any stale log and control files + # + ssh $GTM_SERVER_HOSTNAME "rm -f $GTM_SERVER_LOG_FILE $GTM_SERVER_CONTROL_FILE" + for index in ${!GTM_PROXY_HOSTNAMES[*]} + do + ssh ${GTM_PROXY_HOSTNAMES[$index]} "rm -f ${GTM_PROXY_LOG_FILE}_$index" + done + + # Create an output directory to store all test related data + # + OUTPUT_DIR=output + dir=`date "+%F-%H-%M-%S"` + echo "Creating output directory $OUTPUT_DIR/$dir" + mkdir -p $OUTPUT_DIR/$dir + + + # Start the GTM server + # + echo "Starting GTM server at $GTM_SERVER_HOSTNAME on port $GTM_SERVER_PORT" + ssh $GTM_SERVER_HOSTNAME "$GTM_SERVER -h $GTM_SERVER_HOSTNAME -p $GTM_SERVER_PORT -l $GTM_SERVER_LOG_FILE&"& + + echo "Sleeping for 3 seconds" + sleep 3 + + # Start the GTM proxy on all nodes + # + for index in ${!GTM_PROXY_HOSTNAMES[*]} + do + echo "Starting GTM proxy at 
${GTM_PROXY_HOSTNAMES[$index]} on port ${GTM_PROXY_PORTS[$index]} - $NUM_THREADS worker threads" + ssh ${GTM_PROXY_HOSTNAMES[$index]} "$GTM_PROXY -h ${GTM_PROXY_HOSTNAMES[$index]} -p ${GTM_PROXY_PORTS[$index]} -s $GTM_SERVER_HOSTNAME -t $GTM_SERVER_PORT -n $NUM_THREADS -l ${GTM_PROXY_LOG_FILE}_$index&"& +done + +echo "Sleeping for 3 seconds" +sleep 3 + +# Kill all clients +# +for index in ${!GTM_PROXY_HOSTNAMES[*]} +do + ssh ${GTM_PROXY_HOSTNAMES[$index]} "killall -9 $GTM_TEST_CLIENT_PROCESS" > /dev/null 2>&1 +done + +echo "Killed all stale clients -- sleeping for 5 seconds" +sleep 5 + +# Remove any stale result files +# +for index in ${!GTM_PROXY_HOSTNAMES[*]} +do + ssh ${GTM_PROXY_HOSTNAMES[$index]} "rm -f TEST_OUTPUT_$index TEST_OUTPUT_$index.CSV TEST_END_$index" +done + +# Write out some information about the test configuration +# +if ( $TEST_GTM_PROXY -eq true ); +then + echo "Testing GTM Proxy Configuration" >> $OUTPUT_DIR/$dir/TEST_SUMMARY + echo "" >> $OUTPUT_DIR/$dir/TEST_SUMMARY + echo "Number of GTM Proxy Worker Threads $NUM_THREADS" >> $OUTPUT_DIR/$dir/TEST_SUMMARY + echo "" >> $OUTPUT_DIR/$dir/TEST_SUMMARY + echo "" >> $OUTPUT_DIR/$dir/TEST_SUMMARY +else + echo "Testing GTM Server Configuration" >> $OUTPUT_DIR/$dir/TEST_SUMMARY + echo "" >> $OUTPUT_DIR/$dir/TEST_SUMMARY + echo "" >> $OUTPUT_DIR/$dir/TEST_SUMMARY +fi + +# Start the stats collection scripts . 
Kill any stale commands and remove the old files first + # + ssh $GTM_SERVER_HOSTNAME "killall -9 vmstat" > /dev/null 2>&1 + ssh $GTM_SERVER_HOSTNAME "rm -f TEST_VMSTATS_GTM" > /dev/null 2>&1 + ssh $GTM_SERVER_HOSTNAME "vmstat 1 > TEST_VMSTATS_GTM&"& + + for index in ${!GTM_PROXY_HOSTNAMES[*]} + do + ssh ${GTM_PROXY_HOSTNAMES[$index]} "killall -9 vmstat" > /dev/null 2>&1 + ssh ${GTM_PROXY_HOSTNAMES[$index]} "rm -f TEST_VMSTATS_$index" > /dev/null 2>&1 + ssh ${GTM_PROXY_HOSTNAMES[$index]} "vmstat 1 > TEST_VMSTATS_$index&"& + done + + # Start the clients + # + rm -f TEST_END* + + echo "Starting clients" + for index in ${!GTM_PROXY_HOSTNAMES[*]} + do + # Point the client at its local proxy when proxies are under test, otherwise + # directly at the GTM server. The flag is compared as a string rather than + # being executed as a command. + if [ "$TEST_GTM_PROXY" = "true" ]; + then + SERVER_HOSTNAME=${GTM_PROXY_HOSTNAMES[$index]}; + SERVER_PORT=${GTM_PROXY_PORTS[$index]}; + else + SERVER_HOSTNAME=$GTM_SERVER_HOSTNAME; + SERVER_PORT=$GTM_SERVER_PORT; + fi + ssh ${GTM_PROXY_HOSTNAMES[$index]} "$GTM_TEST_CLIENT -h $SERVER_HOSTNAME -p $SERVER_PORT -c $NUM_CLIENTS -n $NUM_XACTS -s $NUM_STMTS -i $index &"& + done + + # Wait for all the clients to finish + # + while true + do + all_done=true + for index in ${!GTM_PROXY_HOSTNAMES[*]} + do + scp ${GTM_PROXY_HOSTNAMES[$index]}:TEST_END_$index . > /dev/null 2>&1 + if !
[ -f TEST_END_$index ]; + then + all_done=false; + fi; + done + + # Leave the polling loop once every client's TEST_END marker has been fetched + if [ "$all_done" = "true" ]; then break; fi + sleep 5; + done + + echo "All clients finished" + + # Copy GTM server log files + # + scp $GTM_SERVER_HOSTNAME:$GTM_SERVER_LOG_FILE $OUTPUT_DIR/$dir > /dev/null 2>&1 + + # Copy GTM server vmstat file + scp $GTM_SERVER_HOSTNAME:TEST_VMSTATS_GTM $OUTPUT_DIR/$dir > /dev/null 2>&1 + ssh $GTM_SERVER_HOSTNAME "killall -9 vmstat" > /dev/null 2>&1 + ssh $GTM_SERVER_HOSTNAME "rm -f TEST_VMSTATS_GTM" > /dev/null 2>&1 + + # Copy GTM Proxy log file and the results + for index in ${!GTM_PROXY_HOSTNAMES[*]} + do + scp ${GTM_PROXY_HOSTNAMES[$index]}:TEST_OUTPUT_$index $OUTPUT_DIR/$dir/ > /dev/null 2>&1 + scp ${GTM_PROXY_HOSTNAMES[$index]}:TEST_OUTPUT_$index.CSV $OUTPUT_DIR/$dir/ > /dev/null 2>&1 + scp ${GTM_PROXY_HOSTNAMES[$index]}:${GTM_PROXY_LOG_FILE}_$index $OUTPUT_DIR/$dir/ > /dev/null 2>&1 + scp ${GTM_PROXY_HOSTNAMES[$index]}:TEST_VMSTATS_$index $OUTPUT_DIR/$dir/ > /dev/null 2>&1 + ssh ${GTM_PROXY_HOSTNAMES[$index]} "killall -9 vmstat" > /dev/null 2>&1 + ssh ${GTM_PROXY_HOSTNAMES[$index]} "rm -f TEST_VMSTATS_$index" > /dev/null 2>&1 + done + + # Paste the result in the summary file + # + for index in ${!GTM_PROXY_HOSTNAMES[*]} + do + cat $OUTPUT_DIR/$dir/TEST_OUTPUT_$index >> $OUTPUT_DIR/$dir/TEST_SUMMARY + done + + echo "Done" diff --git a/src/gtm/client/test/test_seq.c b/src/gtm/client/test/test_seq.c new file mode 100644 index 0000000000..da0ed91ee2 --- /dev/null +++ b/src/gtm/client/test/test_seq.c @@ -0,0 +1,113 @@ + /* + * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + */ + #include <sys/types.h> + #include <unistd.h> + + #include "gtm/gtm_c.h" + #include "gtm/libpq-fe.h" + #include "gtm/gtm_client.h" + + #define client_log(x) printf x + + int + main(int argc, char *argv[]) + { + int ii; + pid_t parent_pid; + + GTM_Conn *conn = PQconnectGTM("host=localhost port=6666 coordinator_id=1"); + if (conn == NULL) + { + client_log(("Error in connection\n")); + exit(1); + } + + 
parent_pid = getpid(); + + /* + * Create sequences + */ + for (ii = 0; ii < 20; ii++) + { + char buf[100]; + GTM_SequenceKeyData seqkey; + sprintf(buf, "%d:%d", ii, ii); + seqkey.gsk_keylen = strlen(buf); + seqkey.gsk_key = buf; + if (open_sequence(conn, &seqkey, 10, 1, 10000, 100, false)) + client_log(("Open seq failed\n")); + else + client_log(("Opened Sequence %s\n", seqkey.gsk_key)); + } + + /* + * Close the GTM connection + */ + GTMPQfinish(conn); + + /* + * Start few process which would independently use the sequences + */ + for (ii = 0; ii < 3; ii++) + fork(); + + /* + * Each process now opens a new connection with the GTM + */ + conn = PQconnectGTM("host=localhost port=6666 coordinator_id=1"); + + /* + * Try to read/increment the sequence + */ + for (ii = 0; ii < 20; ii++) + { + char buf[100]; + GTM_SequenceKeyData seqkey; + GTM_Sequence seqval; + int jj; + + sprintf(buf, "%d:%d", ii, ii); + seqkey.gsk_keylen = strlen(buf); + seqkey.gsk_key = buf; + if ((seqval = get_current(conn, &seqkey)) == InvalidSequenceValue) + client_log(("get_current seq failed for sequene %s\n", seqkey.gsk_key)); + else + client_log(("CURRENT SEQVAL(%s): %lld\n", seqkey.gsk_key, seqval)); + + for (jj = 0; jj < 5; jj++) + { + if ((seqval = get_next(conn, &seqkey)) == InvalidSequenceValue) + client_log(("get_current seq failed for sequence %s\n", seqkey.gsk_key)); + else + client_log(("NEXT SEQVAL(%s): %lld ", seqkey.gsk_key, seqval)); + } + client_log(("\n")); + } + + /* + * The main process now closes the sequences. 
We want to call close only + * once, hence this approach + */ + if (getpid() == parent_pid) + { + /* + * Wait long enough so that all other processes are done + */ + sleep(20); + for (ii = 0; ii < 20; ii++) + { + char buf[100]; + GTM_SequenceKeyData seqkey; + sprintf(buf, "%d:%d", ii, ii); + seqkey.gsk_keylen = strlen(buf); + seqkey.gsk_key = buf; + if (close_sequence(conn, &seqkey)) + client_log(("Close seq failed for sequence %s\n", seqkey.gsk_key)); + else + client_log(("Sequene closed %s\n", seqkey.gsk_key)); + } + } + GTMPQfinish(conn); + return 0; +} diff --git a/src/gtm/client/test/test_snap.c b/src/gtm/client/test/test_snap.c new file mode 100644 index 0000000000..a2ce2f965a --- /dev/null +++ b/src/gtm/client/test/test_snap.c @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + */ +#include <sys/types.h> +#include <unistd.h> + +#include "gtm/gtm_c.h" +#include "gtm/libpq-fe.h" +#include "gtm/gtm_client.h" + +#define client_log(x) printf x + +int +main(int argc, char *argv[]) +{ + int ii; + GlobalTransactionId gxid[4000]; + GTM_Conn *conn; + + for (ii = 0; ii < 3; ii++) + fork(); + + conn = PQconnectGTM("host=localhost port=6666 coordinator_id=1"); + if (conn == NULL) + { + client_log(("Error in connection\n")); + exit(1); + } + + for (ii = 0; ii < 20; ii++) + { + gxid[ii] = begin_transaction(conn, GTM_ISOLATION_RC); + if (gxid[ii] != InvalidGlobalTransactionId) + client_log(("Started a new transaction (GXID:%u)\n", gxid[ii])); + else + client_log(("BEGIN transaction failed for ii=%d\n", ii)); + } + + for (ii = 0; ii < 5; ii++) + { + int jj; + GTM_Snapshot snapshot = get_snapshot(conn, gxid[ii], true); + if (snapshot != NULL) + { + client_log(("Snapshot: GXID %u, xmin=%u, xmax=%u\n", gxid[ii], + snapshot->sn_xmin, snapshot->sn_xmax)); + client_log(("xcnt=%d %s", snapshot->sn_xcnt, + snapshot->sn_xcnt > 0 ? 
"xip=(" : "")); + for (jj = 0; jj < snapshot->sn_xcnt; jj++) + client_log(("%d%c ", snapshot->sn_xip[jj], + ((jj + 1) == snapshot->sn_xcnt) ? ')' : ',')); + client_log(("\n")); + } + } + + for (ii = 0; ii < 20; ii++) + { + PGXC_NodeId nodes[5]; + nodes[0] = 1; + nodes[1] = 1; + + if (!prepare_transaction(conn, gxid[ii], 2, nodes)) + client_log(("PREPARE successful (GXID:%u)\n", gxid[ii])); + else + client_log(("PREPARE failed (GXID:%u)\n", gxid[ii])); + } + + for (ii = 0; ii < 20; ii++) + { + if (ii % 2 == 0) + { + if (!abort_transaction(conn, gxid[ii])) + client_log(("ROLLBACK successful (GXID:%u)\n", gxid[ii])); + else + client_log(("ROLLBACK failed (GXID:%u)\n", gxid[ii])); + } + else + { + if (!commit_transaction(conn, gxid[ii])) + client_log(("COMMIT successful (GXID:%u)\n", gxid[ii])); + else + client_log(("COMMIT failed (GXID:%u)\n", gxid[ii])); + } + } + + GTMPQfinish(conn); + return 0; +} diff --git a/src/gtm/client/test/test_snapperf.c b/src/gtm/client/test/test_snapperf.c new file mode 100644 index 0000000000..bc0e511e2b --- /dev/null +++ b/src/gtm/client/test/test_snapperf.c @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + */ + +#include <sys/types.h> +#include <unistd.h> + +#include "gtm/gtm_c.h" +#include "gtm/libpq-fe.h" +#include "gtm/gtm_client.h" + +#define client_log(x) + +int +main(int argc, char *argv[]) +{ + int ii; + int jj; + +#define TXN_COUNT 10000 +#define LOOP_COUNT 10 + + GlobalTransactionId gxid[TXN_COUNT]; + GTM_Conn *conn; + + conn = PQconnectGTM("host=localhost port=6666 coordinator_id=1"); + if (conn == NULL) + { + client_log(("Error in connection\n")); + exit(1); + } + + for (jj = 0; jj < LOOP_COUNT; jj++) + { + for (ii = 0; ii < TXN_COUNT; ii++) + { + int kk; + GTM_Snapshot snapshot; + + gxid[ii] = begin_transaction(conn, GTM_ISOLATION_RC); + if (gxid[ii] != InvalidGlobalTransactionId) + client_log(("Started a new transaction (GXID:%u)\n", gxid[ii])); + else + client_log(("BEGIN transaction 
failed for ii=%d\n", ii)); + snapshot = get_snapshot(conn, gxid[ii], true); + + + if (ii % 2 == 0) + { + if (!abort_transaction(conn, gxid[ii])) + client_log(("ROLLBACK successful (GXID:%u)\n", gxid[ii])); + else + client_log(("ROLLBACK failed (GXID:%u)\n", gxid[ii])); + } + else + { + if (!commit_transaction(conn, gxid[ii])) + client_log(("COMMIT successful (GXID:%u)\n", gxid[ii])); + else + client_log(("COMMIT failed (GXID:%u)\n", gxid[ii])); + } + } + } + + GTMPQfinish(conn); + return 0; +} diff --git a/src/gtm/client/test/test_txn.c b/src/gtm/client/test/test_txn.c new file mode 100644 index 0000000000..01ed3decbd --- /dev/null +++ b/src/gtm/client/test/test_txn.c @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + */ + +#include <sys/types.h> +#include <unistd.h> + +#include "gtm/gtm_c.h" +#include "gtm/libpq-fe.h" +#include "gtm/gtm_client.h" + +#define client_log(x) printf x + +int +main(int argc, char *argv[]) +{ + int ii; + GlobalTransactionId gxid[4000]; + GTM_Conn *conn; + + for (ii = 0; ii < 3; ii++) + fork(); + + conn = PQconnectGTM("host=localhost port=6666 coordinator_id=1"); + if (conn == NULL) + { + client_log(("Error in connection\n")); + exit(1); + } + + for (ii = 0; ii < 20; ii++) + { + gxid[ii] = begin_transaction(conn, GTM_ISOLATION_SERIALIZABLE); + if (gxid[ii] != InvalidGlobalTransactionId) + client_log(("Started a new transaction (GXID:%u)\n", gxid[ii])); + else + client_log(("BEGIN transaction failed for ii=%d\n", ii)); + } + + for (ii = 0; ii < 20; ii++) + { + PGXC_NodeId nodes[5]; + nodes[0] = 1; + nodes[1] = 1; + + if (!prepare_transaction(conn, gxid[ii], 2, nodes)) + client_log(("PREPARE successful (GXID:%u)\n", gxid[ii])); + else + client_log(("PREPARE failed (GXID:%u)\n", gxid[ii])); + } + + for (ii = 0; ii < 20; ii++) + { + if (ii % 2 == 0) + { + if (!abort_transaction(conn, gxid[ii])) + client_log(("ROLLBACK successful (GXID:%u)\n", gxid[ii])); + else + client_log(("ROLLBACK failed (GXID:%u)\n", 
gxid[ii])); + } + else + { + if (!commit_transaction(conn, gxid[ii])) + client_log(("COMMIT successful (GXID:%u)\n", gxid[ii])); + else + client_log(("COMMIT failed (GXID:%u)\n", gxid[ii])); + } + } + + GTMPQfinish(conn); + return 0; + } diff --git a/src/gtm/client/test/test_txnperf.c b/src/gtm/client/test/test_txnperf.c new file mode 100644 index 0000000000..174f0a8bab --- /dev/null +++ b/src/gtm/client/test/test_txnperf.c @@ -0,0 +1,286 @@ + /* + * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + */ + #include <sys/types.h> + #include <unistd.h> + + #include "gtm/gtm_c.h" + #include "gtm/libpq-fe.h" + #include "gtm/gtm_client.h" + #include <sys/time.h> + #include <sys/file.h> + #include <sys/types.h> + #include <sys/wait.h> + + #define client_log(x) + + extern int optind; + extern char *optarg; + + /* Store t1 - t2 in *result, borrowing one second when the microsecond part goes negative */ + static void + diffTime(struct timeval *t1, struct timeval *t2, struct timeval *result) + { + int sec = t1->tv_sec - t2->tv_sec; + int usec = t1->tv_usec - t2->tv_usec; + if (usec < 0) + { + usec += 1000000; + sec--; + } + result->tv_sec = sec; + result->tv_usec = usec; + } + + /* + * Print the usage text; keep it in sync with the getopt() option string in main() + */ + static void + help(const char *progname) + { + printf(_("Usage:\n %s [OPTION]...\n\n"), progname); + printf(_("Options:\n")); + printf(_(" -h hostname GTM proxy/server hostname/IP\n")); + printf(_(" -p port GTM proxy/server port number\n")); + printf(_(" -c count Number of clients\n")); + printf(_(" -n count Number of transactions per client\n")); + printf(_(" -s count Number of statements per transaction\n")); + printf(_(" -i id Coordinator ID\n")); + } + + int + main(int argc, char *argv[]) + { + int ii; + int jj; + int kk; + char connect_string[100]; + int gtmport; + int coordinator_id; + int nclients; + int ntxns_per_cli; + int nstmts_per_txn; + char *gtmhost; + int opt; /* int, not char: getopt() returns int and ends with -1, which an unsigned char can never equal */ + struct timeval starttime, endtime, diff; + FILE *fp; + FILE *fp2; + char buf[1024]; + int testid, this_testid, max_testid; + int snapsize = 0; + float
avg_sanpsize = 0; + pid_t child_pids[1024]; + pid_t parent_pid; + + #define TXN_COUNT 1000 + + GlobalTransactionId gxid[TXN_COUNT]; + GTM_Conn *conn; + char test_output[256], test_end[256], test_output_csv[256]; + char system_cmd[1024]; + + /* + * Catch standard options before doing much else + */ + if (argc > 1) + { + if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) + { + help(argv[0]); + exit(0); + } + } + + /* + * Parse the command line options and set variables + */ + while ((opt = getopt(argc, argv, "h:p:c:n:s:i:")) != -1) + { + switch (opt) + { + case 'h': + gtmhost = strdup(optarg); + break; + + case 'p': + gtmport = atoi(optarg); + break; + + case 'c': + nclients = atoi(optarg); + break; + + case 'n': + ntxns_per_cli = atoi(optarg); + break; + + case 's': + nstmts_per_txn = atoi(optarg); + break; + + case 'i': + coordinator_id = atoi(optarg); + /* sprintf() already NUL-terminates; no explicit \0 is needed in the format */ + sprintf(test_output, "TEST_OUTPUT_%d", coordinator_id); + sprintf(test_end, "TEST_END_%d", coordinator_id); + sprintf(test_output_csv, "TEST_OUTPUT_%d.CSV", coordinator_id); + break; + + default: + fprintf(stderr, "Unrecognized option %c\n", opt); + help(argv[0]); + exit(1); /* bad usage is a failure */ + } + } + + /* Bounded write: the host name comes from the command line and may not fit */ + snprintf(connect_string, sizeof(connect_string), "host=%s port=%d coordinator_id=%d", gtmhost, gtmport, coordinator_id); + + sprintf(system_cmd, "echo -------------------------------------------------------- >> %s", test_output); + system(system_cmd); + sprintf(system_cmd, "date >> %s", test_output); + system(system_cmd); + sprintf(system_cmd, "echo -------------------------------------------------------- >> %s", test_output); + system(system_cmd); + + fp = fopen(test_output, "a+"); + fp2 = fopen(test_output_csv, "a+"); + if (fp == NULL || fp2 == NULL) + { + fprintf(stderr, "could not open %s or %s\n", test_output, test_output_csv); + exit(1); + } + + /* Scan the existing output for the highest TEST-ID so this run gets the next one */ + max_testid = 0; + while (fgets(buf, sizeof(buf), fp) != NULL) + { + if (sscanf(buf, "TEST-ID: %d", &testid) == 1) + { + if (max_testid < testid) + max_testid = testid; + } + } + + this_testid = max_testid + 1; + + fprintf(fp, "TEST-ID: %d", this_testid); + fprintf(fp, "\n\n"); + fflush(fp); + + parent_pid = getpid(); + + 
gettimeofday(&starttime, NULL); + + /* + * Start as many clients + */ + for (ii = 1; ii < nclients; ii++) + { + int cpid; + if ((cpid = fork()) == 0) + break; + else + child_pids[ii-1] = cpid; + } + + if (getpid() == parent_pid) + fprintf(stderr, "started %d clients\n", nclients); + + conn = PQconnectGTM(connect_string); + if (conn == NULL) + { + client_log(("Error in connection\n")); + exit(1); + } + + if (getpid() != parent_pid) + gettimeofday(&starttime, NULL); + + snapsize = 0; + + for (jj = 0; jj <= ntxns_per_cli / TXN_COUNT; jj++) + { + for (ii = 0; ii < TXN_COUNT; ii++) + { + PGXC_NodeId nodes[5]; + + if ((jj * TXN_COUNT) + ii >= ntxns_per_cli) + break; + + gxid[ii] = begin_transaction(conn, GTM_ISOLATION_RC); + if (gxid[ii] != InvalidGlobalTransactionId) + client_log(("Started a new transaction (GXID:%u)\n", gxid[ii])); + else + client_log(("BEGIN transaction failed for ii=%d\n", ii)); + + for (kk = 0; kk < nstmts_per_txn; kk++) + { + GTM_Snapshot snapshot = get_snapshot(conn, gxid[ii], true); + snapsize += snapshot->sn_xcnt; + } + + nodes[0] = 1; + nodes[1] = 1; + + if (!prepare_transaction(conn, gxid[ii], 2, nodes)) + client_log(("PREPARE successful (GXID:%u)\n", gxid[ii])); + else + client_log(("PREPARE failed (GXID:%u)\n", gxid[ii])); + + if (ii % 2 == 0) + { + if (!abort_transaction(conn, gxid[ii])) + client_log(("ROLLBACK successful (GXID:%u)\n", gxid[ii])); + else + client_log(("ROLLBACK failed (GXID:%u)\n", gxid[ii])); + } + else + { + if (!commit_transaction(conn, gxid[ii])) + client_log(("COMMIT successful (GXID:%u)\n", gxid[ii])); + else + client_log(("COMMIT failed (GXID:%u)\n", gxid[ii])); + } + } + + fprintf(stderr, "client [%d] finished %d transactions\n", getpid(), (jj * TXN_COUNT) + ii); + } + + GTMPQfinish(conn); + + if (parent_pid == getpid()) + { + for (ii = 1; ii < nclients; ii++) + wait(NULL); + + gettimeofday(&endtime, NULL); + diffTime(&endtime, &starttime, &diff); + avg_sanpsize = ((float) snapsize) / (ntxns_per_cli * 
nstmts_per_txn); + + fprintf(fp, "\n"); + fprintf(fp, "Num of client: %d\n", nclients); + fprintf(fp, "Num of txns/client: %d\n", ntxns_per_cli); + fprintf(fp, "Num of statements/txn: %d\n", nstmts_per_txn); + fprintf(fp, "TPS: %2f\n", (ntxns_per_cli * nclients) / ((float)((diff.tv_sec * 1000000) + diff.tv_usec)/1000000)); + fprintf(fp, "Total snapshot size: %d\n", snapsize); + fprintf(fp, "Average snapshot size: %f\n", avg_sanpsize); + + /* Zero-pad the microseconds so that 1s + 5us prints as 1.000005, not 1.5 */ + fprintf(fp, "Time: %ld.%06ld\n", (long) diff.tv_sec, (long) diff.tv_usec); + fprintf(fp, "\n"); + + /* Create the marker file that test_proxy.sh polls for */ + sprintf(system_cmd, "touch %s", test_end); + system(system_cmd); + } + else + { + gettimeofday(&endtime, NULL); + diffTime(&endtime, &starttime, &diff); + avg_sanpsize = ((float) snapsize) / (ntxns_per_cli * nstmts_per_txn); + } + + /* Every process appends one CSV row; the last column tells whether it is the parent */ + flock(fileno(fp2), LOCK_EX); + if (parent_pid != getpid()) + fprintf(fp2, "%d,%d,%d,%d,%ld,%ld,%d,%f,false\n", this_testid, nclients, ntxns_per_cli, nstmts_per_txn, (long) diff.tv_sec, (long) diff.tv_usec, snapsize, avg_sanpsize); + else + fprintf(fp2, "%d,%d,%d,%d,%ld,%ld,%d,%f,true\n", this_testid, nclients, ntxns_per_cli, nstmts_per_txn, (long) diff.tv_sec, (long) diff.tv_usec, snapsize, avg_sanpsize); + + fflush(fp2); /* push the buffered row out while the lock is still held */ + flock(fileno(fp2), LOCK_UN); + fclose(fp2); + + fclose(fp); + + return 0; + } diff --git a/src/gtm/common/Makefile b/src/gtm/common/Makefile new file mode 100644 index 0000000000..104382c9c9 --- /dev/null +++ b/src/gtm/common/Makefile @@ -0,0 +1,25 @@ + # Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + + top_build_dir=../.. 
+include $(top_build_dir)/gtm/Makefile.global + + NAME=gtm + + + SO_MAJOR_VERSION= 1 + SO_MINOR_VERSION= 0 + + + OBJS=aset.o mcxt.o elog.o assert.o stringinfo.o gtm_lock.o gtm_list.o + + # These targets are commands, not files + .PHONY: all clean distclean maintainer-clean + + all:all-lib + + include $(top_build_dir)/Makefile.shlib + + # Remove the objects and every library flavour; libgtm.a is the archive the + # backend links against, so a stale copy must not survive a clean. + clean: + rm -f $(OBJS) + rm -f libgtm.a libgtm.so libgtm.so.1 libgtm.so.1.0 + + distclean: clean + + maintainer-clean: distclean diff --git a/src/gtm/common/aset.c b/src/gtm/common/aset.c new file mode 100644 index 0000000000..aa9533009a --- /dev/null +++ b/src/gtm/common/aset.c @@ -0,0 +1,1261 @@ + /*------------------------------------------------------------------------- + * + * aset.c + * Allocation set definitions. + * + * AllocSet is our standard implementation of the abstract MemoryContext + * type. + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/mmgr/aset.c,v 1.77 2008/04/11 22:54:23 tgl Exp $ + * + * NOTE: + * This is a new (Feb. 05, 1999) implementation of the allocation set + * routines. AllocSet...() does not use OrderedSet...() any more. + * Instead it manages allocations in a block pool by itself, combining + * many small allocations in a few bigger blocks. AllocSetFree() normally + * doesn't free() memory really. It just add's the free'd area to some + * list for later reuse by AllocSetAlloc(). All memory blocks are free()'d + * at once on AllocSetReset(), which happens when the memory context gets + * destroyed. + * Jan Wieck + * + * Performance improvement from Tom Lane, 8/99: for extremely large request + * sizes, we do want to be able to give the memory back to free() as soon + * as it is pfree()'d. Otherwise we risk tying up a lot of memory in + * freelist entries that might never be usable.
This is specially needed + * when the caller is repeatedly repalloc()'ing a block bigger and bigger; + * the previous instances of the block were guaranteed to be wasted until + * AllocSetReset() under the old way. + * + * Further improvement 12/00: as the code stood, request sizes in the + * midrange between "small" and "large" were handled very inefficiently, + * because any sufficiently large free chunk would be used to satisfy a + * request, even if it was much larger than necessary. This led to more + * and more wasted space in allocated chunks over time. To fix, get rid + * of the midrange behavior: we now handle only "small" power-of-2-size + * chunks as chunks. Anything "large" is passed off to malloc(). Change + * the number of freelists to change the small/large boundary. + * + * + * About CLOBBER_FREED_MEMORY: + * + * If this symbol is defined, all freed memory is overwritten with 0x7F's. + * This is useful for catching places that reference already-freed memory. + * + * About MEMORY_CONTEXT_CHECKING: + * + * Since we usually round request sizes up to the next power of 2, there + * is often some unused space immediately after a requested data area. + * Thus, if someone makes the common error of writing past what they've + * requested, the problem is likely to go unnoticed ... until the day when + * there *isn't* any wasted space, perhaps because of different memory + * alignment on a new platform, or some other effect. To catch this sort + * of problem, the MEMORY_CONTEXT_CHECKING option stores 0x7E just beyond + * the requested space whenever the request is less than the actual chunk + * size, and verifies that the byte is undamaged when the chunk is freed. 
+ * + *------------------------------------------------------------------------- + */ + +#include "gtm/gtm_c.h" +#include "gtm/memutils.h" +#include "gtm/elog.h" +#include "gtm/assert.h" +#include "gtm/gtm.h" + +/* Define this to detail debug alloc information */ +/* #define HAVE_ALLOCINFO */ + +/*-------------------- + * Chunk freelist k holds chunks of size 1 << (k + ALLOC_MINBITS), + * for k = 0 .. ALLOCSET_NUM_FREELISTS-1. + * + * Note that all chunks in the freelists have power-of-2 sizes. This + * improves recyclability: we may waste some space, but the wasted space + * should stay pretty constant as requests are made and released. + * + * A request too large for the last freelist is handled by allocating a + * dedicated block from malloc(). The block still has a block header and + * chunk header, but when the chunk is freed we'll return the whole block + * to malloc(), not put it on our freelists. + * + * CAUTION: ALLOC_MINBITS must be large enough so that + * 1<<ALLOC_MINBITS is at least MAXALIGN, + * or we may fail to align the smallest chunks adequately. + * 8-byte alignment is enough on all currently known machines. + * + * With the current parameters, request sizes up to 8K are treated as chunks, + * larger requests go into dedicated blocks. Change ALLOCSET_NUM_FREELISTS + * to adjust the boundary point. + *-------------------- + */ + +#define ALLOC_MINBITS 3 /* smallest chunk size is 8 bytes */ +#define ALLOCSET_NUM_FREELISTS 11 +#define ALLOC_CHUNK_LIMIT (1 << (ALLOCSET_NUM_FREELISTS-1+ALLOC_MINBITS)) +/* Size of largest chunk that we use a fixed size for */ + +/*-------------------- + * The first block allocated for an allocset has size initBlockSize. + * Each time we have to allocate another block, we double the block size + * (if possible, and without exceeding maxBlockSize), so as to reduce + * the bookkeeping load on malloc(). 
+ * + * Blocks allocated to hold oversize chunks do not follow this rule, however; + * they are just however big they need to be to hold that single chunk. + *-------------------- + */ + +#define ALLOC_BLOCKHDRSZ MAXALIGN(sizeof(AllocBlockData)) +#define ALLOC_CHUNKHDRSZ MAXALIGN(sizeof(AllocChunkData)) + +typedef struct AllocBlockData *AllocBlock; /* forward reference */ +typedef struct AllocChunkData *AllocChunk; + +/* + * AllocPointer + * Aligned pointer which may be a member of an allocation set. + */ +typedef void *AllocPointer; + +/* + * AllocSetContext is our standard implementation of MemoryContext. + * + * Note: isReset means there is nothing for AllocSetReset to do. This is + * different from the aset being physically empty (empty blocks list) because + * we may still have a keeper block. It's also different from the set being + * logically empty, because we don't attempt to detect pfree'ing the last + * active chunk. + */ +typedef struct AllocSetContext +{ + MemoryContextData header; /* Standard memory-context fields */ + /* Info about storage allocated in this context: */ + AllocBlock blocks; /* head of list of blocks in this set */ + AllocChunk freelist[ALLOCSET_NUM_FREELISTS]; /* free chunk lists */ + bool isReset; /* T = no space alloced since last reset */ + /* Allocation parameters for this context: */ + Size initBlockSize; /* initial block size */ + Size maxBlockSize; /* maximum block size */ + Size nextBlockSize; /* next block size to allocate */ + Size allocChunkLimit; /* effective chunk size limit */ + AllocBlock keeper; /* if not NULL, keep this block over resets */ +} AllocSetContext; + +typedef AllocSetContext *AllocSet; + +/* + * AllocBlock + * An AllocBlock is the unit of memory that is obtained by aset.c + * from malloc(). It contains one or more AllocChunks, which are + * the units requested by palloc() and freed by pfree(). 
AllocChunks + * cannot be returned to malloc() individually, instead they are put + * on freelists by pfree() and re-used by the next palloc() that has + * a matching request size. + * + * AllocBlockData is the header data for a block --- the usable space + * within the block begins at the next alignment boundary. + */ +typedef struct AllocBlockData +{ + AllocSet aset; /* aset that owns this block */ + AllocBlock next; /* next block in aset's blocks list */ + char *freeptr; /* start of free space in this block */ + char *endptr; /* end of space in this block */ +} AllocBlockData; + +/* + * AllocChunk + * The prefix of each piece of memory in an AllocBlock + * + * NB: this MUST match StandardChunkHeader as defined by utils/memutils.h. + */ +typedef struct AllocChunkData +{ + /* aset is the owning aset if allocated, or the freelist link if free */ + void *aset; + /* size is always the size of the usable space in the chunk */ + Size size; +#ifdef MEMORY_CONTEXT_CHECKING + /* when debugging memory usage, also store actual requested size */ + /* this is zero in a free chunk */ + Size requested_size; +#endif +} AllocChunkData; + +/* + * AllocPointerIsValid + * True iff pointer is valid allocation pointer. + */ +#define AllocPointerIsValid(pointer) PointerIsValid(pointer) + +/* + * AllocSetIsValid + * True iff set is valid allocation set. + */ +#define AllocSetIsValid(set) PointerIsValid(set) + +#define AllocPointerGetChunk(ptr) \ + ((AllocChunk)(((char *)(ptr)) - ALLOC_CHUNKHDRSZ)) +#define AllocChunkGetPointer(chk) \ + ((AllocPointer)(((char *)(chk)) + ALLOC_CHUNKHDRSZ)) + +/* + * These functions implement the MemoryContext API for AllocSet contexts. 
+ */ +static void *AllocSetAlloc(MemoryContext context, Size size); +static void AllocSetFree(MemoryContext context, void *pointer); +static void *AllocSetRealloc(MemoryContext context, void *pointer, Size size); +static void AllocSetInit(MemoryContext context); +static void AllocSetReset(MemoryContext context); +static void AllocSetDelete(MemoryContext context); +static Size AllocSetGetChunkSpace(MemoryContext context, void *pointer); +static bool AllocSetIsEmpty(MemoryContext context); +static void AllocSetStats(MemoryContext context, int level); + +#ifdef MEMORY_CONTEXT_CHECKING +static void AllocSetCheck(MemoryContext context); +#endif + +/* + * This is the virtual function table for AllocSet contexts. + */ +static MemoryContextMethods AllocSetMethods = { + AllocSetAlloc, + AllocSetFree, + AllocSetRealloc, + AllocSetInit, + AllocSetReset, + AllocSetDelete, + AllocSetGetChunkSpace, + AllocSetIsEmpty, + AllocSetStats +#ifdef MEMORY_CONTEXT_CHECKING + ,AllocSetCheck +#endif +}; + + +/* ---------- + * Debug macros + * ---------- + */ +#ifdef HAVE_ALLOCINFO +#define AllocFreeInfo(_cxt, _chunk) \ + fprintf(stderr, "AllocFree: %s: %p, %d\n", \ + (_cxt)->header.name, (_chunk), (_chunk)->size) +#define AllocAllocInfo(_cxt, _chunk) \ + fprintf(stderr, "AllocAlloc: %s: %p, %d\n", \ + (_cxt)->header.name, (_chunk), (_chunk)->size) +#else +#define AllocFreeInfo(_cxt, _chunk) +#define AllocAllocInfo(_cxt, _chunk) +#endif + +/* ---------- + * AllocSetFreeIndex - + * + * Depending on the size of an allocation compute which freechunk + * list of the alloc set it belongs to. Caller must have verified + * that size <= ALLOC_CHUNK_LIMIT. 
+ * ---------- + */ +static inline int +AllocSetFreeIndex(Size size) +{ + int idx = 0; + + if (size > 0) + { + size = (size - 1) >> ALLOC_MINBITS; + while (size != 0) + { + idx++; + size >>= 1; + } + Assert(idx < ALLOCSET_NUM_FREELISTS); + } + + return idx; +} + +#ifdef RANDOMIZE_ALLOCATED_MEMORY + +/* + * Fill a just-allocated piece of memory with "random" data. It's not really + * very random, just a repeating sequence with a length that's prime. What + * we mainly want out of it is to have a good probability that two palloc's + * of the same number of bytes start out containing different data. + */ +static void +randomize_mem(char *ptr, size_t size) +{ + static int save_ctr = 1; + int ctr; + + ctr = save_ctr; + while (size-- > 0) + { + *ptr++ = ctr; + if (++ctr > 251) + ctr = 1; + } + save_ctr = ctr; +} + +#endif /* RANDOMIZE_ALLOCATED_MEMORY */ + + +/* + * Public routines + */ + + +/* + * AllocSetContextCreate + * Create a new AllocSet context. + * + * parent: parent context, or NULL if top-level context + * name: name of context (for debugging --- string will be copied) + * minContextSize: minimum context size + * initBlockSize: initial allocation block size + * maxBlockSize: maximum allocation block size + */ +MemoryContext +AllocSetContextCreate(MemoryContext parent, + const char *name, + Size minContextSize, + Size initBlockSize, + Size maxBlockSize, + bool isShared) +{ + AllocSet context; + + /* Do the type-independent part of context creation */ + context = (AllocSet) MemoryContextCreate(sizeof(AllocSetContext), + &AllocSetMethods, + parent, + name); + + /* + * Make sure alloc parameters are reasonable, and save them. + * + * We somewhat arbitrarily enforce a minimum 1K block size. 
+ */ + initBlockSize = MAXALIGN(initBlockSize); + if (initBlockSize < 1024) + initBlockSize = 1024; + maxBlockSize = MAXALIGN(maxBlockSize); + if (maxBlockSize < initBlockSize) + maxBlockSize = initBlockSize; + context->initBlockSize = initBlockSize; + context->maxBlockSize = maxBlockSize; + context->nextBlockSize = initBlockSize; + + /* + * Compute the allocation chunk size limit for this context. It can't be + * more than ALLOC_CHUNK_LIMIT because of the fixed number of freelists. + * If maxBlockSize is small then requests exceeding the maxBlockSize + * should be treated as large chunks, too. We have to have + * allocChunkLimit a power of two, because the requested and + * actually-allocated sizes of any chunk must be on the same side of the + * limit, else we get confused about whether the chunk is "big". + */ + context->allocChunkLimit = ALLOC_CHUNK_LIMIT; + while (context->allocChunkLimit > + (Size) (maxBlockSize - ALLOC_BLOCKHDRSZ - ALLOC_CHUNKHDRSZ)) + context->allocChunkLimit >>= 1; + + /* + * Grab always-allocated space, if requested + */ + if (minContextSize > ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ) + { + Size blksize = MAXALIGN(minContextSize); + AllocBlock block; + + block = (AllocBlock) malloc(blksize); + if (block == NULL) + { + MemoryContextStats(TopMemoryContext); + ereport(ERROR, + (ENOMEM, + errmsg("out of memory"), + errdetail("Failed while creating memory context \"%s\".", + name))); + } + block->aset = context; + block->freeptr = ((char *) block) + ALLOC_BLOCKHDRSZ; + block->endptr = ((char *) block) + blksize; + block->next = context->blocks; + context->blocks = block; + /* Mark block as not to be released at reset time */ + context->keeper = block; + } + + context->isReset = true; + context->header.is_shared = isShared; + if (isShared) + GTM_RWLockInit(&context->header.lock); + + return (MemoryContext) context; +} + +/* + * AllocSetInit + * Context-type-specific initialization routine. 
+ * + * This is called by MemoryContextCreate() after setting up the + * generic MemoryContext fields and before linking the new context + * into the context tree. We must do whatever is needed to make the + * new context minimally valid for deletion. We must *not* risk + * failure --- thus, for example, allocating more memory is not cool. + * (AllocSetContextCreate can allocate memory when it gets control + * back, however.) + */ +static void +AllocSetInit(MemoryContext context) +{ + /* + * Since MemoryContextCreate already zeroed the context node, we don't + * have to do anything here: it's already OK. + */ +} + +/* + * AllocSetReset + * Frees all memory which is allocated in the given set. + * + * Actually, this routine has some discretion about what to do. + * It should mark all allocated chunks freed, but it need not necessarily + * give back all the resources the set owns. Our actual implementation is + * that we hang onto any "keeper" block specified for the set. In this way, + * we don't thrash malloc() when a context is repeatedly reset after small + * allocations, which is typical behavior for per-tuple contexts. 
+ */ +static void +AllocSetReset(MemoryContext context) +{ + AllocSet set = (AllocSet) context; + AllocBlock block; + + AssertArg(AllocSetIsValid(set)); + + if (MemoryContextIsShared(context)) + MemoryContextLock(context); + + /* Nothing to do if no pallocs since startup or last reset */ + if (set->isReset) + { + if (MemoryContextIsShared(context)) + MemoryContextUnlock(context); + return; + } + +#ifdef MEMORY_CONTEXT_CHECKING + /* Check for corruption and leaks before freeing */ + AllocSetCheck(context); +#endif + + /* Clear chunk freelists */ + MemSetAligned(set->freelist, 0, sizeof(set->freelist)); + + block = set->blocks; + + /* New blocks list is either empty or just the keeper block */ + set->blocks = set->keeper; + + while (block != NULL) + { + AllocBlock next = block->next; + + if (block == set->keeper) + { + /* Reset the block, but don't return it to malloc */ + char *datastart = ((char *) block) + ALLOC_BLOCKHDRSZ; + +#ifdef CLOBBER_FREED_MEMORY + /* Wipe freed memory for debugging purposes */ + memset(datastart, 0x7F, block->freeptr - datastart); +#endif + block->freeptr = datastart; + block->next = NULL; + } + else + { + /* Normal case, release the block */ +#ifdef CLOBBER_FREED_MEMORY + /* Wipe freed memory for debugging purposes */ + memset(block, 0x7F, block->freeptr - ((char *) block)); +#endif + free(block); + } + block = next; + } + + /* Reset block size allocation sequence, too */ + set->nextBlockSize = set->initBlockSize; + + set->isReset = true; + + if (MemoryContextIsShared(context)) + MemoryContextUnlock(context); +} + +/* + * AllocSetDelete + * Frees all memory which is allocated in the given set, + * in preparation for deletion of the set. + * + * Unlike AllocSetReset, this *must* free all resources of the set. + * But note we are not responsible for deleting the context node itself. 
+ */ +static void +AllocSetDelete(MemoryContext context) +{ + AllocSet set = (AllocSet) context; + AllocBlock block = set->blocks; + + AssertArg(AllocSetIsValid(set)); + + if (MemoryContextIsShared(context)) + MemoryContextLock(context); + +#ifdef MEMORY_CONTEXT_CHECKING + /* Check for corruption and leaks before freeing */ + AllocSetCheck(context); +#endif + + /* Make it look empty, just in case... */ + MemSetAligned(set->freelist, 0, sizeof(set->freelist)); + set->blocks = NULL; + set->keeper = NULL; + + while (block != NULL) + { + AllocBlock next = block->next; + +#ifdef CLOBBER_FREED_MEMORY + /* Wipe freed memory for debugging purposes */ + memset(block, 0x7F, block->freeptr - ((char *) block)); +#endif + free(block); + block = next; + } + + if (MemoryContextIsShared(context)) + MemoryContextUnlock(context); +} + +/* + * AllocSetAlloc + * Returns pointer to allocated memory of given size; memory is added + * to the set. + */ +static void * +AllocSetAlloc(MemoryContext context, Size size) +{ + AllocSet set = (AllocSet) context; + AllocBlock block; + AllocChunk chunk; + int fidx; + Size chunk_size; + Size blksize; + + AssertArg(AllocSetIsValid(set)); + + /* + * If this is a shared context, make it thread safe by acquiring + * appropriate lock + */ + if (MemoryContextIsShared(context)) + MemoryContextLock(context); + + /* + * If requested size exceeds maximum for chunks, allocate an entire block + * for this request. 
+ */ + if (size > set->allocChunkLimit) + { + chunk_size = MAXALIGN(size); + blksize = chunk_size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ; + block = (AllocBlock) malloc(blksize); + if (block == NULL) + { + MemoryContextStats(TopMemoryContext); + if (MemoryContextIsShared(context)) + MemoryContextUnlock(context); + ereport(ERROR, + (ENOMEM, + errmsg("out of memory"), + errdetail("Failed on request of size %lu.", + (unsigned long) size))); + } + block->aset = set; + block->freeptr = block->endptr = ((char *) block) + blksize; + + chunk = (AllocChunk) (((char *) block) + ALLOC_BLOCKHDRSZ); + chunk->aset = set; + chunk->size = chunk_size; +#ifdef MEMORY_CONTEXT_CHECKING + chunk->requested_size = size; + /* set mark to catch clobber of "unused" space */ + if (size < chunk_size) + ((char *) AllocChunkGetPointer(chunk))[size] = 0x7E; +#endif +#ifdef RANDOMIZE_ALLOCATED_MEMORY + /* fill the allocated space with junk */ + randomize_mem((char *) AllocChunkGetPointer(chunk), size); +#endif + + /* + * Stick the new block underneath the active allocation block, so that + * we don't lose the use of the space remaining therein. + */ + if (set->blocks != NULL) + { + block->next = set->blocks->next; + set->blocks->next = block; + } + else + { + block->next = NULL; + set->blocks = block; + } + + set->isReset = false; + + AllocAllocInfo(set, chunk); + if (MemoryContextIsShared(context)) + MemoryContextUnlock(context); + return AllocChunkGetPointer(chunk); + } + + /* + * Request is small enough to be treated as a chunk. Look in the + * corresponding free list to see if there is a free chunk we could reuse. + * If one is found, remove it from the free list, make it again a member + * of the alloc set and return its data address. 
+ */ + fidx = AllocSetFreeIndex(size); + chunk = set->freelist[fidx]; + if (chunk != NULL) + { + Assert(chunk->size >= size); + + set->freelist[fidx] = (AllocChunk) chunk->aset; + + chunk->aset = (void *) set; + +#ifdef MEMORY_CONTEXT_CHECKING + chunk->requested_size = size; + /* set mark to catch clobber of "unused" space */ + if (size < chunk->size) + ((char *) AllocChunkGetPointer(chunk))[size] = 0x7E; +#endif +#ifdef RANDOMIZE_ALLOCATED_MEMORY + /* fill the allocated space with junk */ + randomize_mem((char *) AllocChunkGetPointer(chunk), size); +#endif + + /* isReset must be false already */ + Assert(!set->isReset); + + AllocAllocInfo(set, chunk); + if (MemoryContextIsShared(context)) + MemoryContextUnlock(context); + return AllocChunkGetPointer(chunk); + } + + /* + * Choose the actual chunk size to allocate. + */ + chunk_size = (1 << ALLOC_MINBITS) << fidx; + Assert(chunk_size >= size); + + /* + * If there is enough room in the active allocation block, we will put the + * chunk into that block. Else must start a new one. + */ + if ((block = set->blocks) != NULL) + { + Size availspace = block->endptr - block->freeptr; + + if (availspace < (chunk_size + ALLOC_CHUNKHDRSZ)) + { + /* + * The existing active (top) block does not have enough room for + * the requested allocation, but it might still have a useful + * amount of space in it. Once we push it down in the block list, + * we'll never try to allocate more space from it. So, before we + * do that, carve up its free space into chunks that we can put on + * the set's freelists. + * + * Because we can only get here when there's less than + * ALLOC_CHUNK_LIMIT left in the block, this loop cannot iterate + * more than ALLOCSET_NUM_FREELISTS-1 times. 
+ */ + while (availspace >= ((1 << ALLOC_MINBITS) + ALLOC_CHUNKHDRSZ)) + { + Size availchunk = availspace - ALLOC_CHUNKHDRSZ; + int a_fidx = AllocSetFreeIndex(availchunk); + + /* + * In most cases, we'll get back the index of the next larger + * freelist than the one we need to put this chunk on. The + * exception is when availchunk is exactly a power of 2. + */ + if (availchunk != (1 << (a_fidx + ALLOC_MINBITS))) + { + a_fidx--; + Assert(a_fidx >= 0); + availchunk = (1 << (a_fidx + ALLOC_MINBITS)); + } + + chunk = (AllocChunk) (block->freeptr); + + block->freeptr += (availchunk + ALLOC_CHUNKHDRSZ); + availspace -= (availchunk + ALLOC_CHUNKHDRSZ); + + chunk->size = availchunk; +#ifdef MEMORY_CONTEXT_CHECKING + chunk->requested_size = 0; /* mark it free */ +#endif + chunk->aset = (void *) set->freelist[a_fidx]; + set->freelist[a_fidx] = chunk; + } + + /* Mark that we need to create a new block */ + block = NULL; + } + } + + /* + * Time to create a new regular (multi-chunk) block? + */ + if (block == NULL) + { + Size required_size; + + /* + * The first such block has size initBlockSize, and we double the + * space in each succeeding block, but not more than maxBlockSize. + */ + blksize = set->nextBlockSize; + set->nextBlockSize <<= 1; + if (set->nextBlockSize > set->maxBlockSize) + set->nextBlockSize = set->maxBlockSize; + + /* + * If initBlockSize is less than ALLOC_CHUNK_LIMIT, we could need more + * space... but try to keep it a power of 2. + */ + required_size = chunk_size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ; + while (blksize < required_size) + blksize <<= 1; + + /* Try to allocate it */ + block = (AllocBlock) malloc(blksize); + + /* + * We could be asking for pretty big blocks here, so cope if malloc + * fails. But give up if there's less than a meg or so available... 
+ */ + while (block == NULL && blksize > 1024 * 1024) + { + blksize >>= 1; + if (blksize < required_size) + break; + block = (AllocBlock) malloc(blksize); + } + + if (block == NULL) + { + MemoryContextStats(TopMemoryContext); + if (MemoryContextIsShared(context)) + MemoryContextUnlock(context); + ereport(ERROR, + (ENOMEM, + errmsg("out of memory"), + errdetail("Failed on request of size %lu.", + (unsigned long) size))); + } + + block->aset = set; + block->freeptr = ((char *) block) + ALLOC_BLOCKHDRSZ; + block->endptr = ((char *) block) + blksize; + + /* + * If this is the first block of the set, make it the "keeper" block. + * Formerly, a keeper block could only be created during context + * creation, but allowing it to happen here lets us have fast reset + * cycling even for contexts created with minContextSize = 0; that way + * we don't have to force space to be allocated in contexts that might + * never need any space. Don't mark an oversize block as a keeper, + * however. + */ + if (set->keeper == NULL && blksize == set->initBlockSize) + set->keeper = block; + + block->next = set->blocks; + set->blocks = block; + } + + /* + * OK, do the allocation + */ + chunk = (AllocChunk) (block->freeptr); + + block->freeptr += (chunk_size + ALLOC_CHUNKHDRSZ); + Assert(block->freeptr <= block->endptr); + + chunk->aset = (void *) set; + chunk->size = chunk_size; +#ifdef MEMORY_CONTEXT_CHECKING + chunk->requested_size = size; + /* set mark to catch clobber of "unused" space */ + if (size < chunk->size) + ((char *) AllocChunkGetPointer(chunk))[size] = 0x7E; +#endif +#ifdef RANDOMIZE_ALLOCATED_MEMORY + /* fill the allocated space with junk */ + randomize_mem((char *) AllocChunkGetPointer(chunk), size); +#endif + + set->isReset = false; + + AllocAllocInfo(set, chunk); + if (MemoryContextIsShared(context)) + MemoryContextUnlock(context); + return AllocChunkGetPointer(chunk); +} + +/* + * AllocSetFree + * Frees allocated memory; memory is removed from the set. 
+ */ +static void +AllocSetFree(MemoryContext context, void *pointer) +{ + AllocSet set = (AllocSet) context; + AllocChunk chunk = AllocPointerGetChunk(pointer); + + /* + * Acquire appropriate lock for a shared memory context + */ + if (MemoryContextIsShared(context)) + MemoryContextLock(context); + + AllocFreeInfo(set, chunk); + +#ifdef MEMORY_CONTEXT_CHECKING + /* Test for someone scribbling on unused space in chunk */ + if (chunk->requested_size < chunk->size) + if (((char *) pointer)[chunk->requested_size] != 0x7E) + elog(WARNING, "detected write past chunk end in %s %p", + set->header.name, chunk); +#endif + + if (chunk->size > set->allocChunkLimit) + { + /* + * Big chunks are certain to have been allocated as single-chunk + * blocks. Find the containing block and return it to malloc(). + */ + AllocBlock block = set->blocks; + AllocBlock prevblock = NULL; + + while (block != NULL) + { + if (chunk == (AllocChunk) (((char *) block) + ALLOC_BLOCKHDRSZ)) + break; + prevblock = block; + block = block->next; + } + if (block == NULL) + { + if (MemoryContextIsShared(context)) + MemoryContextUnlock(context); + elog(ERROR, "could not find block containing chunk %p", chunk); + } + /* let's just make sure chunk is the only one in the block */ + Assert(block->freeptr == ((char *) block) + + (chunk->size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ)); + + /* OK, remove block from aset's list and free it */ + if (prevblock == NULL) + set->blocks = block->next; + else + prevblock->next = block->next; +#ifdef CLOBBER_FREED_MEMORY + /* Wipe freed memory for debugging purposes */ + memset(block, 0x7F, block->freeptr - ((char *) block)); +#endif + free(block); + } + else + { + /* Normal case, put the chunk into appropriate freelist */ + int fidx = AllocSetFreeIndex(chunk->size); + + chunk->aset = (void *) set->freelist[fidx]; + +#ifdef CLOBBER_FREED_MEMORY + /* Wipe freed memory for debugging purposes */ + memset(pointer, 0x7F, chunk->size); +#endif + +#ifdef MEMORY_CONTEXT_CHECKING + 
/* Reset requested_size to 0 in chunks that are on freelist */ + chunk->requested_size = 0; +#endif + set->freelist[fidx] = chunk; + } + if (MemoryContextIsShared(context)) + MemoryContextUnlock(context); +} + +/* + * AllocSetRealloc + * Returns new pointer to allocated memory of given size; this memory + * is added to the set. Memory associated with given pointer is copied + * into the new memory, and the old memory is freed. + */ +static void * +AllocSetRealloc(MemoryContext context, void *pointer, Size size) +{ + AllocSet set = (AllocSet) context; + AllocChunk chunk = AllocPointerGetChunk(pointer); + Size oldsize = chunk->size; + + if (MemoryContextIsShared(context)) + MemoryContextLock(context); + +#ifdef MEMORY_CONTEXT_CHECKING + /* Test for someone scribbling on unused space in chunk */ + if (chunk->requested_size < oldsize) + if (((char *) pointer)[chunk->requested_size] != 0x7E) + elog(WARNING, "detected write past chunk end in %s %p", + set->header.name, chunk); +#endif + + /* isReset must be false already */ + Assert(!set->isReset); + + /* + * Chunk sizes are aligned to power of 2 in AllocSetAlloc(). Maybe the + * allocated area already is >= the new size. (In particular, we always + * fall out here if the requested size is a decrease.) + */ + if (oldsize >= size) + { +#ifdef MEMORY_CONTEXT_CHECKING +#ifdef RANDOMIZE_ALLOCATED_MEMORY + /* We can only fill the extra space if we know the prior request */ + if (size > chunk->requested_size) + randomize_mem((char *) AllocChunkGetPointer(chunk) + chunk->requested_size, + size - chunk->requested_size); +#endif + + chunk->requested_size = size; + /* set mark to catch clobber of "unused" space */ + if (size < oldsize) + ((char *) pointer)[size] = 0x7E; +#endif + if (MemoryContextIsShared(context)) + MemoryContextUnlock(context); + return pointer; + } + + if (oldsize > set->allocChunkLimit) + { + /* + * The chunk must have been allocated as a single-chunk block. 
Find + * the containing block and use realloc() to make it bigger with + * minimum space wastage. + */ + AllocBlock block = set->blocks; + AllocBlock prevblock = NULL; + Size chksize; + Size blksize; + + while (block != NULL) + { + if (chunk == (AllocChunk) (((char *) block) + ALLOC_BLOCKHDRSZ)) + break; + prevblock = block; + block = block->next; + } + if (block == NULL) + { + if (MemoryContextIsShared(context)) + MemoryContextUnlock(context); + elog(ERROR, "could not find block containing chunk %p", chunk); + } + /* let's just make sure chunk is the only one in the block */ + Assert(block->freeptr == ((char *) block) + + (chunk->size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ)); + + /* Do the realloc */ + chksize = MAXALIGN(size); + blksize = chksize + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ; + block = (AllocBlock) realloc(block, blksize); + if (block == NULL) + { + MemoryContextStats(TopMemoryContext); + if (MemoryContextIsShared(context)) + MemoryContextUnlock(context); + ereport(ERROR, + (ENOMEM, + errmsg("out of memory"), + errdetail("Failed on request of size %lu.", + (unsigned long) size))); + } + block->freeptr = block->endptr = ((char *) block) + blksize; + + /* Update pointers since block has likely been moved */ + chunk = (AllocChunk) (((char *) block) + ALLOC_BLOCKHDRSZ); + if (prevblock == NULL) + set->blocks = block; + else + prevblock->next = block; + chunk->size = chksize; + +#ifdef MEMORY_CONTEXT_CHECKING +#ifdef RANDOMIZE_ALLOCATED_MEMORY + /* We can only fill the extra space if we know the prior request */ + randomize_mem((char *) AllocChunkGetPointer(chunk) + chunk->requested_size, + size - chunk->requested_size); +#endif + + chunk->requested_size = size; + /* set mark to catch clobber of "unused" space */ + if (size < chunk->size) + ((char *) AllocChunkGetPointer(chunk))[size] = 0x7E; +#endif + + if (MemoryContextIsShared(context)) + MemoryContextUnlock(context); + return AllocChunkGetPointer(chunk); + } + else + { + /* + * Small-chunk case. 
We just do this by brute force, ie, allocate a + * new chunk and copy the data. Since we know the existing data isn't + * huge, this won't involve any great memcpy expense, so it's not + * worth being smarter. (At one time we tried to avoid memcpy when it + * was possible to enlarge the chunk in-place, but that turns out to + * misbehave unpleasantly for repeated cycles of + * palloc/repalloc/pfree: the eventually freed chunks go into the + * wrong freelist for the next initial palloc request, and so we leak + * memory indefinitely. See pgsql-hackers archives for 2007-08-11.) + */ + AllocPointer newPointer; + + if (MemoryContextIsShared(context)) + MemoryContextUnlock(context); + /* allocate new chunk */ + newPointer = AllocSetAlloc((MemoryContext) set, size); + + /* transfer existing data (certain to fit) */ + memcpy(newPointer, pointer, oldsize); + + /* free old chunk */ + AllocSetFree((MemoryContext) set, pointer); + + return newPointer; + } +} + +/* + * AllocSetGetChunkSpace + * Given a currently-allocated chunk, determine the total space + * it occupies (including all memory-allocation overhead). + */ +static Size +AllocSetGetChunkSpace(MemoryContext context, void *pointer) +{ + AllocChunk chunk = AllocPointerGetChunk(pointer); + + return chunk->size + ALLOC_CHUNKHDRSZ; +} + +/* + * AllocSetIsEmpty + * Is an allocset empty of any allocated space? + */ +static bool +AllocSetIsEmpty(MemoryContext context) +{ + AllocSet set = (AllocSet) context; + bool ret = false; + + if (MemoryContextIsShared(context)) + MemoryContextLock(context); + /* + * For now, we say "empty" only if the context is new or just reset. We + * could examine the freelists to determine if all space has been freed, + * but it's not really worth the trouble for present uses of this + * functionality. 
+ */ + if (set->isReset) + ret = true; + + if (MemoryContextIsShared(context)) + MemoryContextUnlock(context); + return ret; +} + +/* + * AllocSetStats + * Displays stats about memory consumption of an allocset. + */ +static void +AllocSetStats(MemoryContext context, int level) +{ + AllocSet set = (AllocSet) context; + long nblocks = 0; + long nchunks = 0; + long totalspace = 0; + long freespace = 0; + AllocBlock block; + AllocChunk chunk; + int fidx; + int i; + + /* + * XXX The caller is most likely holding a lock for shared contextes. So + * don't bother to lock it again (this might cause problem some time, so + * revisit this later) + */ + for (block = set->blocks; block != NULL; block = block->next) + { + nblocks++; + totalspace += block->endptr - ((char *) block); + freespace += block->endptr - block->freeptr; + } + for (fidx = 0; fidx < ALLOCSET_NUM_FREELISTS; fidx++) + { + for (chunk = set->freelist[fidx]; chunk != NULL; + chunk = (AllocChunk) chunk->aset) + { + nchunks++; + freespace += chunk->size + ALLOC_CHUNKHDRSZ; + } + } + + for (i = 0; i < level; i++) + fprintf(stderr, " "); + + fprintf(stderr, + "%s: %lu total in %ld blocks; %lu free (%ld chunks); %lu used\n", + set->header.name, totalspace, nblocks, freespace, nchunks, + totalspace - freespace); +} + + +#ifdef MEMORY_CONTEXT_CHECKING + +/* + * AllocSetCheck + * Walk through chunks and check consistency of memory. + * + * NOTE: report errors as WARNING, *not* ERROR or FATAL. Otherwise you'll + * find yourself in an infinite loop when trouble occurs, because this + * routine will be entered again when elog cleanup tries to release memory! + */ +static void +AllocSetCheck(MemoryContext context) +{ + AllocSet set = (AllocSet) context; + char *name = set->header.name; + AllocBlock block; + + /* + * XXX The caller is most likely holding a lock for shared contextes. 
So + * don't bother to lock it again (this might cause problem some time, so + * revisit this later) + */ + for (block = set->blocks; block != NULL; block = block->next) + { + char *bpoz = ((char *) block) + ALLOC_BLOCKHDRSZ; + long blk_used = block->freeptr - bpoz; + long blk_data = 0; + long nchunks = 0; + + /* + * Empty block - empty can be keeper-block only + */ + if (!blk_used) + { + if (set->keeper != block) + elog(WARNING, "problem in alloc set %s: empty block %p", + name, block); + } + + /* + * Chunk walker + */ + while (bpoz < block->freeptr) + { + AllocChunk chunk = (AllocChunk) bpoz; + Size chsize, + dsize; + char *chdata_end; + + chsize = chunk->size; /* aligned chunk size */ + dsize = chunk->requested_size; /* real data */ + chdata_end = ((char *) chunk) + (ALLOC_CHUNKHDRSZ + dsize); + + /* + * Check chunk size + */ + if (dsize > chsize) + elog(WARNING, "problem in alloc set %s: req size > alloc size for chunk %p in block %p", + name, chunk, block); + if (chsize < (1 << ALLOC_MINBITS)) + elog(WARNING, "problem in alloc set %s: bad size %lu for chunk %p in block %p", + name, (unsigned long) chsize, chunk, block); + + /* single-chunk block? */ + if (chsize > set->allocChunkLimit && + chsize + ALLOC_CHUNKHDRSZ != blk_used) + elog(WARNING, "problem in alloc set %s: bad single-chunk %p in block %p", + name, chunk, block); + + /* + * If chunk is allocated, check for correct aset pointer. (If it's + * free, the aset is the freelist pointer, which we can't check as + * easily...) 
+ */ + if (dsize > 0 && chunk->aset != (void *) set) + elog(WARNING, "problem in alloc set %s: bogus aset link in block %p, chunk %p", + name, block, chunk); + + /* + * Check for overwrite of "unallocated" space in chunk + */ + if (dsize > 0 && dsize < chsize && *chdata_end != 0x7E) + elog(WARNING, "problem in alloc set %s: detected write past chunk end in block %p, chunk %p", + name, block, chunk); + + blk_data += chsize; + nchunks++; + + bpoz += ALLOC_CHUNKHDRSZ + chsize; + } + + if ((blk_data + (nchunks * ALLOC_CHUNKHDRSZ)) != blk_used) + elog(WARNING, "problem in alloc set %s: found inconsistent memory block %p", + name, block); + } +} + +#endif /* MEMORY_CONTEXT_CHECKING */ diff --git a/src/gtm/common/assert.c b/src/gtm/common/assert.c new file mode 100644 index 0000000000..58b94481b3 --- /dev/null +++ b/src/gtm/common/assert.c @@ -0,0 +1,54 @@ +/*------------------------------------------------------------------------- + * + * assert.c + * Assert code. + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/error/assert.c,v 1.35 2008/01/01 19:45:53 momjian Exp $ + * + * NOTE + * This should eventually work with elog() + * + *------------------------------------------------------------------------- + */ +#include "gtm/gtm_c.h" +#include "gtm/assert.h" + +#include <unistd.h> + +bool assert_enabled = false; + +/* + * ExceptionalCondition - Handles the failure of an Assert() + * + * Note: this can't actually return, but we declare it as returning int + * because the TrapMacro() macro might get wonky otherwise. 
+ */ +int +ExceptionalCondition(const char *conditionName, + const char *errorType, + const char *fileName, + int lineNumber) +{ + if (!PointerIsValid(conditionName) + || !PointerIsValid(fileName) + || !PointerIsValid(errorType)) + fprintf(stderr, "TRAP: ExceptionalCondition: bad arguments\n"); + else + { + fprintf(stderr, "TRAP: %s(\"%s\", File: \"%s\", Line: %d)\n", + errorType, conditionName, + fileName, lineNumber); + } + + /* Usually this shouldn't be needed, but make sure the msg went out */ + fflush(stderr); + + abort(); + return 0; +} diff --git a/src/gtm/common/elog.c b/src/gtm/common/elog.c new file mode 100644 index 0000000000..626dc36925 --- /dev/null +++ b/src/gtm/common/elog.c @@ -0,0 +1,1117 @@ +/*------------------------------------------------------------------------- + * + * elog.c + * error logging and reporting + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/error/elog.c,v 1.212 2009/01/19 15:34:23 mha Exp $ + * + *------------------------------------------------------------------------- + */ +#include <fcntl.h> +#include <time.h> +#include <unistd.h> +#include <signal.h> +#include <ctype.h> +#include "gtm/gtm_c.h" +#include "gtm/gtm.h" +#include "gtm/gtm_msg.h" +#include "gtm/stringinfo.h" +#include "gtm/memutils.h" +#include "gtm/elog.h" +#include "gtm/assert.h" +#include "gtm/gtm_ext.h" +#include "gtm/libpq.h" +#include "gtm/pqformat.h" + +#undef _ +#define _(x) x + +/* + * Change this to something which is more appropriate. 
+ * + * XXX The GTM should take command like argument to set the log file + */ +char *GTMLogFile = NULL; + +/* GUC parameters */ +int Log_destination = LOG_DESTINATION_STDERR; + +/* Macro for checking errordata_stack_depth is reasonable */ +#define CHECK_STACK_DEPTH() \ + do { \ + if (errordata_stack_depth < 0) \ + { \ + errordata_stack_depth = -1; \ + ereport(ERROR, (errmsg_internal("errstart was not called"))); \ + } \ + } while (0) + + +static void send_message_to_server_log(ErrorData *edata); +static void send_message_to_frontend(Port *myport, ErrorData *edata); +static char *expand_fmt_string(const char *fmt, ErrorData *edata); +static const char *useful_strerror(int errnum); +static const char *error_severity(int elevel); +static void append_with_tabs(StringInfo buf, const char *str); +static bool is_log_level_output(int elevel, int log_min_level); + +int log_min_messages = WARNING; +char *Log_line_prefix = "%l:%p:%m -"; /* format for extra log line info */ + +#define FORMATTED_TS_LEN 128 +static char formatted_start_time[FORMATTED_TS_LEN]; +static char formatted_log_time[FORMATTED_TS_LEN]; + +static void log_line_prefix(StringInfo buf); +static void setup_formatted_log_time(void); +/* + * setup formatted_log_time, for consistent times between CSV and regular logs + */ +static void +setup_formatted_log_time(void) +{ + struct timeval tv; + time_t stamp_time; + char msbuf[8]; + + gettimeofday(&tv, NULL); + stamp_time = (time_t) tv.tv_sec; + + strftime(formatted_log_time, FORMATTED_TS_LEN, + /* leave room for milliseconds... */ + "%Y-%m-%d %H:%M:%S %Z", + localtime(&stamp_time)); + + /* 'paste' milliseconds into place... */ + sprintf(msbuf, ".%03d", (int) (tv.tv_usec / 1000)); + strncpy(formatted_log_time + 19, msbuf, 4); +} + +/* + * Format tag info for log lines; append to the provided buffer. 
+ */ +static void +log_line_prefix(StringInfo buf) +{ + /* static counter for line numbers */ + static long log_line_number = 0; + + /* has counter been reset in current process? */ + static int log_my_pid = 0; + + int format_len; + int i; + + /* + * This is one of the few places where we'd rather not inherit a static + * variable's value from the postmaster. But since we will, reset it when + * MyProcPid changes. MyStartTime also changes when MyProcPid does, so + * reset the formatted start timestamp too. + */ + if (log_my_pid != MyThreadID) + { + log_line_number = 0; + log_my_pid = MyThreadID; + formatted_start_time[0] = '\0'; + } + log_line_number++; + + if (Log_line_prefix == NULL) + return; /* in case guc hasn't run yet */ + + format_len = strlen(Log_line_prefix); + + for (i = 0; i < format_len; i++) + { + if (Log_line_prefix[i] != '%') + { + /* literal char, just copy */ + appendStringInfoChar(buf, Log_line_prefix[i]); + continue; + } + /* go to char after '%' */ + i++; + if (i >= format_len) + break; /* format error - ignore it */ + + /* process the option */ + switch (Log_line_prefix[i]) + { + case 'p': + appendStringInfo(buf, "%lu", MyThreadID); + break; + case 'l': + appendStringInfo(buf, "%ld", log_line_number); + break; + case 'm': + setup_formatted_log_time(); + appendStringInfoString(buf, formatted_log_time); + break; + default: + /* format error - ignore it */ + break; + } + } +} + +/* + * errstart --- begin an error-reporting cycle + * + * Create a stack entry and store the given parameters in it. Subsequently, + * errmsg() and perhaps other routines will be called to further populate + * the stack entry. Finally, errfinish() will be called to actually process + * the error report. + * + * Returns TRUE in normal case. Returns FALSE to short-circuit the error + * report (if it's a warning or lower and not to be reported anywhere). 
+ */ +bool +errstart(int elevel, const char *filename, int lineno, + const char *funcname, const char *domain) +{ + ErrorData *edata; + bool output_to_server; + bool output_to_client = false; + int i; + + /* + * Check some cases in which we want to promote an error into a more + * severe error. None of this logic applies for non-error messages. + */ + if (elevel >= ERROR) + { + /* + * If we are inside a critical section, all errors become PANIC + * errors. See miscadmin.h. + */ + if (CritSectionCount > 0) + elevel = PANIC; + + /* + * Check reasons for treating ERROR as FATAL: + * + * 1. we have no handler to pass the error to (implies we are in the + * postmaster or in backend startup). + * + * 2. ExitOnAnyError mode switch is set (initdb uses this). + * + * 3. the error occurred after proc_exit has begun to run. (It's + * proc_exit's responsibility to see that this doesn't turn into + * infinite recursion!) + */ + if (elevel == ERROR) + { + if (PG_exception_stack == NULL) + elevel = FATAL; + } + + /* + * If the error level is ERROR or more, errfinish is not going to + * return to caller; therefore, if there is any stacked error already + * in progress it will be lost. This is more or less okay, except we + * do not want to have a FATAL or PANIC error downgraded because the + * reporting process was interrupted by a lower-grade error. So check + * the stack and make sure we panic if panic is warranted. + */ + for (i = 0; i <= errordata_stack_depth; i++) + elevel = Max(elevel, errordata[i].elevel); + } + + output_to_server = is_log_level_output(elevel, log_min_messages); + output_to_client = (elevel >= ERROR); + + /* Skip processing effort if non-error message will not be output */ + if (elevel < ERROR && !output_to_server && !output_to_client) + return false; + + /* + * Okay, crank up a stack entry to store the info in. + */ + + if (recursion_depth++ > 0 && elevel >= ERROR) + { + /* + * Ooops, error during error processing. 
Clear ErrorContext as + * discussed at top of file. We will not return to the original + * error's reporter or handler, so we don't need it. + */ + MemoryContextReset(ErrorContext); + } + + if (++errordata_stack_depth >= ERRORDATA_STACK_SIZE) + { + /* + * Wups, stack not big enough. We treat this as a PANIC condition + * because it suggests an infinite loop of errors during error + * recovery. + */ + errordata_stack_depth = -1; /* make room on stack */ + ereport(PANIC, (errmsg_internal("ERRORDATA_STACK_SIZE exceeded"))); + } + /* Initialize data for this error frame */ + edata = &errordata[errordata_stack_depth]; + MemSet(edata, 0, sizeof(ErrorData)); + edata->elevel = elevel; + edata->output_to_server = output_to_server; + edata->output_to_client = output_to_client; + edata->filename = filename; + edata->lineno = lineno; + edata->funcname = funcname; + /* errno is saved here so that error parameter eval can't change it */ + edata->saved_errno = errno; + + recursion_depth--; + return true; +} + +/* + * errfinish --- end an error-reporting cycle + * + * Produce the appropriate error report(s) and pop the error stack. + * + * If elevel is ERROR or worse, control does not return to the caller. + * See elog.h for the error level definitions. + */ +void +errfinish(int dummy,...) +{ + ErrorData *edata = &errordata[errordata_stack_depth]; + int elevel = edata->elevel; + + MemoryContext oldcontext; + recursion_depth++; + CHECK_STACK_DEPTH(); + + /* + * Do processing in ErrorContext, which we hope has enough reserved space + * to report an error. + */ + oldcontext = MemoryContextSwitchTo(ErrorContext); + + + /* + * If ERROR (not more nor less) we pass it off to the current handler. + * Printing it and popping the stack is the responsibility of the handler. + */ + if (elevel == ERROR) + { + /* + * We do some minimal cleanup before longjmp'ing so that handlers can + * execute in a reasonably sane state. + */ + CritSectionCount = 0; /* should be unnecessary, but... 
*/ + + /* + * Note that we leave CurrentMemoryContext set to ErrorContext. The + * handler should reset it to something else soon. + */ + + recursion_depth--; + PG_RE_THROW(); + } + + /* Emit the message to the right places */ + EmitErrorReport(MyPort); + + /* Now free up subsidiary data attached to stack entry, and release it */ + if (edata->message) + pfree(edata->message); + if (edata->detail) + pfree(edata->detail); + if (edata->detail_log) + pfree(edata->detail_log); + if (edata->hint) + pfree(edata->hint); + if (edata->context) + pfree(edata->context); + errordata_stack_depth--; + + /* Exit error-handling context */ + MemoryContextSwitchTo(oldcontext); + recursion_depth--; + + /* + * Perform error recovery action as specified by elevel. + */ + if (elevel == FATAL) + { + /* + * fflush here is just to improve the odds that we get to see the + * error message, in case things are so hosed that proc_exit crashes. + * Any other code you might be tempted to add here should probably be + * in an on_proc_exit or on_shmem_exit callback instead. + */ + fflush(stdout); + fflush(stderr); + + /* + * Do normal process-exit cleanup, then return exit code 1 to indicate + * FATAL termination. The postmaster may or may not consider this + * worthy of panic, depending on which subprocess returns it. + */ + pthread_exit(NULL); + } + + if (elevel >= PANIC) + { + fflush(stdout); + fflush(stderr); + abort(); + } + + /* + * We reach here if elevel <= WARNING. OK to return to caller. + */ +} + +/* + * This macro handles expansion of a format string and associated parameters; + * it's common code for errmsg(), errdetail(), etc. Must be called inside + * a routine that is declared like "const char *fmt, ..." and has an edata + * pointer set up. The message is assigned to edata->targetfield, or + * appended to it if appendval is true. The message is subject to translation + * if translateit is true. 
+ * + * Note: we pstrdup the buffer rather than just transferring its storage + * to the edata field because the buffer might be considerably larger than + * really necessary. + */ +#define EVALUATE_MESSAGE(targetfield, appendval, translateit) \ + { \ + char *fmtbuf; \ + StringInfoData buf; \ + /* Expand %m in format string */ \ + fmtbuf = expand_fmt_string(fmt, edata); \ + initStringInfo(&buf); \ + if ((appendval) && edata->targetfield) \ + appendStringInfo(&buf, "%s\n", edata->targetfield); \ + /* Generate actual output --- have to use appendStringInfoVA */ \ + for (;;) \ + { \ + va_list args; \ + bool success; \ + va_start(args, fmt); \ + success = appendStringInfoVA(&buf, fmtbuf, args); \ + va_end(args); \ + if (success) \ + break; \ + enlargeStringInfo(&buf, buf.maxlen); \ + } \ + /* Done with expanded fmt */ \ + pfree(fmtbuf); \ + /* Save the completed message into the stack item */ \ + if (edata->targetfield) \ + pfree(edata->targetfield); \ + edata->targetfield = pstrdup(buf.data); \ + pfree(buf.data); \ + } + + +/* + * errmsg --- add a primary error message text to the current error + * + * In addition to the usual %-escapes recognized by printf, "%m" in + * fmt is replaced by the error message for the caller's value of errno. + * + * Note: no newline is needed at the end of the fmt string, since + * ereport will provide one for the output methods that need it. + */ +int +errmsg(const char *fmt,...) 
+{ + ErrorData *edata = &errordata[errordata_stack_depth]; + MemoryContext oldcontext; + + recursion_depth++; + CHECK_STACK_DEPTH(); + oldcontext = MemoryContextSwitchTo(ErrorContext); + + EVALUATE_MESSAGE(message, false, true); + + MemoryContextSwitchTo(oldcontext); + recursion_depth--; + return 0; /* return value does not matter */ +} + + +/* + * errmsg_internal --- add a primary error message text to the current error + * + * This is exactly like errmsg() except that strings passed to errmsg_internal + * are not translated, and are customarily left out of the + * internationalization message dictionary. This should be used for "can't + * happen" cases that are probably not worth spending translation effort on. + * We also use this for certain cases where we *must* not try to translate + * the message because the translation would fail and result in infinite + * error recursion. + */ +int +errmsg_internal(const char *fmt,...) +{ + ErrorData *edata = &errordata[errordata_stack_depth]; + MemoryContext oldcontext; + + recursion_depth++; + CHECK_STACK_DEPTH(); + oldcontext = MemoryContextSwitchTo(ErrorContext); + + EVALUATE_MESSAGE(message, false, false); + + MemoryContextSwitchTo(oldcontext); + recursion_depth--; + return 0; /* return value does not matter */ +} + + +/* + * errdetail --- add a detail error message text to the current error + */ +int +errdetail(const char *fmt,...) +{ + ErrorData *edata = &errordata[errordata_stack_depth]; + MemoryContext oldcontext; + + recursion_depth++; + CHECK_STACK_DEPTH(); + oldcontext = MemoryContextSwitchTo(ErrorContext); + + EVALUATE_MESSAGE(detail, false, true); + + MemoryContextSwitchTo(oldcontext); + recursion_depth--; + return 0; /* return value does not matter */ +} + + +/* + * errdetail_log --- add a detail_log error message text to the current error + */ +int +errdetail_log(const char *fmt,...) 
+{ + ErrorData *edata = &errordata[errordata_stack_depth]; + MemoryContext oldcontext; + + recursion_depth++; + CHECK_STACK_DEPTH(); + oldcontext = MemoryContextSwitchTo(ErrorContext); + + EVALUATE_MESSAGE(detail_log, false, true); + + MemoryContextSwitchTo(oldcontext); + recursion_depth--; + return 0; /* return value does not matter */ +} + + +/* + * errhint --- add a hint error message text to the current error + */ +int +errhint(const char *fmt,...) +{ + ErrorData *edata = &errordata[errordata_stack_depth]; + MemoryContext oldcontext; + + recursion_depth++; + CHECK_STACK_DEPTH(); + oldcontext = MemoryContextSwitchTo(ErrorContext); + + EVALUATE_MESSAGE(hint, false, true); + + MemoryContextSwitchTo(oldcontext); + recursion_depth--; + return 0; /* return value does not matter */ +} + + + +/* + * errfunction --- add reporting function name to the current error + * + * This is used when backwards compatibility demands that the function + * name appear in messages sent to old-protocol clients. Note that the + * passed string is expected to be a non-freeable constant string. + */ +int +errfunction(const char *funcname) +{ + ErrorData *edata = &errordata[errordata_stack_depth]; + + + edata->funcname = funcname; + edata->show_funcname = true; + + return 0; /* return value does not matter */ +} + + +/* + * elog_start --- startup for old-style API + * + * All that we do here is stash the hidden filename/lineno/funcname + * arguments into a stack entry. + * + * We need this to be separate from elog_finish because there's no other + * portable way to deal with inserting extra arguments into the elog call. + * (If macros with variable numbers of arguments were portable, it'd be + * easy, but they aren't.) + */ +void +elog_start(const char *filename, int lineno, const char *funcname) +{ + ErrorData *edata; + + if (++errordata_stack_depth >= ERRORDATA_STACK_SIZE) + { + /* + * Wups, stack not big enough. 
We treat this as a PANIC condition + * because it suggests an infinite loop of errors during error + * recovery. Note that the message is intentionally not localized, + * else failure to convert it to client encoding could cause further + * recursion. + */ + errordata_stack_depth = -1; /* make room on stack */ + ereport(PANIC, (errmsg_internal("ERRORDATA_STACK_SIZE exceeded"))); + } + + edata = &errordata[errordata_stack_depth]; + edata->filename = filename; + edata->lineno = lineno; + edata->funcname = funcname; + /* errno is saved now so that error parameter eval can't change it */ + edata->saved_errno = errno; +} + +/* + * elog_finish --- finish up for old-style API + */ +void +elog_finish(int elevel, const char *fmt,...) +{ + ErrorData *edata = &errordata[errordata_stack_depth]; + MemoryContext oldcontext; + + CHECK_STACK_DEPTH(); + + /* + * Do errstart() to see if we actually want to report the message. + */ + errordata_stack_depth--; + errno = edata->saved_errno; + if (!errstart(elevel, edata->filename, edata->lineno, edata->funcname, NULL)) + return; /* nothing to do */ + + /* + * Format error message just like errmsg_internal(). + */ + recursion_depth++; + oldcontext = MemoryContextSwitchTo(ErrorContext); + + EVALUATE_MESSAGE(message, false, false); + + MemoryContextSwitchTo(oldcontext); + recursion_depth--; + + /* + * And let errfinish() finish up. + */ + errfinish(0); +} + +/* + * Actual output of the top-of-stack error message + * + * In the ereport(ERROR) case this is called from GTM_ThreadMain(or not at all, + * if the error is caught by somebody). For all other severity levels this + * is called by errfinish. 
+ */ +void +EmitErrorReport(void *argp) +{ + ErrorData *edata = &errordata[errordata_stack_depth]; + Port *myport= (Port *)argp; + MemoryContext oldcontext; + + recursion_depth++; + CHECK_STACK_DEPTH(); + oldcontext = MemoryContextSwitchTo(ErrorContext); + + /* Send to server log, if enabled */ + if (edata->output_to_server) + send_message_to_server_log(edata); + + /* Send to client, if enabled */ + if ((edata->output_to_client) && (myport != NULL)) + send_message_to_frontend(myport, edata); + + MemoryContextSwitchTo(oldcontext); + recursion_depth--; +} + +/* + * FlushErrorState --- flush the error state after error recovery + * + * This should be called by an error handler after it's done processing + * the error; or as soon as it's done CopyErrorData, if it intends to + * do stuff that is likely to provoke another error. You are not "out" of + * the error subsystem until you have done this. + */ +void +FlushErrorState(void) +{ + /* + * Reset stack to empty. The only case where it would be more than one + * deep is if we serviced an error that interrupted construction of + * another message. We assume control escaped out of that message + * construction and won't ever go back. + */ + errordata_stack_depth = -1; + recursion_depth = 0; + /* Delete all data in ErrorContext */ + MemoryContextResetAndDeleteChildren(ErrorContext); +} + + + +/* + * pg_re_throw --- out-of-line implementation of PG_RE_THROW() macro + */ +void +pg_re_throw(void) +{ + /* If possible, throw the error to the next outer setjmp handler */ + if (PG_exception_stack != NULL) + siglongjmp(*PG_exception_stack, 1); + else + { + /* + * If we get here, elog(ERROR) was thrown inside a PG_TRY block, which + * we have now exited only to discover that there is no outer setjmp + * handler to pass the error to. Had the error been thrown outside + * the block to begin with, we'd have promoted the error to FATAL, so + * the correct behavior is to make it FATAL now; that is, emit it and + * then call proc_exit. 
+ */ + ErrorData *edata = &errordata[errordata_stack_depth]; + + Assert(errordata_stack_depth >= 0); + Assert(edata->elevel == ERROR); + edata->elevel = FATAL; + + /* + * At least in principle, the increase in severity could have changed + * where-to-output decisions, so recalculate. This should stay in + * sync with errstart(), which see for comments. + */ + edata->output_to_server = is_log_level_output(FATAL, + log_min_messages); + edata->output_to_client = true; + errfinish(0); + } + + /* We mustn't return... */ + ExceptionalCondition("pg_re_throw tried to return", "FailedAssertion", + __FILE__, __LINE__); + + /* + * Since ExceptionalCondition isn't declared noreturn because of + * TrapMacro(), we need this to keep gcc from complaining. + */ + abort(); +} + + +/* + * Initialization of error output file + */ +void +DebugFileOpen(void) +{ + int fd, + istty; + + if (GTMLogFile[0]) + { + /* + * A debug-output file name was given. + * + * Make sure we can write the file, and find out if it's a tty. + */ + if ((fd = open(GTMLogFile, O_CREAT | O_APPEND | O_WRONLY, + 0666)) < 0) + ereport(FATAL, + (errno, + errmsg("could not open file \"%s\": %m", GTMLogFile))); + istty = isatty(fd); + close(fd); + + /* + * Redirect our stderr to the debug output file. + */ + if (!freopen(GTMLogFile, "a", stderr)) + ereport(FATAL, + (errno, + errmsg("could not reopen file \"%s\" as stderr: %m", + GTMLogFile))); + + /* + * If the file is a tty and we're running under the postmaster, try to + * send stdout there as well (if it isn't a tty then stderr will block + * out stdout, so we may as well let stdout go wherever it was going + * before). 
+ */ + if (istty) + if (!freopen(GTMLogFile, "a", stdout)) + ereport(FATAL, + (errno, + errmsg("could not reopen file \"%s\" as stdout: %m", + GTMLogFile))); + } +} + +/* + * Write error report to server's log + */ +static void +send_message_to_server_log(ErrorData *edata) +{ + StringInfoData buf; + + initStringInfo(&buf); + + formatted_log_time[0] = '\0'; + + log_line_prefix(&buf); + appendStringInfo(&buf, "%s: ", error_severity(edata->elevel)); + + if (edata->message) + append_with_tabs(&buf, edata->message); + else + append_with_tabs(&buf, _("missing error text")); + + appendStringInfoChar(&buf, '\n'); + + if (edata->detail_log) + { + log_line_prefix(&buf); + appendStringInfoString(&buf, _("DETAIL: ")); + append_with_tabs(&buf, edata->detail_log); + appendStringInfoChar(&buf, '\n'); + } + else if (edata->detail) + { + log_line_prefix(&buf); + appendStringInfoString(&buf, _("DETAIL: ")); + append_with_tabs(&buf, edata->detail); + appendStringInfoChar(&buf, '\n'); + } + if (edata->hint) + { + log_line_prefix(&buf); + appendStringInfoString(&buf, _("HINT: ")); + append_with_tabs(&buf, edata->hint); + appendStringInfoChar(&buf, '\n'); + } + if (edata->context) + { + log_line_prefix(&buf); + appendStringInfoString(&buf, _("CONTEXT: ")); + append_with_tabs(&buf, edata->context); + appendStringInfoChar(&buf, '\n'); + } + + /* assume no newlines in funcname or filename... 
*/ + if (edata->funcname && edata->filename) + { + appendStringInfo(&buf, _("LOCATION: %s, %s:%d\n"), + edata->funcname, edata->filename, + edata->lineno); + } + else if (edata->filename) + { + appendStringInfo(&buf, _("LOCATION: %s:%d\n"), + edata->filename, edata->lineno); + } + + /* Write to stderr, if enabled */ + if (Log_destination & LOG_DESTINATION_STDERR) + write(fileno(stderr), buf.data, buf.len); + + pfree(buf.data); +} + +/* + * Write error report to client + */ +static void +send_message_to_frontend(Port *myport, ErrorData *edata) +{ + StringInfoData msgbuf; + + /* 'N' (Notice) is for nonfatal conditions, 'E' is for errors */ + pq_beginmessage(&msgbuf, (edata->elevel < ERROR) ? 'N' : 'E'); + + if (myport->is_proxy) + { + GTM_ProxyMsgHeader proxyhdr; + + proxyhdr.ph_conid = myport->conn_id; + /* Send the GTM Proxy header if we are dealing with a proxy */ + pq_sendbytes(&msgbuf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + + pq_sendbyte(&msgbuf, PG_DIAG_SEVERITY); + pq_sendstring(&msgbuf, error_severity(edata->elevel)); + + /* M field is required per protocol, so always send something */ + pq_sendbyte(&msgbuf, PG_DIAG_MESSAGE_PRIMARY); + if (edata->message) + pq_sendstring(&msgbuf, edata->message); + else + pq_sendstring(&msgbuf, _("missing error text")); + + if (edata->detail) + { + pq_sendbyte(&msgbuf, PG_DIAG_MESSAGE_DETAIL); + pq_sendstring(&msgbuf, edata->detail); + } + + /* detail_log is intentionally not used here */ + + if (edata->hint) + { + pq_sendbyte(&msgbuf, PG_DIAG_MESSAGE_HINT); + pq_sendstring(&msgbuf, edata->hint); + } + + pq_sendbyte(&msgbuf, '\0'); /* terminator */ + + pq_endmessage(myport, &msgbuf); + + /* + * This flush is normally not necessary, since postgres.c will flush out + * waiting data when control returns to the main loop. But it seems best + * to leave it here, so that the client has some clue what happened if the + * backend dies before getting back to the main loop ... 
error/notice + * messages should not be a performance-critical path anyway, so an extra + * flush won't hurt much ... + */ + pq_flush(myport); +} + +/* + * Support routines for formatting error messages. + */ + + +/* + * expand_fmt_string --- process special format codes in a format string + * + * We must replace %m with the appropriate strerror string, since vsnprintf + * won't know what to do with it. + * + * The result is a palloc'd string. + */ +static char * +expand_fmt_string(const char *fmt, ErrorData *edata) +{ + StringInfoData buf; + const char *cp; + + initStringInfo(&buf); + + for (cp = fmt; *cp; cp++) + { + if (cp[0] == '%' && cp[1] != '\0') + { + cp++; + if (*cp == 'm') + { + /* + * Replace %m by system error string. If there are any %'s in + * the string, we'd better double them so that vsnprintf won't + * misinterpret. + */ + const char *cp2; + + cp2 = useful_strerror(edata->saved_errno); + for (; *cp2; cp2++) + { + if (*cp2 == '%') + appendStringInfoCharMacro(&buf, '%'); + appendStringInfoCharMacro(&buf, *cp2); + } + } + else + { + /* copy % and next char --- this avoids trouble with %%m */ + appendStringInfoCharMacro(&buf, '%'); + appendStringInfoCharMacro(&buf, *cp); + } + } + else + appendStringInfoCharMacro(&buf, *cp); + } + + return buf.data; +} + + +/* + * A slightly cleaned-up version of strerror() + */ +static const char * +useful_strerror(int errnum) +{ + /* this buffer is only used if errno has a bogus value */ + static char errorstr_buf[48]; + const char *str; + + str = strerror(errnum); + + /* + * Some strerror()s return an empty string for out-of-range errno. This is + * ANSI C spec compliant, but not exactly useful. + */ + if (str == NULL || *str == '\0') + { + snprintf(errorstr_buf, sizeof(errorstr_buf), + /*------ + translator: This string will be truncated at 47 + characters expanded. 
*/ + _("operating system error %d"), errnum); + str = errorstr_buf; + } + + return str; +} + + +/* + * error_severity --- get localized string representing elevel + */ +static const char * +error_severity(int elevel) +{ + const char *prefix; + + switch (elevel) + { + case DEBUG1: + case DEBUG2: + case DEBUG3: + case DEBUG4: + case DEBUG5: + prefix = _("DEBUG"); + break; + case LOG: + case COMMERROR: + prefix = _("LOG"); + break; + case INFO: + prefix = _("INFO"); + break; + case NOTICE: + prefix = _("NOTICE"); + break; + case WARNING: + prefix = _("WARNING"); + break; + case ERROR: + prefix = _("ERROR"); + break; + case ERROR2: + prefix = _("ERROR2"); + break; + case FATAL: + prefix = _("FATAL"); + break; + case PANIC: + prefix = _("PANIC"); + break; + default: + prefix = "???"; + break; + } + + return prefix; +} + + +/* + * append_with_tabs + * + * Append the string to the StringInfo buffer, inserting a tab after any + * newline. + */ +static void +append_with_tabs(StringInfo buf, const char *str) +{ + char ch; + + while ((ch = *str++) != '\0') + { + appendStringInfoCharMacro(buf, ch); + if (ch == '\n') + appendStringInfoCharMacro(buf, '\t'); + } +} + + +/* + * Write errors to stderr (or by equal means when stderr is + * not available). Used before ereport/elog can be used + * safely (memory context, GUC load etc) + */ +void +write_stderr(const char *fmt,...) +{ + va_list ap; + + fmt = _(fmt); + + va_start(ap, fmt); + + /* On Unix, we just fprintf to stderr */ + vfprintf(stderr, fmt, ap); + fflush(stderr); + va_end(ap); +} + + +/* + * is_log_level_output -- is elevel logically >= log_min_level? + * + * We use this for tests that should consider LOG to sort out-of-order, + * between ERROR and FATAL. Generally this is the right thing for testing + * whether a message should go to the postmaster log, whereas a simple >= + * test is correct for testing whether the message should go to the client. 
+ */ +static bool +is_log_level_output(int elevel, int log_min_level) +{ + if (elevel == LOG || elevel == COMMERROR) + { + if (log_min_level == LOG || log_min_level <= ERROR) + return true; + } + else if (log_min_level == LOG) + { + /* elevel != LOG */ + if (elevel >= FATAL) + return true; + } + /* Neither is LOG */ + else if (elevel >= log_min_level) + return true; + + return false; +} diff --git a/src/gtm/common/gtm_list.c b/src/gtm/common/gtm_list.c new file mode 100644 index 0000000000..3ea2ce76cb --- /dev/null +++ b/src/gtm/common/gtm_list.c @@ -0,0 +1,863 @@ +/*------------------------------------------------------------------------- + * + * gtm_list.c + * implementation for PostgreSQL generic linked list package + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/nodes/list.c,v 1.70 2008/08/14 18:47:58 tgl Exp $ + * + *------------------------------------------------------------------------- + */ +#include "gtm/gtm_c.h" +#include "gtm/gtm.h" +#include "gtm/gtm_list.h" +#include "gtm/memutils.h" +#include "gtm/assert.h" + +#define equal(a, b) ((a) == (b)) + + +#ifdef USE_ASSERT_CHECKING +/* + * Check that the specified List is valid (so far as we can tell). + */ +static void +check_list_invariants(List *list) +{ + if (list == NIL) + return; + + Assert(list->length > 0); + Assert(list->head != NULL); + Assert(list->tail != NULL); + + if (list->length == 1) + Assert(list->head == list->tail); + if (list->length == 2) + Assert(list->head->next == list->tail); + Assert(list->tail->next == NULL); +} +#else +#define check_list_invariants(l) +#endif /* USE_ASSERT_CHECKING */ + +/* + * Return a freshly allocated List. 
Since empty non-NIL lists are + * invalid, new_list() also allocates the head cell of the new list: + * the caller should be sure to fill in that cell's data. + */ +static List * +new_list() +{ + List *new_list; + ListCell *new_head; + + new_head = (ListCell *) palloc(sizeof(*new_head)); + new_head->next = NULL; + /* new_head->data is left undefined! */ + + new_list = (List *) palloc(sizeof(*new_list)); + new_list->length = 1; + new_list->head = new_head; + new_list->tail = new_head; + + return new_list; +} + +/* + * Allocate a new cell and make it the head of the specified + * list. Assumes the list it is passed is non-NIL. + * + * The data in the new head cell is undefined; the caller should be + * sure to fill it in + */ +static void +new_head_cell(List *list) +{ + ListCell *new_head; + + new_head = (ListCell *) palloc(sizeof(*new_head)); + new_head->next = list->head; + + list->head = new_head; + list->length++; +} + +/* + * Allocate a new cell and make it the tail of the specified + * list. Assumes the list it is passed is non-NIL. + * + * The data in the new tail cell is undefined; the caller should be + * sure to fill it in + */ +static void +new_tail_cell(List *list) +{ + ListCell *new_tail; + + new_tail = (ListCell *) palloc(sizeof(*new_tail)); + new_tail->next = NULL; + + list->tail->next = new_tail; + list->tail = new_tail; + list->length++; +} + +/* + * Append a pointer to the list. A pointer to the modified list is + * returned. Note that this function may or may not destructively + * modify the list; callers should always use this function's return + * value, rather than continuing to use the pointer passed as the + * first argument. + */ +List * +lappend(List *list, void *datum) +{ + if (list == NIL) + list = new_list(); + else + new_tail_cell(list); + + lfirst(list->tail) = datum; + check_list_invariants(list); + return list; +} + +/* + * Add a new cell to the list, in the position after 'prev_cell'. 
The + * data in the cell is left undefined, and must be filled in by the + * caller. 'list' is assumed to be non-NIL, and 'prev_cell' is assumed + * to be non-NULL and a member of 'list'. + */ +static ListCell * +add_new_cell(List *list, ListCell *prev_cell) +{ + ListCell *new_cell; + + new_cell = (ListCell *) palloc(sizeof(*new_cell)); + /* new_cell->data is left undefined! */ + new_cell->next = prev_cell->next; + prev_cell->next = new_cell; + + if (list->tail == prev_cell) + list->tail = new_cell; + + list->length++; + + return new_cell; +} + +/* + * Add a new cell to the specified list (which must be non-NIL); + * it will be placed after the list cell 'prev' (which must be + * non-NULL and a member of 'list'). The data placed in the new cell + * is 'datum'. The newly-constructed cell is returned. + */ +ListCell * +lappend_cell(List *list, ListCell *prev, void *datum) +{ + ListCell *new_cell; + + new_cell = add_new_cell(list, prev); + lfirst(new_cell) = datum; + check_list_invariants(list); + return new_cell; +} + +/* + * Prepend a new element to the list. A pointer to the modified list + * is returned. Note that this function may or may not destructively + * modify the list; callers should always use this function's return + * value, rather than continuing to use the pointer passed as the + * second argument. + */ +List * +lcons(void *datum, List *list) +{ + if (list == NIL) + list = new_list(); + else + new_head_cell(list); + + lfirst(list->head) = datum; + check_list_invariants(list); + return list; +} + +/* + * Concatenate list2 to the end of list1, and return list1. list1 is + * destructively changed. Callers should be sure to use the return + * value as the new pointer to the concatenated list: the 'list1' + * input pointer may or may not be the same as the returned pointer. + * + * The nodes in list2 are merely appended to the end of list1 in-place + * (i.e. they aren't copied; the two lists will share some of the same + * storage). 
Therefore, invoking list_free() on list2 will also + * invalidate a portion of list1. + */ +List * +list_concat(List *list1, List *list2) +{ + if (list1 == NIL) + return list2; + if (list2 == NIL) + return list1; + if (list1 == list2) + elog(ERROR, "cannot list_concat() a list to itself"); + + + list1->length += list2->length; + list1->tail->next = list2->head; + list1->tail = list2->tail; + + check_list_invariants(list1); + return list1; +} + +/* + * Truncate 'list' to contain no more than 'new_size' elements. This + * modifies the list in-place! Despite this, callers should use the + * pointer returned by this function to refer to the newly truncated + * list -- it may or may not be the same as the pointer that was + * passed. + * + * Note that any cells removed by list_truncate() are NOT pfree'd. + */ +List * +list_truncate(List *list, int new_size) +{ + ListCell *cell; + int n; + + if (new_size <= 0) + return NIL; /* truncate to zero length */ + + /* If asked to effectively extend the list, do nothing */ + if (new_size >= list_length(list)) + return list; + + n = 1; + foreach(cell, list) + { + if (n == new_size) + { + cell->next = NULL; + list->tail = cell; + list->length = new_size; + check_list_invariants(list); + return list; + } + n++; + } + + /* keep the compiler quiet; never reached */ + Assert(false); + return list; +} + +/* + * Locate the n'th cell (counting from 0) of the list. It is an assertion + * failure if there is no such cell. + */ +static ListCell * +list_nth_cell(List *list, int n) +{ + ListCell *match; + + Assert(list != NIL); + Assert(n >= 0); + Assert(n < list->length); + check_list_invariants(list); + + /* Does the caller actually mean to fetch the tail? */ + if (n == list->length - 1) + return list->tail; + + for (match = list->head; n-- > 0; match = match->next) + ; + + return match; +} + +/* + * Return the data value contained in the n'th element of the + * specified list. (List elements begin at 0.) 
+ */ +void * +list_nth(List *list, int n) +{ + return lfirst(list_nth_cell(list, n)); +} + +/* + * Return true iff 'datum' is a member of the list. Equality is + * determined via equal(), so callers should ensure that they pass a + * Node as 'datum'. + */ +bool +list_member(List *list, void *datum) +{ + ListCell *cell; + + check_list_invariants(list); + + foreach(cell, list) + { + if (equal(lfirst(cell), datum)) + return true; + } + + return false; +} + +/* + * Return true iff 'datum' is a member of the list. Equality is + * determined by using simple pointer comparison. + */ +bool +list_member_ptr(List *list, void *datum) +{ + ListCell *cell; + + check_list_invariants(list); + + foreach(cell, list) + { + if (lfirst(cell) == datum) + return true; + } + + return false; +} + +/* + * Delete 'cell' from 'list'; 'prev' is the previous element to 'cell' + * in 'list', if any (i.e. prev == NULL iff list->head == cell) + * + * The cell is pfree'd, as is the List header if this was the last member. + */ +List * +list_delete_cell(List *list, ListCell *cell, ListCell *prev) +{ + check_list_invariants(list); + Assert(prev != NULL ? lnext(prev) == cell : list_head(list) == cell); + + /* + * If we're about to delete the last node from the list, free the whole + * list instead and return NIL, which is the only valid representation of + * a zero-length list. + */ + if (list->length == 1) + { + list_free(list); + return NIL; + } + + /* + * Otherwise, adjust the necessary list links, deallocate the particular + * node we have just removed, and return the list we were given. + */ + list->length--; + + if (prev) + prev->next = cell->next; + else + list->head = cell->next; + + if (list->tail == cell) + list->tail = prev; + + pfree(cell); + return list; +} + +/* + * Delete the first cell in list that matches datum, if any. + * Equality is determined via equal(). 
+ */ +List * +list_delete(List *list, void *datum) +{ + ListCell *cell; + ListCell *prev; + + check_list_invariants(list); + + prev = NULL; + foreach(cell, list) + { + if (equal(lfirst(cell), datum)) + return list_delete_cell(list, cell, prev); + + prev = cell; + } + + /* Didn't find a match: return the list unmodified */ + return list; +} + +/* As above, but use simple pointer equality */ +List * +list_delete_ptr(List *list, void *datum) +{ + ListCell *cell; + ListCell *prev; + + check_list_invariants(list); + + prev = NULL; + foreach(cell, list) + { + if (lfirst(cell) == datum) + return list_delete_cell(list, cell, prev); + + prev = cell; + } + + /* Didn't find a match: return the list unmodified */ + return list; +} + + +/* + * Delete the first element of the list. + * + * This is useful to replace the Lisp-y code "list = lnext(list);" in cases + * where the intent is to alter the list rather than just traverse it. + * Beware that the removed cell is freed, whereas the lnext() coding leaves + * the original list head intact if there's another pointer to it. + */ +List * +list_delete_first(List *list) +{ + check_list_invariants(list); + + if (list == NIL) + return NIL; /* would an error be better? */ + + return list_delete_cell(list, list_head(list), NULL); +} + +/* + * Generate the union of two lists. This is calculated by copying + * list1 via list_copy(), then adding to it all the members of list2 + * that aren't already in list1. + * + * Whether an element is already a member of the list is determined + * via equal(). + * + * The returned list is newly-allocated, although the content of the + * cells is the same (i.e. any pointed-to objects are not copied). + * + * NB: this function will NOT remove any duplicates that are present + * in list1 (so it only performs a "union" if list1 is known unique to + * start with). 
Also, if you are about to write "x = list_union(x, y)" + * you probably want to use list_concat_unique() instead to avoid wasting + * the list cells of the old x list. + * + * This function could probably be implemented a lot faster if it is a + * performance bottleneck. + */ +List * +list_union(List *list1, List *list2) +{ + List *result; + ListCell *cell; + + result = list_copy(list1); + foreach(cell, list2) + { + if (!list_member(result, lfirst(cell))) + result = lappend(result, lfirst(cell)); + } + + check_list_invariants(result); + return result; +} + +/* + * This variant of list_union() determines duplicates via simple + * pointer comparison. + */ +List * +list_union_ptr(List *list1, List *list2) +{ + List *result; + ListCell *cell; + + + result = list_copy(list1); + foreach(cell, list2) + { + if (!list_member_ptr(result, lfirst(cell))) + result = lappend(result, lfirst(cell)); + } + + check_list_invariants(result); + return result; +} + +/* + * Return a list that contains all the cells that are in both list1 and + * list2. The returned list is freshly allocated via palloc(), but the + * cells themselves point to the same objects as the cells of the + * input lists. + * + * Duplicate entries in list1 will not be suppressed, so it's only a true + * "intersection" if list1 is known unique beforehand. + * + * This variant works on lists of pointers, and determines list + * membership via equal(). Note that the list1 member will be pointed + * to in the result. + */ +List * +list_intersection(List *list1, List *list2) +{ + List *result; + ListCell *cell; + + if (list1 == NIL || list2 == NIL) + return NIL; + + result = NIL; + foreach(cell, list1) + { + if (list_member(list2, lfirst(cell))) + result = lappend(result, lfirst(cell)); + } + + check_list_invariants(result); + return result; +} + +/* + * Return a list that contains all the cells in list1 that are not in + * list2. 
The returned list is freshly allocated via palloc(), but the + * cells themselves point to the same objects as the cells of the + * input lists. + * + * This variant works on lists of pointers, and determines list + * membership via equal() + */ +List * +list_difference(List *list1, List *list2) +{ + ListCell *cell; + List *result = NIL; + + if (list2 == NIL) + return list_copy(list1); + + foreach(cell, list1) + { + if (!list_member(list2, lfirst(cell))) + result = lappend(result, lfirst(cell)); + } + + check_list_invariants(result); + return result; +} + +/* + * This variant of list_difference() determines list membership via + * simple pointer equality. + */ +List * +list_difference_ptr(List *list1, List *list2) +{ + ListCell *cell; + List *result = NIL; + + if (list2 == NIL) + return list_copy(list1); + + foreach(cell, list1) + { + if (!list_member_ptr(list2, lfirst(cell))) + result = lappend(result, lfirst(cell)); + } + + check_list_invariants(result); + return result; +} + +/* + * Append datum to list, but only if it isn't already in the list. + * + * Whether an element is already a member of the list is determined + * via equal(). + */ +List * +list_append_unique(List *list, void *datum) +{ + if (list_member(list, datum)) + return list; + else + return lappend(list, datum); +} + +/* + * This variant of list_append_unique() determines list membership via + * simple pointer equality. + */ +List * +list_append_unique_ptr(List *list, void *datum) +{ + if (list_member_ptr(list, datum)) + return list; + else + return lappend(list, datum); +} + +/* + * Append to list1 each member of list2 that isn't already in list1. + * + * Whether an element is already a member of the list is determined + * via equal(). + * + * This is almost the same functionality as list_union(), but list1 is + * modified in-place rather than being copied. Note also that list2's cells + * are not inserted in list1, so the analogy to list_concat() isn't perfect. 
+ */ +List * +list_concat_unique(List *list1, List *list2) +{ + ListCell *cell; + + foreach(cell, list2) + { + if (!list_member(list1, lfirst(cell))) + list1 = lappend(list1, lfirst(cell)); + } + + check_list_invariants(list1); + return list1; +} + +/* + * This variant of list_concat_unique() determines list membership via + * simple pointer equality. + */ +List * +list_concat_unique_ptr(List *list1, List *list2) +{ + ListCell *cell; + + foreach(cell, list2) + { + if (!list_member_ptr(list1, lfirst(cell))) + list1 = lappend(list1, lfirst(cell)); + } + + check_list_invariants(list1); + return list1; +} + +/* + * Free all storage in a list, and optionally the pointed-to elements + */ +static void +list_free_private(List *list, bool deep) +{ + ListCell *cell; + + check_list_invariants(list); + + cell = list_head(list); + while (cell != NULL) + { + ListCell *tmp = cell; + + cell = lnext(cell); + if (deep) + pfree(lfirst(tmp)); + pfree(tmp); + } + + if (list) + pfree(list); +} + +/* + * Free all the cells of the list, as well as the list itself. Any + * objects that are pointed-to by the cells of the list are NOT + * free'd. + * + * On return, the argument to this function has been freed, so the + * caller would be wise to set it to NIL for safety's sake. + */ +void +list_free(List *list) +{ + list_free_private(list, false); +} + +/* + * Free all the cells of the list, the list itself, and all the + * objects pointed-to by the cells of the list (each element in the + * list must contain a pointer to a palloc()'d region of memory!) + * + * On return, the argument to this function has been freed, so the + * caller would be wise to set it to NIL for safety's sake. + */ +void +list_free_deep(List *list) +{ + /* + * A "deep" free operation only makes sense on a list of pointers. + */ + list_free_private(list, true); +} + +/* + * Return a shallow copy of the specified list. 
+ */ +List * +list_copy(List *oldlist) +{ + List *newlist; + ListCell *newlist_prev; + ListCell *oldlist_cur; + + if (oldlist == NIL) + return NIL; + + newlist = new_list(); + newlist->length = oldlist->length; + + /* + * Copy over the data in the first cell; new_list() has already allocated + * the head cell itself + */ + newlist->head->data = oldlist->head->data; + + newlist_prev = newlist->head; + oldlist_cur = oldlist->head->next; + while (oldlist_cur) + { + ListCell *newlist_cur; + + newlist_cur = (ListCell *) palloc(sizeof(*newlist_cur)); + newlist_cur->data = oldlist_cur->data; + newlist_prev->next = newlist_cur; + + newlist_prev = newlist_cur; + oldlist_cur = oldlist_cur->next; + } + + newlist_prev->next = NULL; + newlist->tail = newlist_prev; + + check_list_invariants(newlist); + return newlist; +} + +/* + * Return a shallow copy of the specified list, without the first N elements. + */ +List * +list_copy_tail(List *oldlist, int nskip) +{ + List *newlist; + ListCell *newlist_prev; + ListCell *oldlist_cur; + + if (nskip < 0) + nskip = 0; /* would it be better to elog? */ + + if (oldlist == NIL || nskip >= oldlist->length) + return NIL; + + newlist = new_list(); + newlist->length = oldlist->length - nskip; + + /* + * Skip over the unwanted elements. 
+ */ + oldlist_cur = oldlist->head; + while (nskip-- > 0) + oldlist_cur = oldlist_cur->next; + + /* + * Copy over the data in the first remaining cell; new_list() has already + * allocated the head cell itself + */ + newlist->head->data = oldlist_cur->data; + + newlist_prev = newlist->head; + oldlist_cur = oldlist_cur->next; + while (oldlist_cur) + { + ListCell *newlist_cur; + + newlist_cur = (ListCell *) palloc(sizeof(*newlist_cur)); + newlist_cur->data = oldlist_cur->data; + newlist_prev->next = newlist_cur; + + newlist_prev = newlist_cur; + oldlist_cur = oldlist_cur->next; + } + + newlist_prev->next = NULL; + newlist->tail = newlist_prev; + + check_list_invariants(newlist); + return newlist; +} + +/* + * When using non-GCC compilers, we can't define these as inline + * functions in pg_list.h, so they are defined here. + * + * TODO: investigate supporting inlining for some non-GCC compilers. + */ +#ifndef __GNUC__ + +ListCell * +list_head(List *l) +{ + return l ? l->head : NULL; +} + +ListCell * +list_tail(List *l) +{ + return l ? l->tail : NULL; +} + +int +list_length(List *l) +{ + return l ? l->length : 0; +} +#endif /* ! __GNUC__ */ + +/* + * Temporary compatibility functions + * + * In order to avoid warnings for these function definitions, we need + * to include a prototype here as well as in pg_list.h. That's because + * we don't enable list API compatibility in list.c, so we + * don't see the prototypes for these functions. + */ + +/* + * Given a list, return its length. This is merely defined for the + * sake of backward compatibility: we can't afford to define a macro + * called "length", so it must be a function. New code should use the + * list_length() macro in order to avoid the overhead of a function + * call. 
+ */ +int length(List *list); + +int +length(List *list) +{ + return list_length(list); +} diff --git a/src/gtm/common/gtm_lock.c b/src/gtm/common/gtm_lock.c new file mode 100644 index 0000000000..c919730c90 --- /dev/null +++ b/src/gtm/common/gtm_lock.c @@ -0,0 +1,206 @@ +/*------------------------------------------------------------------------- + * + * gtm_lock.c + * Handling for locks in GTM + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * + * IDENTIFICATION + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ +#include "gtm/gtm_c.h" +#include "gtm/gtm_lock.h" +#include "gtm/elog.h" + +/* + * Acquire the request lock. Block if the lock is not available + * + * TODO We should track the locks acquired in the thread specific context. If an + * error is thrown and cought, we don't want to keep holding to those locks + * since that would lead to a deadlock. Right now, we assume that the caller + * will appropriately catch errors and release the locks sanely. + */ +bool +GTM_RWLockAcquire(GTM_RWLock *lock, GTM_LockMode mode) +{ + int status; + + switch (mode) + { + case GTM_LOCKMODE_WRITE: + status = pthread_rwlock_wrlock(&lock->lk_lock); + break; + + case GTM_LOCKMODE_READ: + status = pthread_rwlock_rdlock(&lock->lk_lock); + break; + + default: + elog(ERROR, "Invalid lockmode"); + break; + } + + return status ? false : true; +} + +/* + * Release previously acquired lock + */ +bool +GTM_RWLockRelease(GTM_RWLock *lock) +{ + int status; + status = pthread_rwlock_unlock(&lock->lk_lock); + return status ? 
false : true; +} + +/* + * Initialize a lock + */ +int +GTM_RWLockInit(GTM_RWLock *lock) +{ + return pthread_rwlock_init(&lock->lk_lock, NULL); +} + +/* + * Destroy a lock + */ +int +GTM_RWLockDestroy(GTM_RWLock *lock) +{ + return pthread_rwlock_destroy(&lock->lk_lock); +} + +/* + * Conditionally acquire a lock. If the lock is not available, the function + * immediately returns without blocking. + * + * Returns true if lock is successfully acquired. Otherwise returns false + */ +bool +GTM_RWLockConditionalAcquire(GTM_RWLock *lock, GTM_LockMode mode) +{ + int status; + + switch (mode) + { + case GTM_LOCKMODE_WRITE: + status = pthread_rwlock_trywrlock(&lock->lk_lock); + break; + + case GTM_LOCKMODE_READ: + status = pthread_rwlock_tryrdlock(&lock->lk_lock); + break; + + default: + elog(ERROR, "Invalid lockmode"); + break; + } + + return status ? false : true; +} + +/* + * Initialize a mutex lock + */ +int +GTM_MutexLockInit(GTM_MutexLock *lock) +{ + return pthread_mutex_init(&lock->lk_lock, NULL); +} + +/* + * Destroy a mutex lock + */ +int +GTM_MutexLockDestroy(GTM_MutexLock *lock) +{ + return pthread_mutex_destroy(&lock->lk_lock); +} + +/* + * Acquire a mutex lock + * + * Return true if the lock is successfully acquired, else return false. + */ +bool +GTM_MutexLockAcquire(GTM_MutexLock *lock) +{ + int status = pthread_mutex_lock(&lock->lk_lock); + return status ? false : true; +} + +/* + * Release previously acquired lock + */ +bool +GTM_MutexLockRelease(GTM_MutexLock *lock) +{ + return pthread_mutex_unlock(&lock->lk_lock); +} + +/* + * Conditionally acquire a lock. If the lock is not available, the function + * immediately returns without blocking. + * + * Returns true if lock is successfully acquired. Otherwise returns false + */ +bool +GTM_MutexLockConditionalAcquire(GTM_MutexLock *lock) +{ + int status = pthread_mutex_trylock(&lock->lk_lock); + return status ? 
false : true; +} + +/* + * Initialize a condition variable + */ +int +GTM_CVInit(GTM_CV *cv) +{ + return pthread_cond_init(&cv->cv_condvar, NULL); +} + +/* + * Destroy the conditional variable + */ +int +GTM_CVDestroy(GTM_CV *cv) +{ + return pthread_cond_destroy(&cv->cv_condvar); +} + +/* + * Wake up all the threads waiting on this conditional variable + */ +int +GTM_CVBcast(GTM_CV *cv) +{ + return pthread_cond_broadcast(&cv->cv_condvar); +} + +/* + * Wake up only one thread waiting on this conditional variable + */ +int +GTM_CVSignal(GTM_CV *cv) +{ + return pthread_cond_signal(&cv->cv_condvar); +} + +/* + * Wait on a conditional variable. The caller must have acquired the mutex lock + * already. + */ +int +GTM_CVWait(GTM_CV *cv, GTM_MutexLock *lock) +{ + return pthread_cond_wait(&cv->cv_condvar, &lock->lk_lock); +} + diff --git a/src/gtm/common/mcxt.c b/src/gtm/common/mcxt.c new file mode 100644 index 0000000000..9325ae3c82 --- /dev/null +++ b/src/gtm/common/mcxt.c @@ -0,0 +1,763 @@ +/*------------------------------------------------------------------------- + * + * mcxt.c + * POSTGRES memory context management code. + * + * This module handles context management operations that are independent + * of the particular kind of context being operated on. It calls + * context-type-specific operations via the function pointers in a + * context's MemoryContextMethods struct. 
+ * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/mmgr/mcxt.c,v 1.65 2008/06/28 16:45:22 tgl Exp $ + * + *------------------------------------------------------------------------- + */ + + +#include "gtm/gtm_c.h" +#include "gtm/memutils.h" +#include "gtm/elog.h" +#include "gtm/assert.h" +#include "gtm/gtm.h" + + +/***************************************************************************** + * GLOBAL MEMORY * + *****************************************************************************/ + +/* + * Standard top-level contexts. For a description of the purpose of each + * of these contexts, refer to src/backend/utils/mmgr/README + */ + +static void MemoryContextStatsInternal(MemoryContext context, int level); +static void MemoryContextDeleteInternal(MemoryContext context, bool parent_locked); + +MemoryContext TopMostMemoryContext; + +/***************************************************************************** + * EXPORTED ROUTINES * + *****************************************************************************/ + + +/* + * MemoryContextInit + * Start up the memory-context subsystem. + * + * This must be called before creating contexts or allocating memory in + * contexts. TopMemoryContext and ErrorContext are initialized here; + * other contexts must be created afterwards. + * + * In normal multi-backend operation, this is called once during + * postmaster startup, and not at all by individual backend startup + * (since the backends inherit an already-initialized context subsystem + * by virtue of being forked off the postmaster). + * + * In a standalone backend this must be called during backend startup. 
+ */ +void +MemoryContextInit(void) +{ + AssertState(TopMemoryContext == NULL); + + /* + * Initialize TopMemoryContext as an AllocSetContext with slow growth rate + * --- we don't really expect much to be allocated in it. + * + * (There is special-case code in MemoryContextCreate() for this call.) + * + * This context is shared between different threads and must be made + * thread-safe + */ + TopMemoryContext = AllocSetContextCreate((MemoryContext) NULL, + "TopMemoryContext", + 0, + 8 * 1024, + 8 * 1024, + true); + + TopMostMemoryContext = TopMemoryContext; + + /* + * Not having any other place to point CurrentMemoryContext, make it point + * to TopMemoryContext. Caller should change this soon! + */ + CurrentMemoryContext = TopMemoryContext; + + /* + * Initialize ErrorContext as an AllocSetContext with slow growth rate --- + * we don't really expect much to be allocated in it. More to the point, + * require it to contain at least 8K at all times. This is the only case + * where retained memory in a context is *essential* --- we want to be + * sure ErrorContext still has some memory even if we've run out + * elsewhere! + * + * Similar to TopMostMemoryContext, this context may as well be shared + * between threads + */ + ErrorContext = AllocSetContextCreate(TopMemoryContext, + "ErrorContext", + 8 * 1024, + 8 * 1024, + 8 * 1024, + true); +} + +/* + * MemoryContextReset + * Release all space allocated within a context and its descendants, + * but don't delete the contexts themselves. + * + * The type-specific reset routine handles the context itself, but we + * have to do the recursion for the children. 
+ */ +void +MemoryContextReset(MemoryContext context) +{ + AssertArg(MemoryContextIsValid(context)); + + if (MemoryContextIsShared(context)) + MemoryContextLock(context); + + /* save a function call in common case where there are no children */ + if (context->firstchild != NULL) + MemoryContextResetChildren(context); + + if (MemoryContextIsShared(context)) + MemoryContextUnlock(context); + + (*context->methods->reset) (context); +} + +/* + * MemoryContextResetChildren + * Release all space allocated within a context's descendants, + * but don't delete the contexts themselves. The named context + * itself is not touched. + */ +void +MemoryContextResetChildren(MemoryContext context) +{ + MemoryContext child; + + AssertArg(MemoryContextIsValid(context)); + + /* + * For a shared context, lock the parent context before resetting the + * children contextes + */ + if (MemoryContextIsShared(context)) + MemoryContextLock(context); + + for (child = context->firstchild; child != NULL; child = child->nextchild) + MemoryContextReset(child); + + if (MemoryContextIsShared(context)) + MemoryContextUnlock(context); +} + +/* + * MemoryContextDelete + * Delete a context and its descendants, and release all space + * allocated therein. + * + * The type-specific delete routine removes all subsidiary storage + * for the context, but we have to delete the context node itself, + * as well as recurse to get the children. We must also delink the + * node from its parent, if it has one. + */ +static void +MemoryContextDeleteInternal(MemoryContext context, bool parent_locked) +{ + AssertArg(MemoryContextIsValid(context)); + /* We had better not be deleting TopMemoryContext ... 
*/ + Assert(context != TopMemoryContext); + /* And not CurrentMemoryContext, either */ + Assert(context != CurrentMemoryContext); + + MemoryContextDeleteChildren(context); + + /* + * We delink the context from its parent before deleting it, so that if + * there's an error we won't have deleted/busted contexts still attached + * to the context tree. Better a leak than a crash. + */ + if (context->parent) + { + MemoryContext parent = context->parent; + + /* + * If the parent context is shared and is already locked by the caller, + * no need to relock again. In fact, that's not the right thing to do + * since it will lead to a self-deadlock + */ + if (MemoryContextIsShared(parent) && (!parent_locked)) + MemoryContextLock(parent); + + if (context == parent->firstchild) + parent->firstchild = context->nextchild; + else + { + MemoryContext child; + + for (child = parent->firstchild; child; child = child->nextchild) + { + if (context == child->nextchild) + { + child->nextchild = context->nextchild; + break; + } + } + } + + if (MemoryContextIsShared(parent) && (!parent_locked)) + MemoryContextUnlock(parent); + } + (*context->methods->delete) (context); + pfree(context); +} + +void +MemoryContextDelete(MemoryContext context) +{ + MemoryContextDeleteInternal(context, false); +} + +/* + * MemoryContextDeleteChildren + * Delete all the descendants of the named context and release all + * space allocated therein. The named context itself is not touched. + */ +void +MemoryContextDeleteChildren(MemoryContext context) +{ + AssertArg(MemoryContextIsValid(context)); + + if (MemoryContextIsShared(context)) + MemoryContextLock(context); + /* + * MemoryContextDelete will delink the child from me, so just iterate as + * long as there is a child. 
+ * + * Since the parent is already locked, pass that information to the child + * which would then not attempt to relock the parent + */ + while (context->firstchild != NULL) + MemoryContextDeleteInternal(context->firstchild, true); + + if (MemoryContextIsShared(context)) + MemoryContextUnlock(context); +} + +/* + * MemoryContextResetAndDeleteChildren + * Release all space allocated within a context and delete all + * its descendants. + * + * This is a common combination case where we want to preserve the + * specific context but get rid of absolutely everything under it. + */ +void +MemoryContextResetAndDeleteChildren(MemoryContext context) +{ + AssertArg(MemoryContextIsValid(context)); + + MemoryContextDeleteChildren(context); + (*context->methods->reset) (context); +} + +/* + * GetMemoryChunkSpace + * Given a currently-allocated chunk, determine the total space + * it occupies (including all memory-allocation overhead). + * + * This is useful for measuring the total space occupied by a set of + * allocated chunks. + */ +Size +GetMemoryChunkSpace(void *pointer) +{ + StandardChunkHeader *header; + + /* + * Try to detect bogus pointers handed to us, poorly though we can. + * Presumably, a pointer that isn't MAXALIGNED isn't pointing at an + * allocated chunk. + */ + Assert(pointer != NULL); + Assert(pointer == (void *) MAXALIGN(pointer)); + + /* + * OK, it's probably safe to look at the chunk header. + */ + header = (StandardChunkHeader *) + ((char *) pointer - STANDARDCHUNKHEADERSIZE); + + AssertArg(MemoryContextIsValid(header->context)); + + return (*header->context->methods->get_chunk_space) (header->context, + pointer); +} + +/* + * GetMemoryChunkContext + * Given a currently-allocated chunk, determine the context + * it belongs to. + */ +MemoryContext +GetMemoryChunkContext(void *pointer) +{ + StandardChunkHeader *header; + + /* + * Try to detect bogus pointers handed to us, poorly though we can. 
+ * Presumably, a pointer that isn't MAXALIGNED isn't pointing at an + * allocated chunk. + */ + Assert(pointer != NULL); + Assert(pointer == (void *) MAXALIGN(pointer)); + + /* + * OK, it's probably safe to look at the chunk header. + */ + header = (StandardChunkHeader *) + ((char *) pointer - STANDARDCHUNKHEADERSIZE); + + AssertArg(MemoryContextIsValid(header->context)); + + return header->context; +} + +/* + * MemoryContextIsEmpty + * Is a memory context empty of any allocated space? + */ +bool +MemoryContextIsEmpty(MemoryContext context) +{ + AssertArg(MemoryContextIsValid(context)); + + /* + * For now, we consider a memory context nonempty if it has any children; + * perhaps this should be changed later. + */ + if (context->firstchild != NULL) + return false; + /* Otherwise use the type-specific inquiry */ + return (*context->methods->is_empty) (context); +} + +/* + * MemoryContextStats + * Print statistics about the named context and all its descendants. + * + * This is just a debugging utility, so it's not fancy. The statistics + * are merely sent to stderr. + */ +void +MemoryContextStats(MemoryContext context) +{ + MemoryContextStatsInternal(context, 0); +} + +static void +MemoryContextStatsInternal(MemoryContext context, int level) +{ + MemoryContext child; + + AssertArg(MemoryContextIsValid(context)); + + (*context->methods->stats) (context, level); + for (child = context->firstchild; child != NULL; child = child->nextchild) + MemoryContextStatsInternal(child, level + 1); +} + +/* + * MemoryContextCheck + * Check all chunks in the named context. + * + * This is just a debugging utility, so it's not fancy. 
+ */ +#ifdef MEMORY_CONTEXT_CHECKING +void +MemoryContextCheck(MemoryContext context) +{ + MemoryContext child; + + AssertArg(MemoryContextIsValid(context)); + + (*context->methods->check) (context); + for (child = context->firstchild; child != NULL; child = child->nextchild) + MemoryContextCheck(child); +} +#endif + +/* + * MemoryContextContains + * Detect whether an allocated chunk of memory belongs to a given + * context or not. + * + * Caution: this test is reliable as long as 'pointer' does point to + * a chunk of memory allocated from *some* context. If 'pointer' points + * at memory obtained in some other way, there is a small chance of a + * false-positive result, since the bits right before it might look like + * a valid chunk header by chance. + */ +bool +MemoryContextContains(MemoryContext context, void *pointer) +{ + StandardChunkHeader *header; + + /* + * Try to detect bogus pointers handed to us, poorly though we can. + * Presumably, a pointer that isn't MAXALIGNED isn't pointing at an + * allocated chunk. + */ + if (pointer == NULL || pointer != (void *) MAXALIGN(pointer)) + return false; + + /* + * OK, it's probably safe to look at the chunk header. + */ + header = (StandardChunkHeader *) + ((char *) pointer - STANDARDCHUNKHEADERSIZE); + + /* + * If the context link doesn't match then we certainly have a non-member + * chunk. Also check for a reasonable-looking size as extra guard against + * being fooled by bogus pointers. + */ + if (header->context == context && AllocSizeIsValid(header->size)) + return true; + return false; +} + +/*-------------------- + * MemoryContextCreate + * Context-type-independent part of context creation. + * + * This is only intended to be called by context-type-specific + * context creation routines, not by the unwashed masses. 
+ * + * The context creation procedure is a little bit tricky because + * we want to be sure that we don't leave the context tree invalid + * in case of failure (such as insufficient memory to allocate the + * context node itself). The procedure goes like this: + * 1. Context-type-specific routine first calls MemoryContextCreate(), + * passing the appropriate tag/size/methods values (the methods + * pointer will ordinarily point to statically allocated data). + * The parent and name parameters usually come from the caller. + * 2. MemoryContextCreate() attempts to allocate the context node, + * plus space for the name. If this fails we can ereport() with no + * damage done. + * 3. We fill in all of the type-independent MemoryContext fields. + * 4. We call the type-specific init routine (using the methods pointer). + * The init routine is required to make the node minimally valid + * with zero chance of failure --- it can't allocate more memory, + * for example. + * 5. Now we have a minimally valid node that can behave correctly + * when told to reset or delete itself. We link the node to its + * parent (if any), making the node part of the context tree. + * 6. We return to the context-type-specific routine, which finishes + * up type-specific initialization. This routine can now do things + * that might fail (like allocate more memory), so long as it's + * sure the node is left in a state that delete will handle. + * + * This protocol doesn't prevent us from leaking memory if step 6 fails + * during creation of a top-level context, since there's no parent link + * in that case. However, if you run out of memory while you're building + * a top-level context, you might as well go home anyway... + * + * Normally, the context node and the name are allocated from + * TopMemoryContext (NOT from the parent context, since the node must + * survive resets of its parent context!). However, this routine is itself + * used to create TopMemoryContext! 
If we see that TopMemoryContext is NULL, + * we assume we are creating TopMemoryContext and use malloc() to allocate + * the node. + * + * Note that the name field of a MemoryContext does not point to + * separately-allocated storage, so it should not be freed at context + * deletion. + *-------------------- + */ +MemoryContext +MemoryContextCreate(Size size, + MemoryContextMethods *methods, + MemoryContext parent, + const char *name) +{ + MemoryContext node; + Size needed = size + strlen(name) + 1; + + + /* Get space for node and name */ + if (TopMemoryContext != NULL) + { + /* Normal case: allocate the node in TopMemoryContext */ + node = (MemoryContext) MemoryContextAlloc(TopMemoryContext, + needed); + } + else + { + /* Special case for startup: use good ol' malloc */ + node = (MemoryContext) malloc(needed); + Assert(node != NULL); + } + + /* Initialize the node as best we can */ + MemSet(node, 0, size); + node->methods = methods; + node->parent = NULL; /* for the moment */ + node->firstchild = NULL; + node->nextchild = NULL; + node->name = ((char *) node) + size; + strcpy(node->name, name); + + /* Type-specific routine finishes any other essential initialization */ + (*node->methods->init) (node); + + /* + * Lock the parent context if the it is shared and must be made thread-safe + */ + if ((parent != NULL) && (MemoryContextIsShared(parent))) + MemoryContextLock(parent); + + /* OK to link node to parent (if any) */ + if (parent) + { + node->parent = parent; + node->nextchild = parent->firstchild; + parent->firstchild = node; + } + + if ((parent != NULL) && (MemoryContextIsShared(parent))) + MemoryContextUnlock(parent); + + /* Return to type-specific creation routine to finish up */ + return node; +} + +/* + * MemoryContextAlloc + * Allocate space within the specified context. + * + * This could be turned into a macro, but we'd have to import + * nodes/memnodes.h into postgres.h which seems a bad idea. 
+ */ +void * +MemoryContextAlloc(MemoryContext context, Size size) +{ + AssertArg(MemoryContextIsValid(context)); + + if (!AllocSizeIsValid(size)) + elog(ERROR, "invalid memory alloc request size %lu", + (unsigned long) size); + + return (*context->methods->alloc) (context, size); +} + +/* + * MemoryContextAllocZero + * Like MemoryContextAlloc, but clears allocated memory + * + * We could just call MemoryContextAlloc then clear the memory, but this + * is a very common combination, so we provide the combined operation. + */ +void * +MemoryContextAllocZero(MemoryContext context, Size size) +{ + void *ret; + + AssertArg(MemoryContextIsValid(context)); + + if (!AllocSizeIsValid(size)) + elog(ERROR, "invalid memory alloc request size %lu", + (unsigned long) size); + + ret = (*context->methods->alloc) (context, size); + + MemSetAligned(ret, 0, size); + + return ret; +} + +/* + * MemoryContextAllocZeroAligned + * MemoryContextAllocZero where length is suitable for MemSetLoop + * + * This might seem overly specialized, but it's not because newNode() + * is so often called with compile-time-constant sizes. + */ +void * +MemoryContextAllocZeroAligned(MemoryContext context, Size size) +{ + void *ret; + + AssertArg(MemoryContextIsValid(context)); + + if (!AllocSizeIsValid(size)) + elog(ERROR, "invalid memory alloc request size %lu", + (unsigned long) size); + + ret = (*context->methods->alloc) (context, size); + + MemSetLoop(ret, 0, size); + + return ret; +} + +/* + * pfree + * Release an allocated chunk. + */ +void +pfree(void *pointer) +{ + StandardChunkHeader *header; + + /* + * Try to detect bogus pointers handed to us, poorly though we can. + * Presumably, a pointer that isn't MAXALIGNED isn't pointing at an + * allocated chunk. + */ + Assert(pointer != NULL); + Assert(pointer == (void *) MAXALIGN(pointer)); + + /* + * OK, it's probably safe to look at the chunk header. 
+ */ + header = (StandardChunkHeader *) + ((char *) pointer - STANDARDCHUNKHEADERSIZE); + + AssertArg(MemoryContextIsValid(header->context)); + + (*header->context->methods->free_p) (header->context, pointer); +} + +/* + * repalloc + * Adjust the size of a previously allocated chunk. + */ +void * +repalloc(void *pointer, Size size) +{ + StandardChunkHeader *header; + + /* + * Try to detect bogus pointers handed to us, poorly though we can. + * Presumably, a pointer that isn't MAXALIGNED isn't pointing at an + * allocated chunk. + */ + Assert(pointer != NULL); + Assert(pointer == (void *) MAXALIGN(pointer)); + + /* + * OK, it's probably safe to look at the chunk header. + */ + header = (StandardChunkHeader *) + ((char *) pointer - STANDARDCHUNKHEADERSIZE); + + AssertArg(MemoryContextIsValid(header->context)); + + if (!AllocSizeIsValid(size)) + elog(ERROR, "invalid memory alloc request size %lu", + (unsigned long) size); + + return (*header->context->methods->realloc) (header->context, + pointer, size); +} + +/* + * MemoryContextSwitchTo + * Returns the current context; installs the given context. + * + * This is inlined when using GCC. + * + * TODO: investigate supporting inlining for some non-GCC compilers. + */ +MemoryContext +MemoryContextSwitchTo(MemoryContext context) +{ + MemoryContext old; + + AssertArg(MemoryContextIsValid(context)); + + old = CurrentMemoryContext; + CurrentMemoryContext = context; + return old; +} + +/* + * MemoryContextStrdup + * Like strdup(), but allocate from the specified context + */ +char * +MemoryContextStrdup(MemoryContext context, const char *string) +{ + char *nstr; + Size len = strlen(string) + 1; + + nstr = (char *) MemoryContextAlloc(context, len); + + memcpy(nstr, string, len); + + return nstr; +} + +/* + * pnstrdup + * Like pstrdup(), but append null byte to a + * not-necessarily-null-terminated input string. 
+ */ +char * +pnstrdup(const char *in, Size len) +{ + char *out = palloc(len + 1); + + memcpy(out, in, len); + out[len] = '\0'; + return out; +} + + +#if defined(WIN32) || defined(__CYGWIN__) +/* + * Memory support routines for libpgport on Win32 + * + * Win32 can't load a library that PGDLLIMPORTs a variable + * if the link object files also PGDLLIMPORT the same variable. + * For this reason, libpgport can't reference CurrentMemoryContext + * in the palloc macro calls. + * + * To fix this, we create several functions here that allow us to + * manage memory without doing the inline in libpgport. + */ +void * +pgport_palloc(Size sz) +{ + return palloc(sz); +} + + +char * +pgport_pstrdup(const char *str) +{ + return pstrdup(str); +} + + +/* Doesn't reference a PGDLLIMPORT variable, but here for completeness. */ +void +pgport_pfree(void *pointer) +{ + pfree(pointer); +} + +#endif diff --git a/src/gtm/common/stringinfo.c b/src/gtm/common/stringinfo.c new file mode 100644 index 0000000000..5023bd9893 --- /dev/null +++ b/src/gtm/common/stringinfo.c @@ -0,0 +1,280 @@ +/*------------------------------------------------------------------------- + * + * stringinfo.c + * + * StringInfo provides an indefinitely-extensible string data type. + * It can be used to buffer either ordinary C strings (null-terminated text) + * or arbitrary binary data. All storage is allocated with palloc(). 
+ * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL: pgsql/src/backend/lib/stringinfo.c,v 1.49 2008/01/01 19:45:49 momjian Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "gtm/gtm_c.h" +#include "gtm/stringinfo.h" +#include "gtm/memutils.h" +#include "gtm/elog.h" +#include "gtm/assert.h" +#include "gtm/gtm.h" + + +/* + * makeStringInfo + * + * Create an empty 'StringInfoData' & return a pointer to it. + */ +StringInfo +makeStringInfo(void) +{ + StringInfo res; + + res = (StringInfo) palloc(sizeof(StringInfoData)); + + initStringInfo(res); + + return res; +} + +/* + * initStringInfo + * + * Initialize a StringInfoData struct (with previously undefined contents) + * to describe an empty string. + */ +void +initStringInfo(StringInfo str) +{ + int size = 1024; /* initial default buffer size */ + + str->data = (char *) palloc(size); + str->maxlen = size; + resetStringInfo(str); +} + +/* + * resetStringInfo + * + * Reset the StringInfo: the data buffer remains valid, but its + * previous content, if any, is cleared. + */ +void +resetStringInfo(StringInfo str) +{ + str->data[0] = '\0'; + str->len = 0; + str->cursor = 0; +} + +/* + * appendStringInfo + * + * Format text data under the control of fmt (an sprintf-style format string) + * and append it to whatever is already in str. More space is allocated + * to str if necessary. This is sort of like a combination of sprintf and + * strcat. + */ +void +appendStringInfo(StringInfo str, const char *fmt,...) +{ + for (;;) + { + va_list args; + bool success; + + /* Try to format the data. */ + va_start(args, fmt); + success = appendStringInfoVA(str, fmt, args); + va_end(args); + + if (success) + break; + + /* Double the buffer size and try again. 
*/ + enlargeStringInfo(str, str->maxlen); + } +} + +/* + * appendStringInfoVA + * + * Attempt to format text data under the control of fmt (an sprintf-style + * format string) and append it to whatever is already in str. If successful + * return true; if not (because there's not enough space), return false + * without modifying str. Typically the caller would enlarge str and retry + * on false return --- see appendStringInfo for standard usage pattern. + * + * XXX This API is ugly, but there seems no alternative given the C spec's + * restrictions on what can portably be done with va_list arguments: you have + * to redo va_start before you can rescan the argument list, and we can't do + * that from here. + */ +bool +appendStringInfoVA(StringInfo str, const char *fmt, va_list args) +{ + int avail, + nprinted; + + Assert(str != NULL); + + /* + * If there's hardly any space, don't bother trying, just fail to make the + * caller enlarge the buffer first. + */ + avail = str->maxlen - str->len - 1; + if (avail < 16) + return false; + + /* + * Assert check here is to catch buggy vsnprintf that overruns the + * specified buffer length. Solaris 7 in 64-bit mode is an example of a + * platform with such a bug. + */ +#ifdef USE_ASSERT_CHECKING + str->data[str->maxlen - 1] = '\0'; +#endif + + nprinted = vsnprintf(str->data + str->len, avail, fmt, args); + + Assert(str->data[str->maxlen - 1] == '\0'); + + /* + * Note: some versions of vsnprintf return the number of chars actually + * stored, but at least one returns -1 on failure. Be conservative about + * believing whether the print worked. + */ + if (nprinted >= 0 && nprinted < avail - 1) + { + /* Success. Note nprinted does not include trailing null. */ + str->len += nprinted; + return true; + } + + /* Restore the trailing null so that str is unmodified. */ + str->data[str->len] = '\0'; + return false; +} + +/* + * appendStringInfoString + * + * Append a null-terminated string to str. 
+ * Like appendStringInfo(str, "%s", s) but faster. + */ +void +appendStringInfoString(StringInfo str, const char *s) +{ + appendBinaryStringInfo(str, s, strlen(s)); +} + +/* + * appendStringInfoChar + * + * Append a single byte to str. + * Like appendStringInfo(str, "%c", ch) but much faster. + */ +void +appendStringInfoChar(StringInfo str, char ch) +{ + /* Make more room if needed */ + if (str->len + 1 >= str->maxlen) + enlargeStringInfo(str, 1); + + /* OK, append the character */ + str->data[str->len] = ch; + str->len++; + str->data[str->len] = '\0'; +} + +/* + * appendBinaryStringInfo + * + * Append arbitrary binary data to a StringInfo, allocating more space + * if necessary. + */ +void +appendBinaryStringInfo(StringInfo str, const char *data, int datalen) +{ + Assert(str != NULL); + + /* Make more room if needed */ + enlargeStringInfo(str, datalen); + + /* OK, append the data */ + memcpy(str->data + str->len, data, datalen); + str->len += datalen; + + /* + * Keep a trailing null in place, even though it's probably useless for + * binary data... + */ + str->data[str->len] = '\0'; +} + +/* + * enlargeStringInfo + * + * Make sure there is enough space for 'needed' more bytes + * ('needed' does not include the terminating null). + * + * External callers usually need not concern themselves with this, since + * all stringinfo.c routines do it automatically. However, if a caller + * knows that a StringInfo will eventually become X bytes large, it + * can save some palloc overhead by enlarging the buffer before starting + * to store data in it. + * + * NB: because we use repalloc() to enlarge the buffer, the string buffer + * will remain allocated in the same memory context that was current when + * initStringInfo was called, even if another context is now current. + * This is the desired and indeed critical behavior! + */ +void +enlargeStringInfo(StringInfo str, int needed) +{ + int newlen; + + /* + * Guard against out-of-range "needed" values. 
Without this, we can get + * an overflow or infinite loop in the following. + */ + if (needed < 0) /* should not happen */ + elog(ERROR, "invalid string enlargement request size: %d", needed); + if (((Size) needed) >= (MaxAllocSize - (Size) str->len)) + ereport(ERROR, + (ENOSPC, + errmsg("out of memory"), + errdetail("Cannot enlarge string buffer containing %d bytes by %d more bytes.", + str->len, needed))); + + needed += str->len + 1; /* total space required now */ + + /* Because of the above test, we now have needed <= MaxAllocSize */ + + if (needed <= str->maxlen) + return; /* got enough space already */ + + /* + * We don't want to allocate just a little more space with each append; + * for efficiency, double the buffer size each time it overflows. + * Actually, we might need to more than double it if 'needed' is big... + */ + newlen = 2 * str->maxlen; + while (needed > newlen) + newlen = 2 * newlen; + + /* + * Clamp to MaxAllocSize in case we went past it. Note we are assuming + * here that MaxAllocSize <= INT_MAX/2, else the above loop could + * overflow. We will still have newlen >= needed. + */ + if (newlen > (int) MaxAllocSize) + newlen = (int) MaxAllocSize; + + str->data = (char *) repalloc(str->data, newlen); + + str->maxlen = newlen; +} diff --git a/src/gtm/gtm_ctl/Makefile b/src/gtm/gtm_ctl/Makefile new file mode 100644 index 0000000000..eddcc9aebe --- /dev/null +++ b/src/gtm/gtm_ctl/Makefile @@ -0,0 +1,22 @@ +# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + +top_build_dir=../.. 
+include $(top_build_dir)/gtm/Makefile.global
+
+# gtm_ctl.o plus the static GTM archives the program is linked against.
+# The archives live in sibling directories; this Makefile only consumes them.
+OBJS=gtm_ctl.o ../common/libgtm.a ../libpq/libpqcomm.a ../client/libgtmclient.a ../path/libgtmpath.a
+LDFLAGS=-L$(top_build_dir)/common -L$(top_build_dir)/libpq
+
+LIBS=-lpthread
+
+# None of these names is a file produced by the build.
+.PHONY: all clean distclean maintainer-clean
+
+all: gtm_ctl
+
+# Objects and archives must come before $(LIBS): the linker resolves symbols
+# left to right, so -lpthread has to follow the files that reference it.
+gtm_ctl: $(OBJS)
+	$(CC) $(CFLAGS) $(LDFLAGS) $^ $(LIBS) -o $@
+
+# Remove only what this directory builds.  $(OBJS) also names archives that
+# belong to sibling directories; deleting them here would break every other
+# program that links against them.
+clean:
+	rm -f $(filter %.o,$(OBJS))
+	rm -f gtm_ctl
+
+distclean: clean
+
+maintainer-clean: distclean
diff --git a/src/gtm/gtm_ctl/gtm_ctl.c b/src/gtm/gtm_ctl/gtm_ctl.c
new file mode 100644
index 0000000000..3b01796484
--- /dev/null
+++ b/src/gtm/gtm_ctl/gtm_ctl.c
@@ -0,0 +1,918 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_ctl --- start/stops/restarts the GTM server/proxy
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "gtm/gtm_c.h"
+#include "gtm/libpq-fe.h"
+
+#include <locale.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#ifdef HAVE_SYS_RESOURCE_H
+#include <sys/time.h>
+#include <sys/resource.h>
+#endif
+
+#include "libpq/pqsignal.h"
+
+/* PID can be negative for standalone backend */
+typedef long pgpid_t;
+
+typedef enum
+{
+	SMART_MODE,
+	FAST_MODE,
+	IMMEDIATE_MODE
+} ShutdownMode;
+
+
+typedef enum
+{
+	NO_COMMAND = 0,
+	START_COMMAND,
+	STOP_COMMAND,
+	RESTART_COMMAND,
+} CtlCommand;
+
+#define DEFAULT_WAIT	60
+
+static bool do_wait = false;
+static bool wait_set = false;	/* true once -w or -W was given explicitly */
+static int	wait_seconds = DEFAULT_WAIT;
+static bool silent_mode = false;	/* when true, print_msg() prints nothing */
+static ShutdownMode shutdown_mode = SMART_MODE;
+static int	sig = SIGTERM;		/* default */
+static CtlCommand ctl_command = NO_COMMAND;
+static char *gtm_data = NULL;
+static char *gtmdata_opt = NULL;	/* "-D ..." switch built from the -D option */
+static char *gtm_opts = NULL;	/* -o value, or the contents of the opts file */
+static const char *progname;
+static char *log_file = NULL;
+static 
char *gtm_path = NULL; +static char *gtm_app = NULL; +static char *argv0 = NULL; + +static void +write_stderr(const char *fmt,...) +/* This extension allows gcc to check the format string for consistency with + the supplied arguments. */ +__attribute__((format(printf, 1, 2))); +static void *pg_malloc(size_t size); +static char *xstrdup(const char *s); +static void do_advice(void); +static void do_help(void); +static void set_mode(char *modeopt); +static void do_start(void); +static void do_stop(void); +static void do_restart(void); +static void print_msg(const char *msg); + +static pgpid_t get_pgpid(void); +static char **readfile(const char *path); +static int start_gtm(void); +static void read_gtm_opts(void); + +static bool test_gtm_connection(); +static bool gtm_is_alive(pid_t pid); + +static char gtmopts_file[MAXPGPATH]; +static char pid_file[MAXPGPATH]; + +/* + * Write errors to stderr (or by equal means when stderr is + * not available). + */ +static void +write_stderr(const char *fmt,...) +{ + va_list ap; + + va_start(ap, fmt); + + /* On Unix, we just fprintf to stderr */ + vfprintf(stderr, fmt, ap); + va_end(ap); +} + +/* + * routines to check memory allocations and fail noisily. + */ + +static void * +pg_malloc(size_t size) +{ + void *result; + + result = malloc(size); + if (!result) + { + write_stderr(_("%s: out of memory\n"), progname); + exit(1); + } + return result; +} + + +static char * +xstrdup(const char *s) +{ + char *result; + + result = strdup(s); + if (!result) + { + write_stderr(_("%s: out of memory\n"), progname); + exit(1); + } + return result; +} + +/* + * Given an already-localized string, print it to stdout unless the + * user has specified that no messages should be printed. 
+ */ +static void +print_msg(const char *msg) +{ + if (!silent_mode) + { + fputs(msg, stdout); + fflush(stdout); + } +} + +static pgpid_t +get_pgpid(void) +{ + FILE *pidf; + long pid; + + pidf = fopen(pid_file, "r"); + if (pidf == NULL) + { + /* No pid file, not an error on startup */ + if (errno == ENOENT) + return 0; + else + { + write_stderr(_("%s: could not open PID file \"%s\": %s\n"), + progname, pid_file, strerror(errno)); + exit(1); + } + } + if (fscanf(pidf, "%ld", &pid) != 1) + { + write_stderr(_("%s: invalid data in PID file \"%s\"\n"), + progname, pid_file); + exit(1); + } + fclose(pidf); + return (pgpid_t) pid; +} + + +/* + * get the lines from a text file - return NULL if file can't be opened + */ +static char ** +readfile(const char *path) +{ + FILE *infile; + int maxlength = 0, + linelen = 0; + int nlines = 0; + char **result; + char *buffer; + int c; + + if ((infile = fopen(path, "r")) == NULL) + return NULL; + + /* pass over the file twice - the first time to size the result */ + + while ((c = fgetc(infile)) != EOF) + { + linelen++; + if (c == '\n') + { + nlines++; + if (linelen > maxlength) + maxlength = linelen; + linelen = 0; + } + } + + /* handle last line without a terminating newline (yuck) */ + if (linelen) + nlines++; + if (linelen > maxlength) + maxlength = linelen; + + /* set up the result and the line buffer */ + result = (char **) pg_malloc((nlines + 1) * sizeof(char *)); + buffer = (char *) pg_malloc(maxlength + 1); + + /* now reprocess the file and store the lines */ + rewind(infile); + nlines = 0; + while (fgets(buffer, maxlength + 1, infile) != NULL) + result[nlines++] = xstrdup(buffer); + + fclose(infile); + free(buffer); + result[nlines] = NULL; + + return result; +} + + + +/* + * start/test/stop routines + */ + +static int +start_gtm(void) +{ + char cmd[MAXPGPATH]; + /* + * Since there might be quotes to handle here, it is easier simply to pass + * everything to a shell to process them. 
+	 */
+
+	if (gtm_path != NULL)
+	{
+		/*
+		 * gtm_path holds only the directory given with -p, in a buffer that
+		 * xstrdup() sized for exactly that string.  Appending to it in place
+		 * would write past the allocation, so build the full path of the
+		 * executable in a new buffer of the right size.
+		 */
+		size_t		pathlen = strlen(gtm_path) + 1 + strlen(gtm_app) + 1;
+		char	   *fullpath = (char *) pg_malloc(pathlen);
+
+		snprintf(fullpath, pathlen, "%s/%s", gtm_path, gtm_app);
+		free(gtm_path);
+		gtm_path = fullpath;
+	}
+	else
+		gtm_path = gtm_app;
+
+	/*
+	 * gtmdata_opt is only built when -D is given.  When the data directory
+	 * comes from the GTMDATA environment variable it is still NULL, and a
+	 * NULL pointer must not be handed to a %s conversion.
+	 */
+	if (log_file != NULL)
+		snprintf(cmd, MAXPGPATH, SYSTEMQUOTE "\"%s\" %s%s < \"%s\" >> \"%s\" 2>&1 &" SYSTEMQUOTE,
+				 gtm_path, gtmdata_opt ? gtmdata_opt : "", gtm_opts,
+				 DEVNULL, log_file);
+	else
+		snprintf(cmd, MAXPGPATH, SYSTEMQUOTE "\"%s\" %s%s < \"%s\" 2>&1 &" SYSTEMQUOTE,
+				 gtm_path, gtmdata_opt ? gtmdata_opt : "", gtm_opts, DEVNULL);
+
+	/* The command ends in '&', so this is the shell's status, not the server's */
+	return system(cmd);
+}
+
+
+
+/*
+ * Find the pgport and try a connection
+ *
+ * Extracts the port from the last -p switch found in gtm_opts, then tries
+ * to connect to localhost on that port once per second, for at most
+ * wait_seconds attempts.  Returns true as soon as one attempt succeeds.
+ */
+static bool
+test_gtm_connection()
+{
+	GTM_Conn   *conn;
+	bool		success = false;
+	int			i;
+	char		portstr[32];
+	char	   *p;
+	char	   *q;
+	char		connstr[128];	/* Should be way more than enough! */
+
+	*portstr = '\0';
+
+	/*
+	 * Look in gtm_opts for a -p switch.
+	 *
+	 * This parsing code is not amazingly bright; it could for instance
+	 * get fooled if ' -p' occurs within a quoted argument value.  Given
+	 * that few people pass complicated settings in gtm_opts, it's
+	 * probably good enough.
+	 */
+	for (p = gtm_opts; *p;)
+	{
+		/* advance past whitespace */
+		while (isspace((unsigned char) *p))
+			p++;
+
+		if (strncmp(p, "-p", 2) == 0)
+		{
+			p += 2;
+			/* advance past any whitespace/quoting */
+			while (isspace((unsigned char) *p) || *p == '\'' || *p == '"')
+				p++;
+			/* find end of value (not including any ending quote!) 
*/
+			q = p;
+			while (*q &&
+				 !(isspace((unsigned char) *q) || *q == '\'' || *q == '"'))
+				q++;
+			/* and save the argument value */
+			strlcpy(portstr, p, Min((q - p) + 1, sizeof(portstr)));
+			/* keep looking, maybe there is another -p */
+			p = q;
+		}
+		/* Advance to next whitespace */
+		while (*p && !isspace((unsigned char) *p))
+			p++;
+	}
+
+	/*
+	 * We need to set a connect timeout otherwise on Windows the SCM will
+	 * probably timeout first
+	 */
+	snprintf(connstr, sizeof(connstr),
+			 "host=localhost port=%s connect_timeout=5", portstr);
+
+	for (i = 0; i < wait_seconds; i++)
+	{
+		if ((conn = PQconnectGTM(connstr)) != NULL &&
+			(GTMPQstatus(conn) == CONNECTION_OK))
+		{
+			GTMPQfinish(conn);
+			success = true;
+			break;
+		}
+		else
+		{
+			GTMPQfinish(conn);
+			print_msg(".");
+			sleep(1);			/* 1 sec */
+		}
+	}
+
+	return success;
+}
+
+/*
+ * read_gtm_opts
+ *
+ * Make sure gtm_opts is set.  If -o was given it is left alone.  Otherwise
+ * it defaults to the empty string, except on restart, where the options are
+ * taken from the single line stored in gtmopts_file.  Exits if that file
+ * cannot be read or does not hold exactly one line.
+ */
+static void
+read_gtm_opts(void)
+{
+	if (gtm_opts == NULL)
+	{
+		gtm_opts = "";			/* default */
+		if (ctl_command == RESTART_COMMAND)
+		{
+			char	  **optlines;
+
+			optlines = readfile(gtmopts_file);
+			if (optlines == NULL)
+			{
+				write_stderr(_("%s: could not read file \"%s\"\n"), progname, gtmopts_file);
+				exit(1);
+			}
+			else if (optlines[0] == NULL || optlines[1] != NULL)
+			{
+				write_stderr(_("%s: option file \"%s\" must have exactly one line\n"),
+							 progname, gtmopts_file);
+				exit(1);
+			}
+			else
+			{
+				int			len;
+				char	   *optline;
+
+				optline = optlines[0];
+				/* trim off line endings */
+				len = strcspn(optline, "\r\n");
+				optline[len] = '\0';
+
+				/*
+				 * Use the trimmed line as the option string.  This used to
+				 * assign an uninitialized pointer instead.
+				 * NOTE(review): assumes the file holds only the options,
+				 * with no leading program name -- confirm against the code
+				 * that writes the opts file.
+				 */
+				gtm_opts = optline;
+			}
+		}
+	}
+}
+
+static void
+do_start(void)
+{
+	pgpid_t		pid;
+	pgpid_t		old_pid = 0;
+	int			exitcode;
+
+	if (ctl_command != RESTART_COMMAND)
+	{
+		old_pid = get_pgpid();
+		if (old_pid != 0)
+			write_stderr(_("%s: another server might be running; "
+						   "trying to start server anyway\n"),
+						 progname);
+	}
+
+	read_gtm_opts();
+
+	exitcode = start_gtm();
+	if (exitcode != 0)
+	{
+		write_stderr(_("%s: could not start server: exit code was %d\n"),
+					 progname, exitcode);
+		exit(1);
+	
} + + if (old_pid != 0) + { + sleep(1); + pid = get_pgpid(); + if (pid == old_pid) + { + write_stderr(_("%s: could not start server\n" + "Examine the log output.\n"), + progname); + exit(1); + } + } + + if (do_wait) + { + print_msg(_("waiting for server to start...")); + + if (test_gtm_connection() == false) + { + printf(_("could not start server\n")); + exit(1); + } + else + { + print_msg(_(" done\n")); + print_msg(_("server started\n")); + } + } + else + print_msg(_("server starting\n")); +} + + +static void +do_stop(void) +{ + int cnt; + pgpid_t pid; + + pid = get_pgpid(); + + if (pid == 0) /* no pid file */ + { + write_stderr(_("%s: PID file \"%s\" does not exist\n"), progname, pid_file); + write_stderr(_("Is server running?\n")); + exit(1); + } + else if (pid < 0) /* standalone backend, not gtm */ + { + pid = -pid; + write_stderr(_("%s: cannot stop server; " + "single-user server is running (PID: %ld)\n"), + progname, pid); + exit(1); + } + + if (kill((pid_t) pid, sig) != 0) + { + write_stderr(_("%s: could not send stop signal (PID: %ld): %s\n"), progname, pid, + strerror(errno)); + exit(1); + } + + if (!do_wait) + { + print_msg(_("server shutting down\n")); + return; + } + else + { + print_msg(_("waiting for server to shut down...")); + + for (cnt = 0; cnt < wait_seconds; cnt++) + { + if ((pid = get_pgpid()) != 0) + { + print_msg("."); + sleep(1); /* 1 sec */ + } + else + break; + } + + if (pid != 0) /* pid file still exists */ + { + print_msg(_(" failed\n")); + + write_stderr(_("%s: server does not shut down\n"), progname); + exit(1); + } + print_msg(_(" done\n")); + + printf(_("server stopped\n")); + } +} + + +/* + * restart/reload routines + */ + +static void +do_restart(void) +{ + int cnt; + pgpid_t pid; + + pid = get_pgpid(); + + if (pid == 0) /* no pid file */ + { + write_stderr(_("%s: PID file \"%s\" does not exist\n"), + progname, pid_file); + write_stderr(_("Is server running?\n")); + write_stderr(_("starting server anyway\n")); + do_start(); + 
return; + } + else if (pid < 0) /* standalone backend, not gtm */ + { + pid = -pid; + if (gtm_is_alive((pid_t) pid)) + { + write_stderr(_("%s: cannot restart server; " + "single-user server is running (PID: %ld)\n"), + progname, pid); + write_stderr(_("Please terminate the single-user server and try again.\n")); + exit(1); + } + } + + if (gtm_is_alive((pid_t) pid)) + { + if (kill((pid_t) pid, sig) != 0) + { + write_stderr(_("%s: could not send stop signal (PID: %ld): %s\n"), progname, pid, + strerror(errno)); + exit(1); + } + + print_msg(_("waiting for server to shut down...")); + + /* always wait for restart */ + + for (cnt = 0; cnt < wait_seconds; cnt++) + { + if ((pid = get_pgpid()) != 0) + { + print_msg("."); + sleep(1); /* 1 sec */ + } + else + break; + } + + if (pid != 0) /* pid file still exists */ + { + print_msg(_(" failed\n")); + + write_stderr(_("%s: server does not shut down\n"), progname); + exit(1); + } + + print_msg(_(" done\n")); + printf(_("server stopped\n")); + } + else + { + write_stderr(_("%s: old server process (PID: %ld) seems to be gone\n"), + progname, pid); + write_stderr(_("starting server anyway\n")); + } + + do_start(); +} + + +/* + * utility routines + */ + +static bool +gtm_is_alive(pid_t pid) +{ + /* + * Test to see if the process is still there. Note that we do not + * consider an EPERM failure to mean that the process is still there; + * EPERM must mean that the given PID belongs to some other userid, and + * considering the permissions on $GTMDATA, that means it's not the + * gtm we are after. + * + * Don't believe that our own PID or parent shell's PID is the gtm, + * either. (Windows hasn't got getppid(), though.) 
+ */ + if (pid == getpid()) + return false; +#ifndef WIN32 + if (pid == getppid()) + return false; +#endif + if (kill(pid, 0) == 0) + return true; + return false; +} + +static void +do_advice(void) +{ + write_stderr(_("Try \"%s --help\" for more information.\n"), progname); +} + + +static void +do_help(void) +{ + printf(_("%s is a utility to start, stop or restart,\n" + "a GTM server or GTM proxy.\n\n"), progname); + printf(_("Usage:\n")); + printf(_(" %s start -S STARTUP_MODE [-w] [-t SECS] [-D DATADIR] [-s] [-l FILENAME] [-o \"OPTIONS\"]\n"), progname); + printf(_(" %s stop -S STARTUP_MODE [-W] [-t SECS] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n"), progname); + printf(_(" %s restart -S STARTUP_MODE [-w] [-t SECS] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n" + " [-o \"OPTIONS\"]\n"), progname); + + printf(_("\nCommon options:\n")); + printf(_(" -D DATADIR location of the database storage area\n")); + printf(_(" -S set gtm or gtm_proxy to launch one of them\n")); + printf(_(" -s, only print errors, no informational messages\n")); + printf(_(" -t SECS seconds to wait when using -w option\n")); + printf(_(" -w wait until operation completes\n")); + printf(_(" -W do not wait until operation completes\n")); + printf(_(" --help show this help, then exit\n")); + printf(_("(The default is to wait for shutdown, but not for start or restart.)\n\n")); + + printf(_("\nOptions for start or restart:\n")); + printf(_(" -S STARTUP-MODE can be \"gtm\" or \"gtm_proxy\"\n")); + printf(_(" -l FILENAME write (or append) server log to FILENAME\n")); + printf(_(" -o OPTIONS command line options to pass to gtm\n" + " (GTM server executable)\n")); + printf(_(" -p PATH-TO-GTM/PROXY path to gtm/gtm_proxy executables\n")); + printf(_("\nOptions for stop or restart:\n")); + printf(_(" -m SHUTDOWN-MODE can be \"smart\", \"fast\", or \"immediate\"\n")); + + printf(_("\nShutdown modes are:\n")); + printf(_(" smart quit after all clients have disconnected\n")); + printf(_(" fast quit directly, with 
proper shutdown\n")); + printf(_(" immediate quit without complete shutdown; will lead to recovery on restart\n")); +} + + +static void +set_mode(char *modeopt) +{ + if (strcmp(modeopt, "s") == 0 || strcmp(modeopt, "smart") == 0) + { + shutdown_mode = SMART_MODE; + sig = SIGTERM; + } + else if (strcmp(modeopt, "f") == 0 || strcmp(modeopt, "fast") == 0) + { + shutdown_mode = FAST_MODE; + sig = SIGINT; + } + else if (strcmp(modeopt, "i") == 0 || strcmp(modeopt, "immediate") == 0) + { + shutdown_mode = IMMEDIATE_MODE; + sig = SIGQUIT; + } + else + { + write_stderr(_("%s: unrecognized shutdown mode \"%s\"\n"), progname, modeopt); + do_advice(); + exit(1); + } +} + +int +main(int argc, char **argv) +{ + int c; + + progname = "gtm_ctl"; + + /* + * save argv[0] so do_start() can look for the gtm if necessary. we + * don't look for gtm here because in many cases we won't need it. + */ + argv0 = argv[0]; + + umask(077); + + /* support --help and --version even if invoked as root */ + if (argc > 1) + { + if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0 || + strcmp(argv[1], "-?") == 0) + { + do_help(); + exit(0); + } + } + + /* + * Disallow running as root, to forestall any possible security holes. + */ + if (geteuid() == 0) + { + write_stderr(_("%s: cannot be run as root\n" + "Please log in (using, e.g., \"su\") as the " + "(unprivileged) user that will\n" + "own the server process.\n"), + progname); + exit(1); + } + + /* + * 'Action' can be before or after args so loop over both. Some + * getopt_long() implementations will reorder argv[] to place all flags + * first (GNU?), but we don't rely on it. Our /port version doesn't do + * that. 
+	 */
+	optind = 1;
+
+	/* process command-line options */
+	while (optind < argc)
+	{
+		/*
+		 * 's' has to be listed here: the help text documents -s and
+		 * print_msg() honours silent_mode, but without it the flag could
+		 * never be set.
+		 */
+		while ((c = getopt(argc, argv, "D:l:m:o:p:sS:t:wW")) != -1)
+		{
+			switch (c)
+			{
+				case 'D':
+					{
+						char	   *gtmdata_D;
+						char	   *env_var = pg_malloc(strlen(optarg) + 9);
+
+						gtmdata_D = xstrdup(optarg);
+						canonicalize_path(gtmdata_D);
+						snprintf(env_var, strlen(optarg) + 9, "GTMDATA=%s",
+								 gtmdata_D);
+						/* env_var must stay allocated: putenv() keeps the pointer */
+						putenv(env_var);
+
+						/*
+						 * We could pass GTMDATA just in an environment
+						 * variable but we do -D too for clearer gtm
+						 * 'ps' display
+						 */
+						gtmdata_opt = pg_malloc(strlen(gtmdata_D) + 8);
+						snprintf(gtmdata_opt, strlen(gtmdata_D) + 8,
+								 "-D \"%s\" ",
+								 gtmdata_D);
+						break;
+					}
+				case 'l':
+					log_file = xstrdup(optarg);
+					break;
+				case 'm':
+					set_mode(optarg);
+					break;
+				case 'o':
+					gtm_opts = xstrdup(optarg);
+					break;
+				case 'p':
+					gtm_path = xstrdup(optarg);
+					canonicalize_path(gtm_path);
+					break;
+				case 's':
+					silent_mode = true;
+					break;
+				case 'S':
+					gtm_app = xstrdup(optarg);
+					if (strcmp(gtm_app,"gtm_proxy") != 0
+						&& strcmp(gtm_app,"gtm") != 0)
+					{
+						write_stderr(_("%s: %s launch name set not correct\n"), progname, gtm_app);
+						do_advice();
+						exit(1);
+					}
+					break;
+				case 't':
+					wait_seconds = atoi(optarg);
+					break;
+				case 'w':
+					do_wait = true;
+					wait_set = true;
+					break;
+				case 'W':
+					do_wait = false;
+					wait_set = true;
+					break;
+				default:
+					/* getopt already issued a suitable error message */
+					do_advice();
+					exit(1);
+			}
+		}
+
+		/* Process an action */
+		if (optind < argc)
+		{
+			/* only one of start/stop/restart may be given */
+			if (ctl_command != NO_COMMAND)
+			{
+				write_stderr(_("%s: too many command-line arguments (first is \"%s\")\n"), progname, argv[optind]);
+				do_advice();
+				exit(1);
+			}
+
+			if (strcmp(argv[optind], "start") == 0)
+				ctl_command = START_COMMAND;
+			else if (strcmp(argv[optind], "stop") == 0)
+				ctl_command = STOP_COMMAND;
+			else if (strcmp(argv[optind], "restart") == 0)
+				ctl_command = RESTART_COMMAND;
+			else
+			{
+				write_stderr(_("%s: unrecognized operation mode \"%s\"\n"), progname, argv[optind]);
+				do_advice();
+				exit(1);
+			}
+			optind++;
+		}
+ } + + if (ctl_command == NO_COMMAND) + { + write_stderr(_("%s: no operation specified\n"), progname); + do_advice(); + exit(1); + } + + gtm_data = getenv("GTMDATA"); + + if (gtm_data) + { + gtm_data = xstrdup(gtm_data); + canonicalize_path(gtm_data); + } + + if (!gtm_data) + { + write_stderr("%s: no database directory specified \n", + progname); + do_advice(); + exit(1); + } + + /* + * pid files of gtm and gtm proxy are named differently + * -S option has also to be set for STOP_COMMAND + * or gtm_ctl will not be able to find the correct pid_file + */ + if (!gtm_app) + { + write_stderr("%s: launcher name non specified, see option -S\n", + progname); + do_advice(); + exit(1); + } + + if (!wait_set) + { + switch (ctl_command) + { + case RESTART_COMMAND: + case START_COMMAND: + do_wait = false; + break; + case STOP_COMMAND: + do_wait = true; + break; + default: + break; + } + } + + if (gtm_data) + { + if (strcmp(gtm_app,"gtm_proxy") == 0) + { + snprintf(pid_file, MAXPGPATH, "%s/gtm_proxy.pid", gtm_data); + snprintf(gtmopts_file, MAXPGPATH, "%s/gtm_proxy.opts", gtm_data); + } + else if (strcmp(gtm_app,"gtm") == 0) + { + snprintf(pid_file, MAXPGPATH, "%s/gtm.pid", gtm_data); + snprintf(gtmopts_file, MAXPGPATH, "%s/gtm.opts", gtm_data); + } + } + + switch (ctl_command) + { + case START_COMMAND: + do_start(); + break; + case STOP_COMMAND: + do_stop(); + break; + case RESTART_COMMAND: + do_restart(); + break; + default: + break; + } + + exit(0); +} diff --git a/src/gtm/libpq/Makefile b/src/gtm/libpq/Makefile new file mode 100644 index 0000000000..9036ba8547 --- /dev/null +++ b/src/gtm/libpq/Makefile @@ -0,0 +1,22 @@ +# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + +top_build_dir=../.. 
+include $(top_build_dir)/gtm/Makefile.global + +NAME=pqcomm +SO_MAJOR_VERSION= 1 +SO_MINOR_VERSION= 0 + +OBJS=ip.o pqcomm.o pqformat.o strlcpy.o pqsignal.o + +all:all-lib + +include $(top_build_dir)/Makefile.shlib + +# Remove the objects and every flavour of the library, including the static +# archive libpqcomm.a that src/backend links against; leaving it behind would +# let a later build pick up stale code. +clean: + rm -f $(OBJS) + rm -f libpqcomm.a libpqcomm.so libpqcomm.so.1 libpqcomm.so.1.0 + +distclean: clean + +maintainer-clean: distclean diff --git a/src/gtm/libpq/ip.c b/src/gtm/libpq/ip.c new file mode 100644 index 0000000000..561161410d --- /dev/null +++ b/src/gtm/libpq/ip.c @@ -0,0 +1,324 @@ +/*------------------------------------------------------------------------- + * + * ip.c + * IPv6-aware network access. + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/libpq/ip.c,v 1.43 2009/01/01 17:23:42 momjian Exp $ + * + * This file and the IPV6 implementation were initially provided by + * Nigel Kukard <[email protected]>, Linux Based Systems Design + * http://www.lbsd.net.
+ * + *------------------------------------------------------------------------- + */ + +/* This is intended to be used in both frontend and backend, so use c.h */ +#include "gtm/gtm_c.h" + +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <netdb.h> +#include <netinet/in.h> +#ifdef HAVE_NETINET_TCP_H +#include <netinet/tcp.h> +#endif +#include <arpa/inet.h> +#include <sys/file.h> + +#include "gtm/ip.h" + + +static int range_sockaddr_AF_INET(const struct sockaddr_in * addr, + const struct sockaddr_in * netaddr, + const struct sockaddr_in * netmask); + +#ifdef HAVE_IPV6 +static int range_sockaddr_AF_INET6(const struct sockaddr_in6 * addr, + const struct sockaddr_in6 * netaddr, + const struct sockaddr_in6 * netmask); +#endif + + +/* + * pg_getaddrinfo_all - get address info for Unix, IPv4 and IPv6 sockets + */ +int +pg_getaddrinfo_all(const char *hostname, const char *servname, + const struct addrinfo * hintp, struct addrinfo ** result) +{ + int rc; + + /* not all versions of getaddrinfo() zero *result on failure */ + *result = NULL; + + /* NULL has special meaning to getaddrinfo(). */ + rc = getaddrinfo((!hostname || hostname[0] == '\0') ? NULL : hostname, + servname, hintp, result); + + return rc; +} + + +/* + * pg_freeaddrinfo_all - free addrinfo structures for IPv4, IPv6, or Unix + * + * Note: the ai_family field of the original hint structure must be passed + * so that we can tell whether the addrinfo struct was built by the system's + * getaddrinfo() routine or our own getaddrinfo_unix() routine. Some versions + * of getaddrinfo() might be willing to return AF_UNIX addresses, so it's + * not safe to look at ai_family in the addrinfo itself. 
+ */ +void +pg_freeaddrinfo_all(int hint_ai_family, struct addrinfo * ai) +{ + { + /* + * hint_ai_family is not consulted in this version: every list is + * released with freeaddrinfo(), i.e. it is taken to have been built + * by getaddrinfo(). A NULL list is accepted and ignored. + */ + if (ai != NULL) + freeaddrinfo(ai); + } +} + + +/* + * pg_getnameinfo_all - get name info for Unix, IPv4 and IPv6 sockets + * + * The API of this routine differs from the standard getnameinfo() definition + * in two ways: first, the addr parameter is declared as sockaddr_storage + * rather than struct sockaddr, and second, the node and service fields are + * guaranteed to be filled with something even on failure return. + * + * The return value is getnameinfo()'s result code, passed through unchanged. + */ +int +pg_getnameinfo_all(const struct sockaddr_storage * addr, int salen, + char *node, int nodelen, + char *service, int servicelen, + int flags) +{ + int rc; + + rc = getnameinfo((const struct sockaddr *) addr, salen, + node, nodelen, + service, servicelen, + flags); + + /* on failure, put a "???" placeholder into each non-NULL output buffer */ + if (rc != 0) + { + if (node) + strlcpy(node, "???", nodelen); + if (service) + strlcpy(service, "???", servicelen); + } + + return rc; +} + +/* + * pg_range_sockaddr - is addr within the subnet specified by netaddr/netmask ? + * + * Note: caller must already have verified that all three addresses are + * in the same address family; and AF_UNIX addresses are not supported.
+ */ +int +pg_range_sockaddr(const struct sockaddr_storage * addr, + const struct sockaddr_storage * netaddr, + const struct sockaddr_storage * netmask) +{ + if (addr->ss_family == AF_INET) + return range_sockaddr_AF_INET((struct sockaddr_in *) addr, + (struct sockaddr_in *) netaddr, + (struct sockaddr_in *) netmask); +#ifdef HAVE_IPV6 + else if (addr->ss_family == AF_INET6) + return range_sockaddr_AF_INET6((struct sockaddr_in6 *) addr, + (struct sockaddr_in6 *) netaddr, + (struct sockaddr_in6 *) netmask); +#endif + else + return 0; +} + +static int +range_sockaddr_AF_INET(const struct sockaddr_in * addr, + const struct sockaddr_in * netaddr, + const struct sockaddr_in * netmask) +{ + if (((addr->sin_addr.s_addr ^ netaddr->sin_addr.s_addr) & + netmask->sin_addr.s_addr) == 0) + return 1; + else + return 0; +} + + +#ifdef HAVE_IPV6 + +static int +range_sockaddr_AF_INET6(const struct sockaddr_in6 * addr, + const struct sockaddr_in6 * netaddr, + const struct sockaddr_in6 * netmask) +{ + int i; + + for (i = 0; i < 16; i++) + { + if (((addr->sin6_addr.s6_addr[i] ^ netaddr->sin6_addr.s6_addr[i]) & + netmask->sin6_addr.s6_addr[i]) != 0) + return 0; + } + + return 1; +} +#endif /* HAVE_IPV6 */ + +/* + * pg_sockaddr_cidr_mask - make a network mask of the appropriate family + * and required number of significant bits + * + * The resulting mask is placed in *mask, which had better be big enough. + * + * Return value is 0 if okay, -1 if not. 
+ */ +int +pg_sockaddr_cidr_mask(struct sockaddr_storage * mask, char *numbits, int family) +{ + long bits; + char *endptr; + + bits = strtol(numbits, &endptr, 10); + + /* reject an empty string and any trailing non-digit characters */ + if (*numbits == '\0' || *endptr != '\0') + return -1; + + switch (family) + { + case AF_INET: + { + struct sockaddr_in mask4; + long maskl; + + if (bits < 0 || bits > 32) + return -1; + /* + * Zero the whole struct first: only sin_addr is assigned below, + * yet the entire struct is copied into *mask. + */ + memset(&mask4, 0, sizeof(mask4)); + /* avoid "x << 32", which is not portable */ + if (bits > 0) + maskl = (0xffffffffUL << (32 - (int) bits)) + & 0xffffffffUL; + else + maskl = 0; + mask4.sin_addr.s_addr = htonl(maskl); + memcpy(mask, &mask4, sizeof(mask4)); + break; + } + +#ifdef HAVE_IPV6 + case AF_INET6: + { + struct sockaddr_in6 mask6; + int i; + + if (bits < 0 || bits > 128) + return -1; + /* + * Zero the whole struct first: only sin6_addr is assigned below, + * yet the entire struct is copied into *mask. + */ + memset(&mask6, 0, sizeof(mask6)); + /* fill the 16 address bytes from the most significant end */ + for (i = 0; i < 16; i++) + { + if (bits <= 0) + mask6.sin6_addr.s6_addr[i] = 0; + else if (bits >= 8) + mask6.sin6_addr.s6_addr[i] = 0xff; + else + { + mask6.sin6_addr.s6_addr[i] = + (0xff << (8 - (int) bits)) & 0xff; + } + bits -= 8; + } + memcpy(mask, &mask6, sizeof(mask6)); + break; + } +#endif + default: + return -1; + } + + /* the family is stored last, after the struct copy above */ + mask->ss_family = family; + return 0; +} + + +#ifdef HAVE_IPV6 + +/* + * pg_promote_v4_to_v6_addr --- convert an AF_INET addr to AF_INET6, using + * the standard convention for IPv4 addresses mapped into IPv6 world + * + * The passed addr is modified in place; be sure it is large enough to + * hold the result! Note that we only worry about setting the fields + * that pg_range_sockaddr will look at.
+ */ +void +pg_promote_v4_to_v6_addr(struct sockaddr_storage * addr) +{ + struct sockaddr_in addr4; + struct sockaddr_in6 addr6; + uint32 ip4addr; + + memcpy(&addr4, addr, sizeof(addr4)); + ip4addr = ntohl(addr4.sin_addr.s_addr); + + memset(&addr6, 0, sizeof(addr6)); + + addr6.sin6_family = AF_INET6; + + addr6.sin6_addr.s6_addr[10] = 0xff; + addr6.sin6_addr.s6_addr[11] = 0xff; + addr6.sin6_addr.s6_addr[12] = (ip4addr >> 24) & 0xFF; + addr6.sin6_addr.s6_addr[13] = (ip4addr >> 16) & 0xFF; + addr6.sin6_addr.s6_addr[14] = (ip4addr >> 8) & 0xFF; + addr6.sin6_addr.s6_addr[15] = (ip4addr) & 0xFF; + + memcpy(addr, &addr6, sizeof(addr6)); +} + +/* + * pg_promote_v4_to_v6_mask --- convert an AF_INET netmask to AF_INET6, using + * the standard convention for IPv4 addresses mapped into IPv6 world + * + * This must be different from pg_promote_v4_to_v6_addr because we want to + * set the high-order bits to 1's not 0's. + * + * The passed addr is modified in place; be sure it is large enough to + * hold the result! Note that we only worry about setting the fields + * that pg_range_sockaddr will look at. 
+ */ +void +pg_promote_v4_to_v6_mask(struct sockaddr_storage * addr) +{ + struct sockaddr_in addr4; + struct sockaddr_in6 addr6; + uint32 ip4addr; + int i; + + memcpy(&addr4, addr, sizeof(addr4)); + ip4addr = ntohl(addr4.sin_addr.s_addr); + + memset(&addr6, 0, sizeof(addr6)); + + addr6.sin6_family = AF_INET6; + + for (i = 0; i < 12; i++) + addr6.sin6_addr.s6_addr[i] = 0xff; + + addr6.sin6_addr.s6_addr[12] = (ip4addr >> 24) & 0xFF; + addr6.sin6_addr.s6_addr[13] = (ip4addr >> 16) & 0xFF; + addr6.sin6_addr.s6_addr[14] = (ip4addr >> 8) & 0xFF; + addr6.sin6_addr.s6_addr[15] = (ip4addr) & 0xFF; + + memcpy(addr, &addr6, sizeof(addr6)); +} + +#endif /* HAVE_IPV6 */ diff --git a/src/gtm/libpq/pqcomm.c b/src/gtm/libpq/pqcomm.c new file mode 100644 index 0000000000..e697a7f4b1 --- /dev/null +++ b/src/gtm/libpq/pqcomm.c @@ -0,0 +1,1130 @@ +/*------------------------------------------------------------------------- + * + * pqcomm.c + * Communication functions between the Frontend and the Backend + * + * These routines handle the low-level details of communication between + * frontend and backend. They just shove data across the communication + * channel, and are ignorant of the semantics of the data --- or would be, + * except for major brain damage in the design of the old COPY OUT protocol. + * Unfortunately, COPY OUT was designed to commandeer the communication + * channel (it just transfers data without wrapping it into messages). + * No other messages can be sent while COPY OUT is in progress; and if the + * copy is aborted by an ereport(ERROR), we need to close out the copy so that + * the frontend gets back into sync. Therefore, these routines have to be + * aware of COPY OUT state. (New COPY-OUT is message-based and does *not* + * set the DoingCopyOut flag.) + * + * NOTE: generally, it's a bad idea to emit outgoing messages directly with + * pq_putbytes(), especially if the message would require multiple calls + * to send. 
Instead, use the routines in pqformat.c to construct the message + * in a buffer and then emit it in one call to pq_putmessage. This ensures + * that the channel will not be clogged by an incomplete message if execution + * is aborted by ereport(ERROR) partway through the message. The only + * non-libpq code that should call pq_putbytes directly is old-style COPY OUT. + * + * At one time, libpq was shared between frontend and backend, but now + * the backend's "backend/libpq" is quite separate from "interfaces/libpq". + * All that remains is similarities of names to trap the unwary... + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL: pgsql/src/backend/libpq/pqcomm.c,v 1.198 2008/01/01 19:45:49 momjian Exp $ + * + *------------------------------------------------------------------------- + */ + +/*------------------------ + * INTERFACE ROUTINES + * + * setup/teardown: + * StreamServerPort - Open postmaster's server port + * StreamConnection - Create new connection with client + * StreamClose - Close a client/backend connection + * TouchSocketFile - Protect socket file against /tmp cleaners + * pq_init - initialize libpq at backend startup + * pq_comm_reset - reset libpq during error recovery + * pq_close - shutdown libpq at backend exit + * + * low-level I/O: + * pq_getbytes - get a known number of bytes from connection + * pq_getstring - get a null terminated string from connection + * pq_getmessage - get a message with length word from connection + * pq_getbyte - get next byte from connection + * pq_peekbyte - peek at next byte from connection + * pq_putbytes - send bytes to connection (not flushed until pq_flush) + * pq_flush - flush pending output + * + * message-level I/O (and old-style-COPY-OUT cruft): + * pq_putmessage - send a normal message (suppressed in COPY OUT 
mode) + * pq_startcopyout - inform libpq that a COPY OUT transfer is beginning + * pq_endcopyout - end a COPY OUT transfer + * + *------------------------ + */ + +#include <signal.h> +#include <fcntl.h> +#include <grp.h> +#include <unistd.h> +#include <sys/file.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <netdb.h> +#include <netinet/in.h> +#ifdef HAVE_NETINET_TCP_H +#include <netinet/tcp.h> +#endif +#include <arpa/inet.h> +#ifdef HAVE_UTIME_H +#include <utime.h> +#endif + +#include "gtm/gtm_c.h" +#include "gtm/ip.h" +#include "gtm/libpq.h" +#include "gtm/libpq-be.h" +#include "gtm/elog.h" + +#define MAXGTMPATH 256 + +/* Where the Unix socket file is */ +static char sock_path[MAXGTMPATH]; + +static int tcp_keepalives_idle; +static int tcp_keepalives_interval; +static int tcp_keepalives_count; + + +/* + * Buffers for low-level I/O + */ + +/* Internal functions */ +static int internal_putbytes(Port *myport, const char *s, size_t len); +static int internal_flush(Port *myport); + +/* + * Streams -- wrapper around Unix socket system calls + * + * + * Stream functions are used for vanilla TCP connection protocol. + */ + + +/* + * StreamServerPort -- open a "listening" port to accept connections. + * + * Successfully opened sockets are added to the ListenSocket[] array, + * at the first position that isn't -1. 
+ * + * RETURNS: STATUS_OK or STATUS_ERROR + */ + +int +StreamServerPort(int family, char *hostName, unsigned short portNumber, + int ListenSocket[], int MaxListen) +{ + int fd, + err; + int maxconn; + int ret; + char portNumberStr[32]; + const char *familyDesc; + char familyDescBuf[64]; + char *service; + struct addrinfo *addrs = NULL, + *addr; + struct addrinfo hint; + int listen_index = 0; + int added = 0; + +#if !defined(WIN32) || defined(IPV6_V6ONLY) + int one = 1; +#endif + + /* Initialize hint structure */ + MemSet(&hint, 0, sizeof(hint)); + hint.ai_family = family; + hint.ai_flags = AI_PASSIVE; + hint.ai_socktype = SOCK_STREAM; + + { + snprintf(portNumberStr, sizeof(portNumberStr), "%d", portNumber); + service = portNumberStr; + } + + ret = pg_getaddrinfo_all(hostName, service, &hint, &addrs); + if (ret || !addrs) + { + if (hostName) + ereport(LOG, + (errmsg("could not translate host name \"%s\", service \"%s\" to address: %s", + hostName, service, gai_strerror(ret)))); + else + ereport(LOG, + (errmsg("could not translate service \"%s\" to address: %s", + service, gai_strerror(ret)))); + if (addrs) + pg_freeaddrinfo_all(hint.ai_family, addrs); + return STATUS_ERROR; + } + + for (addr = addrs; addr; addr = addr->ai_next) + { + if (!IS_AF_UNIX(family) && IS_AF_UNIX(addr->ai_family)) + { + /* + * Only set up a unix domain socket when they really asked for it. + * The service/port is different in that case. + */ + continue; + } + + /* See if there is still room to add 1 more socket. 
*/ + for (; listen_index < MaxListen; listen_index++) + { + if (ListenSocket[listen_index] == -1) + break; + } + if (listen_index >= MaxListen) + { + ereport(LOG, + (errmsg("could not bind to all requested addresses: MAXLISTEN (%d) exceeded", + MaxListen))); + break; + } + + /* set up family name for possible error messages */ + switch (addr->ai_family) + { + case AF_INET: + familyDesc = "IPv4"; + break; +#ifdef HAVE_IPV6 + case AF_INET6: + familyDesc = "IPv6"; + break; +#endif + default: + snprintf(familyDescBuf, sizeof(familyDescBuf), + "unrecognized address family %d", + addr->ai_family); + familyDesc = familyDescBuf; + break; + } + + if ((fd = socket(addr->ai_family, SOCK_STREAM, 0)) < 0) + { + ereport(LOG, + (EACCES, + /* translator: %s is IPv4, IPv6, or Unix */ + errmsg("could not create %s socket: %m", + familyDesc))); + continue; + } + +#ifndef WIN32 + + /* + * Without the SO_REUSEADDR flag, a new postmaster can't be started + * right away after a stop or crash, giving "address already in use" + * error on TCP ports. + * + * On win32, however, this behavior only happens if the + * SO_EXCLUSIVEADDRUSE is set. With SO_REUSEADDR, win32 allows multiple + * servers to listen on the same address, resulting in unpredictable + * behavior. With no flags at all, win32 behaves as Unix with + * SO_REUSEADDR.
+ */ + if (!IS_AF_UNIX(addr->ai_family)) + { + if ((setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, + (char *) &one, sizeof(one))) == -1) + { + ereport(LOG, + (EACCES, + errmsg("setsockopt(SO_REUSEADDR) failed: %m"))); + close(fd); + continue; + } + } +#endif + +#ifdef IPV6_V6ONLY + if (addr->ai_family == AF_INET6) + { + if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, + (char *) &one, sizeof(one)) == -1) + { + ereport(LOG, + (EACCES, + errmsg("setsockopt(IPV6_V6ONLY) failed: %m"))); + close(fd); + continue; + } + } +#endif + + /* + * Note: This might fail on some OS's, like Linux older than + * 2.4.21-pre3, that don't have the IPV6_V6ONLY socket option, and map + * ipv4 addresses to ipv6. It will show ::ffff:ipv4 for all ipv4 + * connections. + */ + err = bind(fd, addr->ai_addr, addr->ai_addrlen); + if (err < 0) + { + ereport(LOG, + (EACCES, + /* translator: %s is IPv4, IPv6, or Unix */ + errmsg("could not bind %s socket: %m", + familyDesc), + (IS_AF_UNIX(addr->ai_family)) ? + errhint("Is another postmaster already running on port %d?" + " If not, remove socket file \"%s\" and retry.", + (int) portNumber, sock_path) : + errhint("Is another postmaster already running on port %d?" + " If not, wait a few seconds and retry.", + (int) portNumber))); + close(fd); + continue; + } + +#define GTM_MAX_CONNECTIONS 1024 + + /* + * Select appropriate accept-queue length limit. PG_SOMAXCONN is only + * intended to provide a clamp on the request on platforms where an + * overly large request provokes a kernel error (are there any?). 
+ */ + maxconn = GTM_MAX_CONNECTIONS * 2; + + err = listen(fd, maxconn); + if (err < 0) + { + ereport(LOG, + (EACCES, + /* translator: %s is IPv4, IPv6, or Unix */ + errmsg("could not listen on %s socket: %m", + familyDesc))); + close(fd); + continue; + } + ListenSocket[listen_index] = fd; + added++; + } + + pg_freeaddrinfo_all(hint.ai_family, addrs); + + if (!added) + return STATUS_ERROR; + + return STATUS_OK; +} + + +/* + * StreamConnection -- create a new connection with client using + * server port. Set port->sock to the FD of the new connection. + * + * ASSUME: that this doesn't need to be non-blocking because + * the Postmaster uses select() to tell when the server master + * socket is ready for accept(). + * + * RETURNS: STATUS_OK or STATUS_ERROR + */ +int +StreamConnection(int server_fd, Port *port) +{ + /* accept connection and fill in the client (remote) address */ + port->raddr.salen = sizeof(port->raddr.addr); + if ((port->sock = accept(server_fd, + (struct sockaddr *) & port->raddr.addr, + &port->raddr.salen)) < 0) + { + ereport(LOG, + (EACCES, + errmsg("could not accept new connection: %m"))); + + /* + * If accept() fails then postmaster.c will still see the server + * socket as read-ready, and will immediately try again. To avoid + * uselessly sucking lots of CPU, delay a bit before trying again. + * (The most likely reason for failure is being out of kernel file + * table slots; we can do little except hope some will get freed up.) + */ + /* pg_usleep(100000L); */ /* wait 0.1 sec */ + return STATUS_ERROR; + } + +#ifdef SCO_ACCEPT_BUG + + /* + * UnixWare 7+ and OpenServer 5.0.4 are known to have this bug, but it + * shouldn't hurt to catch it for all versions of those platforms. 
+ */ + if (port->raddr.addr.ss_family == 0) + port->raddr.addr.ss_family = AF_UNIX; +#endif + + /* fill in the server (local) address */ + port->laddr.salen = sizeof(port->laddr.addr); + if (getsockname(port->sock, + (struct sockaddr *) & port->laddr.addr, + &port->laddr.salen) < 0) + { + elog(LOG, "getsockname() failed: %m"); + return STATUS_ERROR; + } + + /* select NODELAY and KEEPALIVE options if it's a TCP connection */ + if (!IS_AF_UNIX(port->laddr.addr.ss_family)) + { + int on; + +#ifdef TCP_NODELAY + on = 1; + if (setsockopt(port->sock, IPPROTO_TCP, TCP_NODELAY, + (char *) &on, sizeof(on)) < 0) + { + elog(LOG, "setsockopt(TCP_NODELAY) failed: %m"); + return STATUS_ERROR; + } +#endif + on = 1; + if (setsockopt(port->sock, SOL_SOCKET, SO_KEEPALIVE, + (char *) &on, sizeof(on)) < 0) + { + elog(LOG, "setsockopt(SO_KEEPALIVE) failed: %m"); + return STATUS_ERROR; + } + + /* + * Also apply the current keepalive parameters. If we fail to set a + * parameter, don't error out, because these aren't universally + * supported. (Note: you might think we need to reset the GUC + * variables to 0 in such a case, but it's not necessary because the + * show hooks for these variables report the truth anyway.) + */ + (void) pq_setkeepalivesidle(tcp_keepalives_idle, port); + (void) pq_setkeepalivesinterval(tcp_keepalives_interval, port); + (void) pq_setkeepalivescount(tcp_keepalives_count, port); + } + + return STATUS_OK; +} + +/* + * StreamClose -- close a client/backend connection + * + * NOTE: this is NOT used to terminate a session; it is just used to release + * the file descriptor in a process that should no longer have the socket + * open. (For example, the postmaster calls this after passing ownership + * of the connection to a child process.) It is expected that someone else + * still has the socket open. So, we only want to close the descriptor, + * we do NOT want to send anything to the far end. 
+ */ +void +StreamClose(int sock) +{ + close(sock); +} + +/* + * TouchSocketFile -- mark socket file as recently accessed + * + * This routine should be called every so often to ensure that the socket + * file has a recent mod date (ordinary operations on sockets usually won't + * change the mod date). That saves it from being removed by + * overenthusiastic /tmp-directory-cleaner daemons. (Another reason we should + * never have put the socket file in /tmp...) + */ +void +TouchSocketFile(void) +{ + /* Do nothing if we did not create a socket... */ + if (sock_path[0] != '\0') + { + /* + * utime() is POSIX standard, utimes() is a common alternative. If we + * have neither, there's no way to affect the mod or access time of + * the socket :-( + * + * In either path, we ignore errors; there's no point in complaining. + */ +#ifdef HAVE_UTIME + utime(sock_path, NULL); +#else /* !HAVE_UTIME */ +#ifdef HAVE_UTIMES + utimes(sock_path, NULL); +#endif /* HAVE_UTIMES */ +#endif /* HAVE_UTIME */ + } +} + + +/* -------------------------------- + * Low-level I/O routines begin here. + * + * These routines communicate with a frontend client across a connection + * already established by the preceding routines. 
+ * -------------------------------- + */ + + +/* -------------------------------- + * pq_recvbuf - load some bytes into the input buffer + * + * returns 0 if OK, EOF if trouble + * -------------------------------- + */ +static int +pq_recvbuf(Port *myport) +{ + if (myport->PqRecvPointer > 0) + { + if (myport->PqRecvLength > myport->PqRecvPointer) + { + /* still some unread data, left-justify it in the buffer */ + memmove(myport->PqRecvBuffer, myport->PqRecvBuffer + myport->PqRecvPointer, + myport->PqRecvLength - myport->PqRecvPointer); + myport->PqRecvLength -= myport->PqRecvPointer; + myport->PqRecvPointer = 0; + } + else + myport->PqRecvLength = myport->PqRecvPointer = 0; + } + + /* Can fill buffer from myport->PqRecvLength and upwards */ + for (;;) + { + int r; + + r = recv(myport->sock, myport->PqRecvBuffer + myport->PqRecvLength, + PQ_BUFFER_SIZE - myport->PqRecvLength, 0); + + if (r < 0) + { + if (errno == EINTR) + continue; /* Ok if interrupted */ + + /* + * Careful: an ereport() that tries to write to the client would + * cause recursion to here, leading to stack overflow and core + * dump! This message must go *only* to the postmaster log. + */ + ereport(COMMERROR, + (EACCES, + errmsg("could not receive data from client: %m"))); + return EOF; + } + if (r == 0) + { + /* + * EOF detected. We used to write a log message here, but it's + * better to expect the ultimate caller to do that. 
+ */ + return EOF; + } + /* r contains number of bytes read, so just incr length */ + myport->PqRecvLength += r; + return 0; + } +} + +/* -------------------------------- + * pq_getbyte - get a single byte from connection, or return EOF + * -------------------------------- + */ +int +pq_getbyte(Port *myport) +{ + while (myport->PqRecvPointer >= myport->PqRecvLength) + { + if (pq_recvbuf(myport)) /* If nothing in buffer, then recv some */ + return EOF; /* Failed to recv data */ + } + return (unsigned char) myport->PqRecvBuffer[myport->PqRecvPointer++]; +} + +/* -------------------------------- + * pq_peekbyte - peek at next byte from connection + * + * Same as pq_getbyte() except we don't advance the pointer. + * -------------------------------- + */ +int +pq_peekbyte(Port *myport) +{ + while (myport->PqRecvPointer >= myport->PqRecvLength) + { + if (pq_recvbuf(myport)) /* If nothing in buffer, then recv some */ + return EOF; /* Failed to recv data */ + } + return (unsigned char) myport->PqRecvBuffer[myport->PqRecvPointer]; +} + +/* -------------------------------- + * pq_getbytes - get a known number of bytes from connection + * + * returns 0 if OK, EOF if trouble + * -------------------------------- + */ +int +pq_getbytes(Port *myport, char *s, size_t len) +{ + size_t amount; + + while (len > 0) + { + while (myport->PqRecvPointer >= myport->PqRecvLength) + { + if (pq_recvbuf(myport)) /* If nothing in buffer, then recv some */ + return EOF; /* Failed to recv data */ + } + amount = myport->PqRecvLength - myport->PqRecvPointer; + if (amount > len) + amount = len; + memcpy(s, myport->PqRecvBuffer + myport->PqRecvPointer, amount); + myport->PqRecvPointer += amount; + s += amount; + len -= amount; + } + return 0; +} + +/* -------------------------------- + * pq_discardbytes - throw away a known number of bytes + * + * same as pq_getbytes except we do not copy the data to anyplace. + * this is used for resynchronizing after read errors. 
+ * + * returns 0 if OK, EOF if trouble + * -------------------------------- + */ +static int +pq_discardbytes(Port *myport, size_t len) +{ + size_t amount; + + while (len > 0) + { + while (myport->PqRecvPointer >= myport->PqRecvLength) + { + if (pq_recvbuf(myport)) /* If nothing in buffer, then recv some */ + return EOF; /* Failed to recv data */ + } + amount = myport->PqRecvLength - myport->PqRecvPointer; + if (amount > len) + amount = len; + myport->PqRecvPointer += amount; + len -= amount; + } + return 0; +} + +/* -------------------------------- + * pq_getstring - get a null terminated string from connection + * + * The return value is placed in an expansible StringInfo, which has + * already been initialized by the caller. + * + * This is used only for dealing with old-protocol clients. The idea + * is to produce a StringInfo that looks the same as we would get from + * pq_getmessage() with a newer client; we will then process it with + * pq_getmsgstring. Therefore, no character set conversion is done here, + * even though this is presumably useful only for text. + * + * returns 0 if OK, EOF if trouble + * -------------------------------- + */ +int +pq_getstring(Port *myport, StringInfo s) +{ + int i; + + resetStringInfo(s); + + /* Read until we get the terminating '\0' */ + for (;;) + { + while (myport->PqRecvPointer >= myport->PqRecvLength) + { + if (pq_recvbuf(myport)) /* If nothing in buffer, then recv some */ + return EOF; /* Failed to recv data */ + } + + for (i = myport->PqRecvPointer; i < myport->PqRecvLength; i++) + { + if (myport->PqRecvBuffer[i] == '\0') + { + /* include the '\0' in the copy */ + appendBinaryStringInfo(s, myport->PqRecvBuffer + myport->PqRecvPointer, + i - myport->PqRecvPointer + 1); + myport->PqRecvPointer = i + 1; /* advance past \0 */ + return 0; + } + } + + /* If we're here we haven't got the \0 in the buffer yet. 
*/ + appendBinaryStringInfo(s, myport->PqRecvBuffer + myport->PqRecvPointer, + myport->PqRecvLength - myport->PqRecvPointer); + myport->PqRecvPointer = myport->PqRecvLength; + } +} + + +/* -------------------------------- + * pq_getmessage - get a message with length word from connection + * + * The return value is placed in an expansible StringInfo, which has + * already been initialized by the caller. + * Only the message body is placed in the StringInfo; the length word + * is removed. Also, s->cursor is initialized to zero for convenience + * in scanning the message contents. + * + * If maxlen is not zero, it is an upper limit on the length of the + * message we are willing to accept. We abort the connection (by + * returning EOF) if client tries to send more than that. + * + * returns 0 if OK, EOF if trouble + * -------------------------------- + */ +int +pq_getmessage(Port *myport, StringInfo s, int maxlen) +{ + int32 len; + + resetStringInfo(s); + + /* Read message length word */ + if (pq_getbytes(myport, (char *) &len, 4) == EOF) + { + ereport(COMMERROR, + (EPROTO, + errmsg("unexpected EOF within message length word"))); + return EOF; + } + + len = ntohl(len); + + if (len < 4 || + (maxlen > 0 && len > maxlen)) + { + ereport(COMMERROR, + (EPROTO, + errmsg("invalid message length"))); + return EOF; + } + + len -= 4; /* discount length itself */ + + if (len > 0) + { + /* + * Allocate space for message. If we run out of room (ridiculously + * large message), we will elog(ERROR), but we want to discard the + * message body so as not to lose communication sync. 
+ */ + enlargeStringInfo(s, len); + + /* And grab the message */ + if (pq_getbytes(myport, s->data, len) == EOF) + { + ereport(COMMERROR, + (EPROTO, + errmsg("incomplete message from client"))); + return EOF; + } + s->len = len; + /* Place a trailing null per StringInfo convention */ + s->data[len] = '\0'; + } + + return 0; +} + + +/* -------------------------------- + * pq_putbytes - send bytes to connection (not flushed until pq_flush) + * + * returns 0 if OK, EOF if trouble + * -------------------------------- + */ +int +pq_putbytes(Port *myport, const char *s, size_t len) +{ + int res; + + res = internal_putbytes(myport, s, len); + return res; +} + +/* + * internal_putbytes - append len bytes from s to the port's send buffer, + * calling internal_flush() whenever the buffer is already full. + * + * returns 0 if OK, EOF if a flush failed + */ +static int +internal_putbytes(Port *myport, const char *s, size_t len) +{ + size_t amount; + + while (len > 0) + { + /* If buffer is full, then flush it out */ + if (myport->PqSendPointer >= PQ_BUFFER_SIZE) + if (internal_flush(myport)) + return EOF; + /* copy as much as fits in the remaining buffer space */ + amount = PQ_BUFFER_SIZE - myport->PqSendPointer; + if (amount > len) + amount = len; + memcpy(myport->PqSendBuffer + myport->PqSendPointer, s, amount); + myport->PqSendPointer += amount; + s += amount; + len -= amount; + } + return 0; +} + +/* -------------------------------- + * pq_flush - flush pending output + * + * returns 0 if OK, EOF if trouble + * -------------------------------- + */ +int +pq_flush(Port *myport) +{ + int res; + + /* + * No reentrancy guard is applied here; the call is forwarded straight + * to internal_flush(). + */ + res = internal_flush(myport); + return res; +} + +/* + * internal_flush - send() everything currently held in the port's send buffer. + * + * PqSendPointer is reset to 0 on both the success and the failure path, so + * any unsent data is dropped when an error is reported. + * + * returns 0 if OK, EOF if trouble + */ +static int +internal_flush(Port *myport) +{ + /* function-static, hence shared by every Port flushed through here */ + static int last_reported_send_errno = 0; + + char *bufptr = myport->PqSendBuffer; + char *bufend = myport->PqSendBuffer + myport->PqSendPointer; + + while (bufptr < bufend) + { + int r; + + r = send(myport->sock, bufptr, bufend - bufptr, 0); + + if (r <= 0) + { + /* + * NOTE(review): r == 0 is handled like an error here, but errno is + * only meaningful when send() returned -1 -- confirm intent. + */ + if (errno == EINTR) + continue; /* Ok if we were interrupted */ + + /* + * Careful: an ereport() that tries to write to the client would + * cause recursion to here, leading to stack overflow and core + * dump!
This message must go *only* to the postmaster log. + * + * If a client disconnects while we're in the midst of output, we + * might write quite a bit of data before we get to a safe query + * abort point. So, suppress duplicate log messages. + */ + if (errno != last_reported_send_errno) + { + last_reported_send_errno = errno; + ereport(COMMERROR, + (EACCES, + errmsg("could not send data to client: %m"))); + } + + /* + * We drop the buffered data anyway so that processing can + * continue, even though we'll probably quit soon. + */ + myport->PqSendPointer = 0; + return EOF; + } + + last_reported_send_errno = 0; /* reset after any successful send */ + bufptr += r; + } + + myport->PqSendPointer = 0; + return 0; +} + + +/* -------------------------------- + * Message-level I/O routines begin here. + * + * These routines understand about the old-style COPY OUT protocol. + * -------------------------------- + */ + + +/* -------------------------------- + * pq_putmessage - send a normal message (suppressed in COPY OUT mode) + * + * If msgtype is not '\0', it is a message type code to place before + * the message body. If msgtype is '\0', then the message has no type + * code (this is only valid in pre-3.0 protocols). + * + * len is the length of the message body data at *s. In protocol 3.0 + * and later, a message length word (equal to len+4 because it counts + * itself too) is inserted by this routine. + * + * All normal messages are suppressed while old-style COPY OUT is in + * progress. (In practice only a few notice messages might get emitted + * then; dropping them is annoying, but at least they will still appear + * in the postmaster log.) + * + * We also suppress messages generated while pqcomm.c is busy. This + * avoids any possibility of messages being inserted within other + * messages. 
The only known trouble case arises if SIGQUIT occurs + * during a pqcomm.c routine --- quickdie() will try to send a warning + * message, and the most reasonable approach seems to be to drop it. + * + * returns 0 if OK, EOF if trouble + * -------------------------------- + */ +int +pq_putmessage(Port *myport, char msgtype, const char *s, size_t len) +{ + uint32 n32; + if (msgtype) + if (internal_putbytes(myport, &msgtype, 1)) + goto fail; + + n32 = htonl((uint32) (len + 4)); + if (internal_putbytes(myport, (char *) &n32, 4)) + goto fail; + + if (internal_putbytes(myport, s, len)) + goto fail; + return 0; + +fail: + return EOF; +} + + +/* + * Support for TCP Keepalive parameters + */ + +int +pq_getkeepalivesidle(Port *port) +{ +#ifdef TCP_KEEPIDLE + if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family)) + return 0; + + if (port->keepalives_idle != 0) + return port->keepalives_idle; + + if (port->default_keepalives_idle == 0) + { + ACCEPT_TYPE_ARG3 size = sizeof(port->default_keepalives_idle); + + if (getsockopt(port->sock, IPPROTO_TCP, TCP_KEEPIDLE, + (char *) &port->default_keepalives_idle, + &size) < 0) + { + elog(LOG, "getsockopt(TCP_KEEPIDLE) failed: %m"); + port->default_keepalives_idle = -1; /* don't know */ + } + } + + return port->default_keepalives_idle; +#else + return 0; +#endif +} + +int +pq_setkeepalivesidle(int idle, Port *port) +{ + if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family)) + return STATUS_OK; + +#ifdef TCP_KEEPIDLE + if (idle == port->keepalives_idle) + return STATUS_OK; + + if (port->default_keepalives_idle <= 0) + { + if (pq_getkeepalivesidle(port) < 0) + { + if (idle == 0) + return STATUS_OK; /* default is set but unknown */ + else + return STATUS_ERROR; + } + } + + if (idle == 0) + idle = port->default_keepalives_idle; + + if (setsockopt(port->sock, IPPROTO_TCP, TCP_KEEPIDLE, + (char *) &idle, sizeof(idle)) < 0) + { + elog(LOG, "setsockopt(TCP_KEEPIDLE) failed: %m"); + return STATUS_ERROR; + } + + port->keepalives_idle = 
idle; +#else + if (idle != 0) + { + elog(LOG, "setsockopt(TCP_KEEPIDLE) not supported"); + return STATUS_ERROR; + } +#endif + + return STATUS_OK; +} + +int +pq_getkeepalivesinterval(Port *port) +{ +#ifdef TCP_KEEPINTVL + if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family)) + return 0; + + if (port->keepalives_interval != 0) + return port->keepalives_interval; + + if (port->default_keepalives_interval == 0) + { + ACCEPT_TYPE_ARG3 size = sizeof(port->default_keepalives_interval); + + if (getsockopt(port->sock, IPPROTO_TCP, TCP_KEEPINTVL, + (char *) &port->default_keepalives_interval, + &size) < 0) + { + elog(LOG, "getsockopt(TCP_KEEPINTVL) failed: %m"); + port->default_keepalives_interval = -1; /* don't know */ + } + } + + return port->default_keepalives_interval; +#else + return 0; +#endif +} + +int +pq_setkeepalivesinterval(int interval, Port *port) +{ + if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family)) + return STATUS_OK; + +#ifdef TCP_KEEPINTVL + if (interval == port->keepalives_interval) + return STATUS_OK; + + if (port->default_keepalives_interval <= 0) + { + if (pq_getkeepalivesinterval(port) < 0) + { + if (interval == 0) + return STATUS_OK; /* default is set but unknown */ + else + return STATUS_ERROR; + } + } + + if (interval == 0) + interval = port->default_keepalives_interval; + + if (setsockopt(port->sock, IPPROTO_TCP, TCP_KEEPINTVL, + (char *) &interval, sizeof(interval)) < 0) + { + elog(LOG, "setsockopt(TCP_KEEPINTVL) failed: %m"); + return STATUS_ERROR; + } + + port->keepalives_interval = interval; +#else + if (interval != 0) + { + elog(LOG, "setsockopt(TCP_KEEPINTVL) not supported"); + return STATUS_ERROR; + } +#endif + + return STATUS_OK; +} + +int +pq_getkeepalivescount(Port *port) +{ +#ifdef TCP_KEEPCNT + if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family)) + return 0; + + if (port->keepalives_count != 0) + return port->keepalives_count; + + if (port->default_keepalives_count == 0) + { + ACCEPT_TYPE_ARG3 size = 
sizeof(port->default_keepalives_count); + + if (getsockopt(port->sock, IPPROTO_TCP, TCP_KEEPCNT, + (char *) &port->default_keepalives_count, + &size) < 0) + { + elog(LOG, "getsockopt(TCP_KEEPCNT) failed: %m"); + port->default_keepalives_count = -1; /* don't know */ + } + } + + return port->default_keepalives_count; +#else + return 0; +#endif +} + +int +pq_setkeepalivescount(int count, Port *port) +{ + if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family)) + return STATUS_OK; + +#ifdef TCP_KEEPCNT + if (count == port->keepalives_count) + return STATUS_OK; + + if (port->default_keepalives_count <= 0) + { + if (pq_getkeepalivescount(port) < 0) + { + if (count == 0) + return STATUS_OK; /* default is set but unknown */ + else + return STATUS_ERROR; + } + } + + if (count == 0) + count = port->default_keepalives_count; + + if (setsockopt(port->sock, IPPROTO_TCP, TCP_KEEPCNT, + (char *) &count, sizeof(count)) < 0) + { + elog(LOG, "setsockopt(TCP_KEEPCNT) failed: %m"); + return STATUS_ERROR; + } + + port->keepalives_count = count; +#else + if (count != 0) + { + elog(LOG, "setsockopt(TCP_KEEPCNT) not supported"); + return STATUS_ERROR; + } +#endif + + return STATUS_OK; +} diff --git a/src/gtm/libpq/pqformat.c b/src/gtm/libpq/pqformat.c new file mode 100644 index 0000000000..339f50a995 --- /dev/null +++ b/src/gtm/libpq/pqformat.c @@ -0,0 +1,658 @@ +/*------------------------------------------------------------------------- + * + * pqformat.c + * Routines for formatting and parsing frontend/backend messages + * + * Outgoing messages are built up in a StringInfo buffer (which is expansible) + * and then sent in a single call to pq_putmessage. This module provides data + * formatting/conversion routines that are needed to produce valid messages. 
+ * Note in particular the distinction between "raw data" and "text"; raw data + * is message protocol characters and binary values that are not subject to + * character set conversion, while text is converted by character encoding + * rules. + * + * Incoming messages are similarly read into a StringInfo buffer, via + * pq_getmessage, and then parsed and converted from that using the routines + * in this module. + * + * These same routines support reading and writing of external binary formats + * (typsend/typreceive routines). The conversion routines for individual + * data types are exactly the same, only initialization and completion + * are different. + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL: pgsql/src/backend/libpq/pqformat.c,v 1.48 2009/01/01 17:23:42 momjian Exp $ + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * Message assembly and output: + * pq_beginmessage - initialize StringInfo buffer + * pq_sendbyte - append a raw byte to a StringInfo buffer + * pq_sendint - append a binary integer to a StringInfo buffer + * pq_sendint64 - append a binary 8-byte int to a StringInfo buffer + * pq_sendfloat4 - append a float4 to a StringInfo buffer + * pq_sendfloat8 - append a float8 to a StringInfo buffer + * pq_sendbytes - append raw data to a StringInfo buffer + * pq_sendcountedtext - append a counted text string (with character set conversion) + * pq_sendtext - append a text string (with conversion) + * pq_sendstring - append a null-terminated text string (with conversion) + * pq_send_ascii_string - append a null-terminated text string (without conversion) + * pq_endmessage - send the completed message to the frontend + * Note: it is also possible to append data to the StringInfo buffer using + 
* the regular StringInfo routines, but this is discouraged since required + * character set conversion may not occur. + * + * typsend support (construct a bytea value containing external binary data): + * pq_begintypsend - initialize StringInfo buffer + * pq_endtypsend - return the completed string as a "bytea*" + * + * Special-case message output: + * pq_puttextmessage - generate a character set-converted message in one step + * pq_putemptymessage - convenience routine for message with empty body + * + * Message parsing after input: + * pq_getmsgbyte - get a raw byte from a message buffer + * pq_getmsgint - get a binary integer from a message buffer + * pq_getmsgint64 - get a binary 8-byte int from a message buffer + * pq_getmsgfloat4 - get a float4 from a message buffer + * pq_getmsgfloat8 - get a float8 from a message buffer + * pq_getmsgbytes - get raw data from a message buffer + * pq_copymsgbytes - copy raw data from a message buffer + * pq_getmsgtext - get a counted text string (with conversion) + * pq_getmsgstring - get a null-terminated text string (with conversion) + * pq_getmsgend - verify message fully consumed + * pq_getmsgunreadlen - get length of the unread data in the message buffer + */ + +#include <sys/param.h> +#include <netinet/in.h> +#include <arpa/inet.h> + +#include "gtm/gtm_c.h" +#include "gtm/libpq.h" +#include "gtm/pqformat.h" +#include "gtm/gtm.h" +#include "gtm/memutils.h" +#include "gtm/elog.h" + + +/* -------------------------------- + * pq_beginmessage - initialize for sending a message + * -------------------------------- + */ +void +pq_beginmessage(StringInfo buf, char msgtype) +{ + initStringInfo(buf); + + /* + * We stash the message type into the buffer's cursor field, expecting + * that the pq_sendXXX routines won't touch it. We could alternatively + * make it the first byte of the buffer contents, but this seems easier. 
+ */ + buf->cursor = msgtype; +} + +/* -------------------------------- + * pq_sendbyte - append a raw byte to a StringInfo buffer + * -------------------------------- + */ +void +pq_sendbyte(StringInfo buf, int byt) +{ + appendStringInfoCharMacro(buf, byt); +} + +/* -------------------------------- + * pq_sendbytes - append raw data to a StringInfo buffer + * -------------------------------- + */ +void +pq_sendbytes(StringInfo buf, const char *data, int datalen) +{ + appendBinaryStringInfo(buf, data, datalen); +} + +/* -------------------------------- + * pq_sendcountedtext - append a counted text string (with character set conversion) + * + * The data sent to the frontend by this routine is a 4-byte count field + * followed by the string. The count includes itself or not, as per the + * countincludesself flag (pre-3.0 protocol requires it to include itself). + * The passed text string need not be null-terminated, and the data sent + * to the frontend isn't either. + * -------------------------------- + */ +void +pq_sendcountedtext(StringInfo buf, const char *str, int slen, + bool countincludesself) +{ + int extra = countincludesself ? 4 : 0; + char *p; + + if (p != str) /* actual conversion has been done? */ + { + slen = strlen(p); + pq_sendint(buf, slen + extra, 4); + appendBinaryStringInfo(buf, p, slen); + pfree(p); + } + else + { + pq_sendint(buf, slen + extra, 4); + appendBinaryStringInfo(buf, str, slen); + } +} + +/* -------------------------------- + * pq_sendtext - append a text string (with conversion) + * + * The passed text string need not be null-terminated, and the data sent + * to the frontend isn't either. Note that this is not actually useful + * for direct frontend transmissions, since there'd be no way for the + * frontend to determine the string length. But it is useful for binary + * format conversions. 
+ * -------------------------------- + */ +void +pq_sendtext(StringInfo buf, const char *str, int slen) +{ + char *p; + + if (p != str) /* actual conversion has been done? */ + { + slen = strlen(p); + appendBinaryStringInfo(buf, p, slen); + pfree(p); + } + else + appendBinaryStringInfo(buf, str, slen); +} + +/* -------------------------------- + * pq_sendstring - append a null-terminated text string (with conversion) + * + * NB: passed text string must be null-terminated, and so is the data + * sent to the frontend. + * -------------------------------- + */ +void +pq_sendstring(StringInfo buf, const char *str) +{ + int slen = strlen(str); + appendBinaryStringInfo(buf, str, slen + 1); +} + +/* -------------------------------- + * pq_send_ascii_string - append a null-terminated text string (without conversion) + * + * This function intentionally bypasses encoding conversion, instead just + * silently replacing any non-7-bit-ASCII characters with question marks. + * It is used only when we are having trouble sending an error message to + * the client with normal localization and encoding conversion. The caller + * should already have taken measures to ensure the string is just ASCII; + * the extra work here is just to make certain we don't send a badly encoded + * string to the client (which might or might not be robust about that). + * + * NB: passed text string must be null-terminated, and so is the data + * sent to the frontend. 
+ * -------------------------------- + */ +void +pq_send_ascii_string(StringInfo buf, const char *str) +{ + while (*str) + { + char ch = *str++; + + if (IS_HIGHBIT_SET(ch)) + ch = '?'; + appendStringInfoCharMacro(buf, ch); + } + appendStringInfoChar(buf, '\0'); +} + +/* -------------------------------- + * pq_sendint - append a binary integer to a StringInfo buffer + * -------------------------------- + */ +void +pq_sendint(StringInfo buf, int i, int b) +{ + unsigned char n8; + uint16 n16; + uint32 n32; + + switch (b) + { + case 1: + n8 = (unsigned char) i; + appendBinaryStringInfo(buf, (char *) &n8, 1); + break; + case 2: + n16 = htons((uint16) i); + appendBinaryStringInfo(buf, (char *) &n16, 2); + break; + case 4: + n32 = htonl((uint32) i); + appendBinaryStringInfo(buf, (char *) &n32, 4); + break; + default: + elog(ERROR, "unsupported integer size %d", b); + break; + } +} + +/* -------------------------------- + * pq_sendint64 - append a binary 8-byte int to a StringInfo buffer + * + * It is tempting to merge this with pq_sendint, but we'd have to make the + * argument int64 for all data widths --- that could be a big performance + * hit on machines where int64 isn't efficient. + * -------------------------------- + */ +void +pq_sendint64(StringInfo buf, int64 i) +{ + uint32 n32; + + /* High order half first, since we're doing MSB-first */ +#ifdef INT64_IS_BUSTED + /* don't try a right shift of 32 on a 32-bit word */ + n32 = (i < 0) ? -1 : 0; +#else + n32 = (uint32) (i >> 32); +#endif + n32 = htonl(n32); + appendBinaryStringInfo(buf, (char *) &n32, 4); + + /* Now the low order half */ + n32 = (uint32) i; + n32 = htonl(n32); + appendBinaryStringInfo(buf, (char *) &n32, 4); +} + +/* -------------------------------- + * pq_sendfloat4 - append a float4 to a StringInfo buffer + * + * The point of this routine is to localize knowledge of the external binary + * representation of float4, which is a component of several datatypes. 
+ * + * We currently assume that float4 should be byte-swapped in the same way + * as int4. This rule is not perfect but it gives us portability across + * most IEEE-float-using architectures. + * -------------------------------- + */ +void +pq_sendfloat4(StringInfo buf, float4 f) +{ + union + { + float4 f; + uint32 i; + } swap; + + swap.f = f; + swap.i = htonl(swap.i); + + appendBinaryStringInfo(buf, (char *) &swap.i, 4); +} + +/* -------------------------------- + * pq_sendfloat8 - append a float8 to a StringInfo buffer + * + * The point of this routine is to localize knowledge of the external binary + * representation of float8, which is a component of several datatypes. + * + * We currently assume that float8 should be byte-swapped in the same way + * as int8. This rule is not perfect but it gives us portability across + * most IEEE-float-using architectures. + * -------------------------------- + */ +void +pq_sendfloat8(StringInfo buf, float8 f) +{ +#ifdef INT64_IS_BUSTED + union + { + float8 f; + uint32 h[2]; + } swap; + + swap.f = f; + swap.h[0] = htonl(swap.h[0]); + swap.h[1] = htonl(swap.h[1]); + +#ifdef WORDS_BIGENDIAN + /* machine seems to be big-endian, send h[0] first */ + appendBinaryStringInfo(buf, (char *) &swap.h[0], 4); + appendBinaryStringInfo(buf, (char *) &swap.h[1], 4); +#else + /* machine seems to be little-endian, send h[1] first */ + appendBinaryStringInfo(buf, (char *) &swap.h[1], 4); + appendBinaryStringInfo(buf, (char *) &swap.h[0], 4); +#endif +#else /* INT64 works */ + union + { + float8 f; + int64 i; + } swap; + + swap.f = f; + pq_sendint64(buf, swap.i); +#endif +} + +/* -------------------------------- + * pq_endmessage - send the completed message to the frontend + * + * The data buffer is pfree()d, but if the StringInfo was allocated with + * makeStringInfo then the caller must still pfree it. 
+ * -------------------------------- + */ +void +pq_endmessage(Port *myport, StringInfo buf) +{ + /* msgtype was saved in cursor field */ + (void) pq_putmessage(myport, buf->cursor, buf->data, buf->len); + /* no need to complain about any failure, since pqcomm.c already did */ + pfree(buf->data); + buf->data = NULL; +} + + +/* -------------------------------- + * pq_puttextmessage - generate a character set-converted message in one step + * + * This is the same as the pqcomm.c routine pq_putmessage, except that + * the message body is a null-terminated string to which encoding + * conversion applies. + * -------------------------------- + */ +void +pq_puttextmessage(Port *myport, char msgtype, const char *str) +{ + int slen = strlen(str); + (void) pq_putmessage(myport, msgtype, str, slen + 1); +} + + +/* -------------------------------- + * pq_putemptymessage - convenience routine for message with empty body + * -------------------------------- + */ +void +pq_putemptymessage(Port *myport, char msgtype) +{ + (void) pq_putmessage(myport, msgtype, NULL, 0); +} + + +/* -------------------------------- + * pq_getmsgbyte - get a raw byte from a message buffer + * -------------------------------- + */ +int +pq_getmsgbyte(StringInfo msg) +{ + if (msg->cursor >= msg->len) + ereport(ERROR, + (EPROTO, + errmsg("no data left in message"))); + return (unsigned char) msg->data[msg->cursor++]; +} + +/* -------------------------------- + * pq_getmsgint - get a binary integer from a message buffer + * + * Values are treated as unsigned. 
+ * -------------------------------- + */ +unsigned int +pq_getmsgint(StringInfo msg, int b) +{ + unsigned int result; + unsigned char n8; + uint16 n16; + uint32 n32; + + switch (b) + { + case 1: + pq_copymsgbytes(msg, (char *) &n8, 1); + result = n8; + break; + case 2: + pq_copymsgbytes(msg, (char *) &n16, 2); + result = ntohs(n16); + break; + case 4: + pq_copymsgbytes(msg, (char *) &n32, 4); + result = ntohl(n32); + break; + default: + elog(ERROR, "unsupported integer size %d", b); + result = 0; /* keep compiler quiet */ + break; + } + return result; +} + +/* -------------------------------- + * pq_getmsgint64 - get a binary 8-byte int from a message buffer + * + * It is tempting to merge this with pq_getmsgint, but we'd have to make the + * result int64 for all data widths --- that could be a big performance + * hit on machines where int64 isn't efficient. + * -------------------------------- + */ +int64 +pq_getmsgint64(StringInfo msg) +{ + int64 result; + uint32 h32; + uint32 l32; + + pq_copymsgbytes(msg, (char *) &h32, 4); + pq_copymsgbytes(msg, (char *) &l32, 4); + h32 = ntohl(h32); + l32 = ntohl(l32); + +#ifdef INT64_IS_BUSTED + /* error out if incoming value is wider than 32 bits */ + result = l32; + if ((result < 0) ? (h32 != -1) : (h32 != 0)) + ereport(ERROR, + (ERANGE, + errmsg("binary value is out of range for type bigint"))); +#else + result = h32; + result <<= 32; + result |= l32; +#endif + + return result; +} + +/* -------------------------------- + * pq_getmsgfloat4 - get a float4 from a message buffer + * + * See notes for pq_sendfloat4. + * -------------------------------- + */ +float4 +pq_getmsgfloat4(StringInfo msg) +{ + union + { + float4 f; + uint32 i; + } swap; + + swap.i = pq_getmsgint(msg, 4); + return swap.f; +} + +/* -------------------------------- + * pq_getmsgfloat8 - get a float8 from a message buffer + * + * See notes for pq_sendfloat8. 
+ * -------------------------------- + */ +float8 +pq_getmsgfloat8(StringInfo msg) +{ +#ifdef INT64_IS_BUSTED + union + { + float8 f; + uint32 h[2]; + } swap; + +#ifdef WORDS_BIGENDIAN + /* machine seems to be big-endian, receive h[0] first */ + swap.h[0] = pq_getmsgint(msg, 4); + swap.h[1] = pq_getmsgint(msg, 4); +#else + /* machine seems to be little-endian, receive h[1] first */ + swap.h[1] = pq_getmsgint(msg, 4); + swap.h[0] = pq_getmsgint(msg, 4); +#endif + return swap.f; +#else /* INT64 works */ + union + { + float8 f; + int64 i; + } swap; + + swap.i = pq_getmsgint64(msg); + return swap.f; +#endif +} + +/* -------------------------------- + * pq_getmsgbytes - get raw data from a message buffer + * + * Returns a pointer directly into the message buffer; note this + * may not have any particular alignment. + * -------------------------------- + */ +const char * +pq_getmsgbytes(StringInfo msg, int datalen) +{ + const char *result; + + if (datalen < 0 || datalen > (msg->len - msg->cursor)) + ereport(ERROR, + (EPROTO, + errmsg("insufficient data left in message"))); + result = &msg->data[msg->cursor]; + msg->cursor += datalen; + return result; +} + +/* -------------------------------- + * pq_copymsgbytes - copy raw data from a message buffer + * + * Same as above, except data is copied to caller's buffer. + * -------------------------------- + */ +void +pq_copymsgbytes(StringInfo msg, char *buf, int datalen) +{ + if (datalen < 0 || datalen > (msg->len - msg->cursor)) + ereport(ERROR, + (EPROTO, + errmsg("insufficient data left in message"))); + memcpy(buf, &msg->data[msg->cursor], datalen); + msg->cursor += datalen; +} + +/* -------------------------------- + * pq_getmsgtext - get a counted text string (with conversion) + * + * Always returns a pointer to a freshly palloc'd result. + * The result has a trailing null, *and* we return its strlen in *nbytes. 
+ * -------------------------------- + */ +char * +pq_getmsgtext(StringInfo msg, int rawbytes, int *nbytes) +{ + char *str; + char *p; + + if (rawbytes < 0 || rawbytes > (msg->len - msg->cursor)) + ereport(ERROR, + (EPROTO, + errmsg("insufficient data left in message"))); + str = &msg->data[msg->cursor]; + msg->cursor += rawbytes; + + p = (char *) palloc(rawbytes + 1); + memcpy(p, str, rawbytes); + p[rawbytes] = '\0'; + *nbytes = rawbytes; + return p; +} + +/* -------------------------------- + * pq_getmsgstring - get a null-terminated text string (with conversion) + * + * May return a pointer directly into the message buffer, or a pointer + * to a palloc'd conversion result. + * -------------------------------- + */ +const char * +pq_getmsgstring(StringInfo msg) +{ + char *str; + int slen; + + str = &msg->data[msg->cursor]; + + /* + * It's safe to use strlen() here because a StringInfo is guaranteed to + * have a trailing null byte. But check we found a null inside the + * message. + */ + slen = strlen(str); + if (msg->cursor + slen >= msg->len) + ereport(ERROR, + (EPROTO, + errmsg("invalid string in message"))); + msg->cursor += slen + 1; + + return str; +} + +/* -------------------------------- + * pq_getmsgend - verify message fully consumed + * -------------------------------- + */ +void +pq_getmsgend(StringInfo msg) +{ + if (msg->cursor != msg->len) + ereport(ERROR, + (EPROTO, + errmsg("invalid message format"))); +} + +/* -------------------------------- + * pq_getmsgunreadlen - get length of the unread data in the message + * buffer + * -------------------------------- + */ +int +pq_getmsgunreadlen(StringInfo msg) +{ + return msg->len - msg->cursor; +} diff --git a/src/gtm/libpq/pqsignal.c b/src/gtm/libpq/pqsignal.c new file mode 100644 index 0000000000..6bff3d4e14 --- /dev/null +++ b/src/gtm/libpq/pqsignal.c @@ -0,0 +1,181 @@ +/*------------------------------------------------------------------------- + * + * pqsignal.c + * reliable BSD-style signal(2) 
routine stolen from RWW who stole it + * from Stevens... + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/libpq/pqsignal.c,v 1.44 2008/01/01 19:45:49 momjian Exp $ + * + * NOTES + * This shouldn't be in libpq, but the monitor and some other + * things need it... + * + * A NOTE ABOUT SIGNAL HANDLING ACROSS THE VARIOUS PLATFORMS. + * + * pg_config.h defines the macro HAVE_POSIX_SIGNALS for some platforms and + * not for others. This file and pqsignal.h use that macro to decide + * how to handle signalling. + * + * signal(2) handling - this is here because it affects some of + * the frontend commands as well as the backend server. + * + * Ultrix and SunOS provide BSD signal(2) semantics by default. + * + * SVID2 and POSIX signal(2) semantics differ from BSD signal(2) + * semantics. We can use the POSIX sigaction(2) on systems that + * allow us to request restartable signals (SA_RESTART). + * + * Some systems don't allow restartable signals at all unless we + * link to a special BSD library. + * + * We devoutly hope that there aren't any systems that provide + * neither POSIX signals nor BSD signals. The alternative + * is to do signal-handler reinstallation, which doesn't work well + * at all. + * ------------------------------------------------------------------------*/ + +#include "gtm/gtm.h" + +#include <signal.h> + +#include "gtm/pqsignal.h" + + +#ifdef HAVE_SIGPROCMASK +sigset_t UnBlockSig, + BlockSig, + AuthBlockSig; +#else +int UnBlockSig, + BlockSig, + AuthBlockSig; +#endif + + +/* + * Initialize BlockSig, UnBlockSig, and AuthBlockSig. + * + * BlockSig is the set of signals to block when we are trying to block + * signals. 
This includes all signals we normally expect to get, but NOT + * signals that should never be turned off. + * + * AuthBlockSig is the set of signals to block during authentication; + * it's essentially BlockSig minus SIGTERM, SIGQUIT, SIGALRM. + * + * UnBlockSig is the set of signals to block when we don't want to block + * signals (is this ever nonzero??) + */ +void +pqinitmask(void) +{ +#ifdef HAVE_SIGPROCMASK + + sigemptyset(&UnBlockSig); + + /* First set all signals, then clear some. */ + sigfillset(&BlockSig); + sigfillset(&AuthBlockSig); + + /* + * Unmark those signals that should never be blocked. Some of these signal + * names don't exist on all platforms. Most do, but might as well ifdef + * them all for consistency... + */ +#ifdef SIGTRAP + sigdelset(&BlockSig, SIGTRAP); + sigdelset(&AuthBlockSig, SIGTRAP); +#endif +#ifdef SIGABRT + sigdelset(&BlockSig, SIGABRT); + sigdelset(&AuthBlockSig, SIGABRT); +#endif +#ifdef SIGILL + sigdelset(&BlockSig, SIGILL); + sigdelset(&AuthBlockSig, SIGILL); +#endif +#ifdef SIGFPE + sigdelset(&BlockSig, SIGFPE); + sigdelset(&AuthBlockSig, SIGFPE); +#endif +#ifdef SIGSEGV + sigdelset(&BlockSig, SIGSEGV); + sigdelset(&AuthBlockSig, SIGSEGV); +#endif +#ifdef SIGBUS + sigdelset(&BlockSig, SIGBUS); + sigdelset(&AuthBlockSig, SIGBUS); +#endif +#ifdef SIGSYS + sigdelset(&BlockSig, SIGSYS); + sigdelset(&AuthBlockSig, SIGSYS); +#endif +#ifdef SIGCONT + sigdelset(&BlockSig, SIGCONT); + sigdelset(&AuthBlockSig, SIGCONT); +#endif + +/* Signals unique to Auth */ +#ifdef SIGQUIT + sigdelset(&AuthBlockSig, SIGQUIT); +#endif +#ifdef SIGTERM + sigdelset(&AuthBlockSig, SIGTERM); +#endif +#ifdef SIGALRM + sigdelset(&AuthBlockSig, SIGALRM); +#endif +#else + /* Set the signals we want. 
*/ + UnBlockSig = 0; + BlockSig = sigmask(SIGQUIT) | + sigmask(SIGTERM) | sigmask(SIGALRM) | + /* common signals between two */ + sigmask(SIGHUP) | + sigmask(SIGINT) | sigmask(SIGUSR1) | + sigmask(SIGUSR2) | sigmask(SIGCHLD) | + sigmask(SIGWINCH) | sigmask(SIGFPE); + AuthBlockSig = sigmask(SIGHUP) | + sigmask(SIGINT) | sigmask(SIGUSR1) | + sigmask(SIGUSR2) | sigmask(SIGCHLD) | + sigmask(SIGWINCH) | sigmask(SIGFPE); +#endif +} + + +/* Win32 signal handling is in backend/port/win32/signal.c */ +#ifndef WIN32 + +/* + * Set up a signal handler + */ +pqsigfunc +pqsignal(int signo, pqsigfunc func) +{ +#if !defined(HAVE_POSIX_SIGNALS) + return signal(signo, func); +#else + struct sigaction act, + oact; + + act.sa_handler = func; + sigemptyset(&act.sa_mask); + act.sa_flags = 0; + if (signo != SIGALRM) + act.sa_flags |= SA_RESTART; +#ifdef SA_NOCLDSTOP + if (signo == SIGCHLD) + act.sa_flags |= SA_NOCLDSTOP; +#endif + if (sigaction(signo, &act, &oact) < 0) + return SIG_ERR; + return oact.sa_handler; +#endif /* !HAVE_POSIX_SIGNALS */ +} + +#endif /* WIN32 */ diff --git a/src/gtm/libpq/strlcpy.c b/src/gtm/libpq/strlcpy.c new file mode 100644 index 0000000000..ae031e244c --- /dev/null +++ b/src/gtm/libpq/strlcpy.c @@ -0,0 +1,72 @@ +/*------------------------------------------------------------------------- + * + * strlcpy.c + * strncpy done right + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/port/strlcpy.c,v 1.5 2008/01/01 19:46:00 momjian Exp $ + * + * This file was taken from OpenBSD and is used on platforms that don't + * provide strlcpy(). The OpenBSD copyright terms follow. + *------------------------------------------------------------------------- + */ + +/* $OpenBSD: strlcpy.c,v 1.11 2006/05/05 15:27:38 millert Exp $ */ + +/* + * Copyright (c) 1998 Todd C. 
 * Miller <Todd.Miller@courtesan.com>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include "gtm/gtm_c.h"


/*
 * Copy src to string dst of size siz. At most siz-1 characters
 * will be copied. Always NUL terminates (unless siz == 0).
 * Returns strlen(src); if retval >= siz, truncation occurred.
 * Function creation history: http://www.gratisoft.us/todd/papers/strlcpy.html
 */
size_t
strlcpy(char *dst, const char *src, size_t siz)
{
	char	   *d = dst;
	const char *s = src;
	size_t		n = siz;

	/* Copy as many bytes as will fit */
	if (n != 0)
	{
		/* pre-decrement keeps one byte of dst in reserve for the NUL */
		while (--n != 0)
		{
			/* stop right after the terminator has been copied */
			if ((*d++ = *s++) == '\0')
				break;
		}
	}

	/* Not enough room in dst, add NUL and traverse rest of src */
	if (n == 0)
	{
		if (siz != 0)
			*d = '\0';			/* NUL-terminate dst */
		/* walk to the end of src so its full length can be returned */
		while (*s++)
			;
	}

	/* s has been stepped one past the NUL on either path, hence the -1 */
	return (s - src - 1);		/* count does not include NUL */
}
diff --git a/src/gtm/main/Makefile b/src/gtm/main/Makefile
new file mode 100644
index 0000000000..7fcdf82a83
--- /dev/null
+++ b/src/gtm/main/Makefile
@@ -0,0 +1,22 @@
# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation

top_build_dir=../..
+include $(top_build_dir)/gtm/Makefile.global + +OBJS=main.o gtm_thread.o gtm_txn.o gtm_seq.o gtm_snap.o ../common/libgtm.a ../libpq/libpqcomm.a ../path/libgtmpath.a +LDFLAGS=-L$(top_build_dir)/common -L$(top_build_dir)/libpq + +LIBS=-lpthread + +gtm:$(OBJS) + $(CC) $(CFLAGS) $(LDFLAGS) $(LIBS) $^ -o gtm + +all:gtm + +clean: + rm -f $(OBJS) + rm -f gtm + +distclean: clean + +maintainer-clean: distclean diff --git a/src/gtm/main/gtm_seq.c b/src/gtm/main/gtm_seq.c new file mode 100644 index 0000000000..73af34efd6 --- /dev/null +++ b/src/gtm/main/gtm_seq.c @@ -0,0 +1,867 @@ +/*------------------------------------------------------------------------- + * + * gtm_seq.c + * Sequence handling on GTM + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * + * IDENTIFICATION + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ +#include "gtm/gtm_c.h" +#include "gtm/gtm.h" +#include "gtm/gtm_seq.h" +#include "gtm/assert.h" +#include "gtm/gtm_list.h" +#include "gtm/libpq.h" +#include "gtm/pqformat.h" +#include "gtm/gtm_msg.h" +#include <unistd.h> + +typedef struct GTM_SeqInfoHashBucket +{ + List *shb_list; + GTM_RWLock shb_lock; +} GTM_SeqInfoHashBucket; + +static int SeqStartMagic = 0xfafafafa; +static int SeqEndMagic = 0xfefefefe; + +#define SEQ_HASH_TABLE_SIZE 1024 +static GTM_SeqInfoHashBucket GTMSequences[SEQ_HASH_TABLE_SIZE]; + +static uint32 seq_gethash(GTM_SequenceKey key); +static bool seq_keys_equal(GTM_SequenceKey key1, GTM_SequenceKey key2); +static GTM_SeqInfo *seq_find_seqinfo(GTM_SequenceKey seqkey); +static int seq_release_seqinfo(GTM_SeqInfo *seqinfo); +static int seq_add_seqinfo(GTM_SeqInfo *seqinfo); +static int seq_remove_seqinfo(GTM_SeqInfo *seqinfo); +static GTM_SequenceKey seq_copy_key(GTM_SequenceKey key); + +/* + * Get the hash 
value given the sequence key + * + * XXX This should probably be replaced by a better hash function. + */ +static uint32 +seq_gethash(GTM_SequenceKey key) +{ + uint32 total = 0; + int ii; + + for (ii = 0; ii < key->gsk_keylen; ii++) + total += key->gsk_key[ii]; + return (total % SEQ_HASH_TABLE_SIZE); +} + +/* + * Return true if both keys are equal, else return false + */ +static bool +seq_keys_equal(GTM_SequenceKey key1, GTM_SequenceKey key2) +{ + Assert(key1); + Assert(key2); + + if (key1->gsk_keylen != key2->gsk_keylen) return false; + + return (memcmp(key1->gsk_key, key2->gsk_key, + Min(key1->gsk_keylen, key2->gsk_keylen)) == 0); +} + +/* + * Find the seqinfo structure for the given key. The reference count is + * incremented before structure is returned. The caller must release the + * reference to the structure when done with it + */ +static GTM_SeqInfo * +seq_find_seqinfo(GTM_SequenceKey seqkey) +{ + uint32 hash = seq_gethash(seqkey); + GTM_SeqInfoHashBucket *bucket; + ListCell *elem; + GTM_SeqInfo *curr_seqinfo = NULL; + + bucket = >MSequences[hash]; + + GTM_RWLockAcquire(&bucket->shb_lock, GTM_LOCKMODE_READ); + + foreach(elem, bucket->shb_list) + { + curr_seqinfo = (GTM_SeqInfo *) lfirst(elem); + if (seq_keys_equal(curr_seqinfo->gs_key, seqkey)) + break; + curr_seqinfo = NULL; + } + + if (curr_seqinfo != NULL) + { + GTM_RWLockAcquire(&curr_seqinfo->gs_lock, GTM_LOCKMODE_WRITE); + if (curr_seqinfo->gs_state != SEQ_STATE_ACTIVE) + { + elog(LOG, "Sequence not active"); + GTM_RWLockRelease(&curr_seqinfo->gs_lock); + return NULL; + } + Assert(curr_seqinfo->gs_ref_count != SEQ_MAX_REFCOUNT); + curr_seqinfo->gs_ref_count++; + GTM_RWLockRelease(&curr_seqinfo->gs_lock); + } + GTM_RWLockRelease(&bucket->shb_lock); + + return curr_seqinfo; +} + +/* + * Release previously grabbed reference to the structure. 
If the structure is + * marked for deletion, it will be removed from the global array and released + */ +static int +seq_release_seqinfo(GTM_SeqInfo *seqinfo) +{ + bool remove = false; + + GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE); + Assert(seqinfo->gs_ref_count > 0); + seqinfo->gs_ref_count--; + + if ((seqinfo->gs_state == SEQ_STATE_DELETED) && + (seqinfo->gs_ref_count == 0)) + remove = true; + + GTM_RWLockRelease(&seqinfo->gs_lock); + /* + * Remove the structure from the global hash table + */ + if (remove) seq_remove_seqinfo(seqinfo); + return 0; +} + +/* + * Add a seqinfo structure to the global hash table. + */ +static int +seq_add_seqinfo(GTM_SeqInfo *seqinfo) +{ + uint32 hash = seq_gethash(seqinfo->gs_key); + GTM_SeqInfoHashBucket *bucket; + ListCell *elem; + + bucket = >MSequences[hash]; + + GTM_RWLockAcquire(&bucket->shb_lock, GTM_LOCKMODE_WRITE); + + foreach(elem, bucket->shb_list) + { + GTM_SeqInfo *curr_seqinfo = NULL; + curr_seqinfo = (GTM_SeqInfo *) lfirst(elem); + + if (seq_keys_equal(curr_seqinfo->gs_key, seqinfo->gs_key)) + { + GTM_RWLockRelease(&bucket->shb_lock); + ereport(LOG, + (EEXIST, + errmsg("Sequence with the given key already exists"))); + return EEXIST; + } + } + + /* + * Safe to add the structure to the list + */ + bucket->shb_list = lappend(bucket->shb_list, seqinfo); + GTM_RWLockRelease(&bucket->shb_lock); + + return 0; +} + +/* + * Remove the seqinfo structure from the global hash table. If the structure is + * currently referenced by some other thread, just mark the structure for + * deletion and it will be deleted by the final reference is released. 
+ */ +static int +seq_remove_seqinfo(GTM_SeqInfo *seqinfo) +{ + uint32 hash = seq_gethash(seqinfo->gs_key); + GTM_SeqInfoHashBucket *bucket; + + bucket = >MSequences[hash]; + + GTM_RWLockAcquire(&bucket->shb_lock, GTM_LOCKMODE_WRITE); + GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE); + + if (seqinfo->gs_ref_count > 1) + { + seqinfo->gs_state = SEQ_STATE_DELETED; + GTM_RWLockRelease(&seqinfo->gs_lock); + GTM_RWLockRelease(&bucket->shb_lock); + return EBUSY; + } + + bucket->shb_list = list_delete(bucket->shb_list, seqinfo); + GTM_RWLockRelease(&seqinfo->gs_lock); + GTM_RWLockRelease(&bucket->shb_lock); + + return 0; +} + +static GTM_SequenceKey +seq_copy_key(GTM_SequenceKey key) +{ + GTM_SequenceKey retkey = NULL; + + /* + * We must use the TopMostMemoryContext because the sequence information is + * not bound to a thread and can outlive any of the thread specific + * contextes. + */ + retkey = (GTM_SequenceKey) MemoryContextAlloc(TopMostMemoryContext, + sizeof(GTM_SequenceKeyData) + + key->gsk_keylen); + + if (retkey == NULL) + ereport(ERROR, (ENOMEM, errmsg("Out of memory"))); + + retkey->gsk_keylen = key->gsk_keylen; + retkey->gsk_key = (char *)((char *)retkey + sizeof (GTM_SequenceKeyData)); + + memcpy(retkey->gsk_key, key->gsk_key, key->gsk_keylen); + return retkey; +} + +/* + * Initialize a new sequence. Optionally set the initial value of the sequence. + */ +int +GTM_SeqOpen(GTM_SequenceKey seqkey, + GTM_Sequence increment_by, + GTM_Sequence minval, + GTM_Sequence maxval, + GTM_Sequence startval, + bool cycle) +{ + GTM_SeqInfo *seqinfo = NULL; + int errcode = 0; + seqinfo = (GTM_SeqInfo *) palloc(sizeof (GTM_SeqInfo)); + + if (seqinfo == NULL) + ereport(ERROR, (ENOMEM, errmsg("Out of memory"))); + + GTM_RWLockInit(&seqinfo->gs_lock); + + seqinfo->gs_ref_count = 0; + seqinfo->gs_key = seq_copy_key(seqkey); + seqinfo->gs_state = SEQ_STATE_ACTIVE; + seqinfo->gs_called = false; + + /* + * Set the increment. 
Default is 1 + */ + if (SEQVAL_IS_VALID(increment_by)) + seqinfo->gs_increment_by = increment_by; + else + seqinfo->gs_increment_by = 1; + + /* + * If minval is specified, set the minvalue to the given minval, + * otherwise set to the defaults + */ + if (SEQVAL_IS_VALID(minval)) + seqinfo->gs_min_value = minval; + else if (SEQ_IS_ASCENDING(seqinfo)) + seqinfo->gs_min_value = SEQ_DEF_MIN_SEQVAL_ASCEND; + else + seqinfo->gs_min_value = SEQ_DEF_MIN_SEQVAL_DESCEND; + + /* + * If maxval is specfied, set the maxvalue to the given maxval, otherwise + * set to the defaults depending on whether the seqeunce is ascending or + * descending. Also do some basic contraint checks + */ + if (SEQVAL_IS_VALID(maxval)) + { + if (maxval < seqinfo->gs_min_value) + ereport(ERROR, + (ERANGE, + errmsg("Max value must be greater than min value"))); + seqinfo->gs_max_value = maxval; + } + else if (SEQ_IS_ASCENDING(seqinfo)) + seqinfo->gs_max_value = SEQ_DEF_MAX_SEQVAL_ASCEND; + else + seqinfo->gs_max_value = SEQ_DEF_MAX_SEQVAL_DESCEND; + + + /* + * Set the startval if specified. Do some basic checks like startval must + * be in-between min and max values + */ + if (SEQVAL_IS_VALID(startval)) + { + if (startval < seqinfo->gs_min_value) + ereport(ERROR, + (ERANGE, + errmsg("Start value must be greater than or equal to the min value"))); + + if (startval > seqinfo->gs_max_value) + ereport(ERROR, + (ERANGE, + errmsg("Start value must be less than or equal to the max value"))); + + seqinfo->gs_init_value = seqinfo->gs_value = startval; + } + else if (SEQ_IS_ASCENDING(seqinfo)) + seqinfo->gs_init_value = seqinfo->gs_value = SEQ_DEF_MIN_SEQVAL_ASCEND; + else + seqinfo->gs_init_value = seqinfo->gs_value = SEQ_DEF_MIN_SEQVAL_DESCEND; + + /* + * Should we wrap around ? + */ + seqinfo->gs_cycle = cycle; + + if ((errcode = seq_add_seqinfo(seqinfo))) + { + GTM_RWLockDestroy(&seqinfo->gs_lock); + pfree(seqinfo->gs_key); + pfree(seqinfo); + } + return errcode; +} + +/* + * Restore a sequence. 
+ */ +static int +GTM_SeqRestore(GTM_SequenceKey seqkey, + GTM_Sequence increment_by, + GTM_Sequence minval, + GTM_Sequence maxval, + GTM_Sequence startval, + GTM_Sequence curval, + int32 state, + bool cycle, + bool called) +{ + GTM_SeqInfo *seqinfo = NULL; + int errcode = 0; + seqinfo = (GTM_SeqInfo *) palloc(sizeof (GTM_SeqInfo)); + + if (seqinfo == NULL) + ereport(ERROR, (ENOMEM, errmsg("Out of memory"))); + + GTM_RWLockInit(&seqinfo->gs_lock); + + seqinfo->gs_ref_count = 0; + seqinfo->gs_key = seq_copy_key(seqkey); + seqinfo->gs_state = state; + seqinfo->gs_called = called; + + seqinfo->gs_increment_by = increment_by; + seqinfo->gs_min_value = minval; + seqinfo->gs_max_value = maxval; + + seqinfo->gs_init_value = startval; + seqinfo->gs_value = curval; + + /* + * Should we wrap around ? + */ + seqinfo->gs_cycle = cycle; + + if ((errcode = seq_add_seqinfo(seqinfo))) + { + GTM_RWLockDestroy(&seqinfo->gs_lock); + pfree(seqinfo->gs_key); + pfree(seqinfo); + } + return errcode; +} +/* + * Destroy the given sequence + */ +int +GTM_SeqClose(GTM_SequenceKey seqkey) +{ + GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey); + if (seqinfo != NULL) + { + seq_remove_seqinfo(seqinfo); + pfree(seqinfo->gs_key); + pfree(seqinfo); + return 0; + } + else + return EINVAL; +} + +/* + * Get current value for the sequence without incrementing it + */ +GTM_Sequence +GTM_SeqGetCurrent(GTM_SequenceKey seqkey) +{ + GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey); + GTM_Sequence value; + + if (seqinfo == NULL) + { + ereport(LOG, + (EINVAL, + errmsg("The sequence with the given key does not exist"))); + return EINVAL; + } + + GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE); + + /* + * If this is the first call to the sequence, set the value to the start + * value and mark the sequence as 'called' + */ + if (!SEQ_IS_CALLED(seqinfo)) + { + seqinfo->gs_value = seqinfo->gs_init_value; + seqinfo->gs_called = true; + } + value = seqinfo->gs_value; + GTM_RWLockRelease(&seqinfo->gs_lock); + 
seq_release_seqinfo(seqinfo); + return value; +} + +/* + * Get next vlaue for the sequence + */ +GTM_Sequence +GTM_SeqGetNext(GTM_SequenceKey seqkey) +{ + GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey); + GTM_Sequence value; + + if (seqinfo == NULL) + { + ereport(LOG, + (EINVAL, + errmsg("The sequence with the given key does not exist"))); + return EINVAL; + } + + GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE); + + /* + * If the sequence is called for the first time, initialize the value and + * return the start value + */ + if (!SEQ_IS_CALLED(seqinfo)) + { + value = seqinfo->gs_value = seqinfo->gs_init_value; + seqinfo->gs_called = true; + GTM_RWLockRelease(&seqinfo->gs_lock); + seq_release_seqinfo(seqinfo); + return value; + } + + if (SEQ_IS_ASCENDING(seqinfo)) + { + /* + * Check if the sequence is about to wrap-around. If the sequence does + * not support wrap-around, throw an error and return + * InvalidSequenceValue + */ + if (seqinfo->gs_max_value - seqinfo->gs_increment_by >= seqinfo->gs_value) + value = seqinfo->gs_value = seqinfo->gs_value + seqinfo->gs_increment_by; + else if (SEQ_IS_CYCLE(seqinfo)) + value = seqinfo->gs_value = seqinfo->gs_min_value; + else + { + GTM_RWLockRelease(&seqinfo->gs_lock); + seq_release_seqinfo(seqinfo); + ereport(LOG, + (ERANGE, + errmsg("Sequence reached maximum value"))); + return InvalidSequenceValue; + } + } + else + { + /* + * Check if the sequence is about to wrap-around. If the sequence does + * not support wrap-around, throw an error and return + * InvalidSequenceValue, otherwise wrap around the sequence and reset + * it to the max value. + * + * Note: The gs_increment_by is a signed integer and is negative for + * descending sequences. 
So we don't need special handling below + */ + if (seqinfo->gs_min_value - seqinfo->gs_increment_by <= seqinfo->gs_value) + value = seqinfo->gs_value = seqinfo->gs_value + seqinfo->gs_increment_by; + else if (SEQ_IS_CYCLE(seqinfo)) + value = seqinfo->gs_value = seqinfo->gs_max_value; + else + { + GTM_RWLockRelease(&seqinfo->gs_lock); + seq_release_seqinfo(seqinfo); + ereport(LOG, + (ERANGE, + errmsg("Sequence reached minimum value"))); + return InvalidSequenceValue; + } + + } + GTM_RWLockRelease(&seqinfo->gs_lock); + seq_release_seqinfo(seqinfo); + return value; +} + +/* + * Reset the sequence + */ +int +GTM_SeqReset(GTM_SequenceKey seqkey) +{ + GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey); + + if (seqinfo == NULL) + { + ereport(LOG, + (EINVAL, + errmsg("The sequence with the given key does not exist"))); + return EINVAL; + } + + GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE); + seqinfo->gs_value = seqinfo->gs_init_value; + GTM_RWLockRelease(&seqinfo->gs_lock); + + seq_release_seqinfo(seqinfo); + return 0; +} + +void +GTM_InitSeqManager(void) +{ + int ii; + + for (ii = 0; ii < SEQ_HASH_TABLE_SIZE; ii++) + { + GTMSequences[ii].shb_list = NIL; + GTM_RWLockInit(>MSequences[ii].shb_lock); + } +} + +/* + * Process MSG_SEQUENCE_INIT message + */ +void +ProcessSequenceInitCommand(Port *myport, StringInfo message) +{ + GTM_SequenceKeyData seqkey; + GTM_Sequence increment, minval, maxval, startval; + bool cycle; + StringInfoData buf; + int errcode; + MemoryContext oldContext; + + /* + * Get the sequence key + */ + seqkey.gsk_keylen = pq_getmsgint(message, sizeof (seqkey.gsk_keylen)); + seqkey.gsk_key = (char *)pq_getmsgbytes(message, seqkey.gsk_keylen); + + /* + * Read various sequence parameters + */ + memcpy(&increment, pq_getmsgbytes(message, sizeof (GTM_Sequence)), + sizeof (GTM_Sequence)); + memcpy(&minval, pq_getmsgbytes(message, sizeof (GTM_Sequence)), + sizeof (GTM_Sequence)); + memcpy(&maxval, pq_getmsgbytes(message, sizeof (GTM_Sequence)), + sizeof 
(GTM_Sequence)); + memcpy(&startval, pq_getmsgbytes(message, sizeof (GTM_Sequence)), + sizeof (GTM_Sequence)); + + cycle = pq_getmsgbyte(message); + + + /* + * We must use the TopMostMemoryContext because the sequence information is + * not bound to a thread and can outlive any of the thread specific + * contextes. + */ + oldContext = MemoryContextSwitchTo(TopMostMemoryContext); + + if (GTM_SeqOpen(&seqkey, increment, minval, maxval, startval, cycle)) + ereport(ERROR, + (errcode, + errmsg("Failed to open a new sequence"))); + + MemoryContextSwitchTo(oldContext); + + pq_getmsgend(message); + + /* + * Send a SUCCESS message back to the client + */ + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, SEQUENCE_INIT_RESULT, 4); + if (myport->is_proxy) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_sendint(&buf, seqkey.gsk_keylen, 4); + pq_sendbytes(&buf, seqkey.gsk_key, seqkey.gsk_keylen); + pq_endmessage(myport, &buf); + + if (!myport->is_proxy) + pq_flush(myport); +} + +/* + * Process MSG_SEQUENCE_GET_CURRENT message + */ +void +ProcessSequenceGetCurrentCommand(Port *myport, StringInfo message) +{ + GTM_SequenceKeyData seqkey; + StringInfoData buf; + GTM_Sequence seqval; + + seqkey.gsk_keylen = pq_getmsgint(message, sizeof (seqkey.gsk_keylen)); + seqkey.gsk_key = (char *)pq_getmsgbytes(message, seqkey.gsk_keylen); + + seqval = GTM_SeqGetCurrent(&seqkey); + if (!SEQVAL_IS_VALID(seqval)) + ereport(ERROR, + (ERANGE, + errmsg("Can not get current value of the sequence"))); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, SEQUENCE_GET_CURRENT_RESULT, 4); + if (myport->is_proxy) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_sendint(&buf, seqkey.gsk_keylen, 4); + pq_sendbytes(&buf, seqkey.gsk_key, seqkey.gsk_keylen); + pq_sendbytes(&buf, (char *)&seqval, sizeof 
(GTM_Sequence)); + pq_endmessage(myport, &buf); + + if (!myport->is_proxy) + pq_flush(myport); +} + +/* + * Process MSG_SEQUENCE_GET_NEXT message + */ +void +ProcessSequenceGetNextCommand(Port *myport, StringInfo message) +{ + GTM_SequenceKeyData seqkey; + StringInfoData buf; + GTM_Sequence seqval; + + seqkey.gsk_keylen = pq_getmsgint(message, sizeof (seqkey.gsk_keylen)); + seqkey.gsk_key = (char *)pq_getmsgbytes(message, seqkey.gsk_keylen); + + seqval = GTM_SeqGetNext(&seqkey); + if (!SEQVAL_IS_VALID(seqval)) + ereport(ERROR, + (ERANGE, + errmsg("Can not get current value of the sequence"))); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, SEQUENCE_GET_NEXT_RESULT, 4); + if (myport->is_proxy) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_sendint(&buf, seqkey.gsk_keylen, 4); + pq_sendbytes(&buf, seqkey.gsk_key, seqkey.gsk_keylen); + pq_sendbytes(&buf, (char *)&seqval, sizeof (GTM_Sequence)); + pq_endmessage(myport, &buf); + + if (!myport->is_proxy) + pq_flush(myport); +} + +/* + * Process MSG_SEQUENCE_RESET message + */ +void +ProcessSequenceResetCommand(Port *myport, StringInfo message) +{ + GTM_SequenceKeyData seqkey; + StringInfoData buf; + int errcode; + + seqkey.gsk_keylen = pq_getmsgint(message, sizeof (seqkey.gsk_keylen)); + seqkey.gsk_key = (char *)pq_getmsgbytes(message, seqkey.gsk_keylen); + + if ((errcode = GTM_SeqReset(&seqkey))) + ereport(ERROR, + (errcode, + errmsg("Can not reset the sequence"))); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, SEQUENCE_RESET_RESULT, 4); + if (myport->is_proxy) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_sendint(&buf, seqkey.gsk_keylen, 4); + pq_sendbytes(&buf, seqkey.gsk_key, seqkey.gsk_keylen); + pq_endmessage(myport, &buf); + + if (!myport->is_proxy) + pq_flush(myport); +} + +/* + * Process 
MSG_SEQUENCE_CLOSE message + */ +void +ProcessSequenceCloseCommand(Port *myport, StringInfo message) +{ + GTM_SequenceKeyData seqkey; + StringInfoData buf; + int errcode; + + seqkey.gsk_keylen = pq_getmsgint(message, sizeof (seqkey.gsk_keylen)); + seqkey.gsk_key = (char *)pq_getmsgbytes(message, seqkey.gsk_keylen); + + if ((errcode = GTM_SeqClose(&seqkey))) + ereport(ERROR, + (errcode, + errmsg("Can not close the sequence"))); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, SEQUENCE_CLOSE_RESULT, 4); + if (myport->is_proxy) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_sendint(&buf, seqkey.gsk_keylen, 4); + pq_sendbytes(&buf, seqkey.gsk_key, seqkey.gsk_keylen); + pq_endmessage(myport, &buf); + + if (!myport->is_proxy) + pq_flush(myport); +} + +void +GTM_SaveSeqInfo(int ctlfd) +{ + GTM_SeqInfoHashBucket *bucket; + ListCell *elem; + GTM_SeqInfo *seqinfo = NULL; + int hash; + + for (hash = 0; hash < SEQ_HASH_TABLE_SIZE; hash++) + { + bucket = >MSequences[hash]; + + GTM_RWLockAcquire(&bucket->shb_lock, GTM_LOCKMODE_READ); + + foreach(elem, bucket->shb_list) + { + seqinfo = (GTM_SeqInfo *) lfirst(elem); + if (seqinfo == NULL) + break; + + if (seqinfo->gs_state == SEQ_STATE_DELETED) + continue; + + GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_READ); + + write(ctlfd, &SeqStartMagic, sizeof (SeqStartMagic)); + write(ctlfd, &seqinfo->gs_key->gsk_keylen, sizeof (uint32)); + write(ctlfd, seqinfo->gs_key->gsk_key, seqinfo->gs_key->gsk_keylen); + write(ctlfd, &seqinfo->gs_value, sizeof (GTM_Sequence)); + write(ctlfd, &seqinfo->gs_init_value, sizeof (GTM_Sequence)); + write(ctlfd, &seqinfo->gs_increment_by, sizeof (GTM_Sequence)); + write(ctlfd, &seqinfo->gs_min_value, sizeof (GTM_Sequence)); + write(ctlfd, &seqinfo->gs_max_value, sizeof (GTM_Sequence)); + write(ctlfd, &seqinfo->gs_cycle, sizeof (bool)); + write(ctlfd, &seqinfo->gs_called, sizeof (bool)); + 
write(ctlfd, &seqinfo->gs_state, sizeof (int32)); + write(ctlfd, &SeqEndMagic, sizeof(SeqEndMagic)); + + GTM_RWLockRelease(&seqinfo->gs_lock); + } + + GTM_RWLockRelease(&bucket->shb_lock); + } + +} + +void +GTM_RestoreSeqInfo(int ctlfd) +{ + int magic; + + if (ctlfd == -1) + return; + + while (read(ctlfd, &magic, sizeof (SeqStartMagic)) == sizeof (SeqStartMagic)) + { + GTM_SequenceKeyData seqkey; + GTM_Sequence increment_by; + GTM_Sequence minval; + GTM_Sequence maxval; + GTM_Sequence startval; + GTM_Sequence curval; + int32 state; + bool cycle; + bool called; + + if (magic != SeqStartMagic) + { + elog(LOG, "Start magic mismatch %x - %x", magic, SeqStartMagic); + break; + } + + if (read(ctlfd, &seqkey.gsk_keylen, sizeof (uint32)) != sizeof (uint32)) + { + elog(LOG, "Failed to read keylen"); + break; + } + + seqkey.gsk_key = palloc(seqkey.gsk_keylen); + read(ctlfd, seqkey.gsk_key, seqkey.gsk_keylen); + + read(ctlfd, &curval, sizeof (GTM_Sequence)); + read(ctlfd, &startval, sizeof (GTM_Sequence)); + read(ctlfd, &increment_by, sizeof (GTM_Sequence)); + read(ctlfd, &minval, sizeof (GTM_Sequence)); + read(ctlfd, &maxval, sizeof (GTM_Sequence)); + read(ctlfd, &cycle, sizeof (bool)); + read(ctlfd, &called, sizeof (bool)); + read(ctlfd, &state, sizeof (int32)); + read(ctlfd, &magic, sizeof(SeqEndMagic)); + + if (magic != SeqEndMagic) + { + elog(WARNING, "Corrupted control file"); + return; + } + + GTM_SeqRestore(&seqkey, increment_by, minval, maxval, startval, curval, + state, cycle, called); + } +} diff --git a/src/gtm/main/gtm_snap.c b/src/gtm/main/gtm_snap.c new file mode 100644 index 0000000000..5c9b4b2ae5 --- /dev/null +++ b/src/gtm/main/gtm_snap.c @@ -0,0 +1,466 @@ +/*------------------------------------------------------------------------- + * + * gtm_snap.c + * Snapshot handling on GTM + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 
Nippon Telegraph and Telephone Corporation + * + * + * IDENTIFICATION + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ +#include "gtm/gtm_c.h" +#include "gtm/elog.h" +#include "gtm/palloc.h" +#include "gtm/gtm.h" +#include "gtm/gtm_txn.h" +#include "gtm/assert.h" +#include "gtm/stringinfo.h" +#include "gtm/libpq.h" +#include "gtm/pqformat.h" +#include "gtm/gtm_msg.h" + + +/* + * Get snapshot for the given transactions. If this is the first call in the + * transaction, a fresh snapshot is taken and returned back. For a serializable + * transaction, repeated calls to the function will return the same snapshot. + * For a read-committed transaction, fresh snapshot is taken every time and + * returned to the caller. + * + * The returned snapshot includes xmin (lowest still-running xact ID), + * xmax (highest completed xact ID + 1), and a list of running xact IDs + * in the range xmin <= xid < xmax. It is used as follows: + * All xact IDs < xmin are considered finished. + * All xact IDs >= xmax are considered still running. + * For an xact ID xmin <= xid < xmax, consult list to see whether + * it is considered running or not. + * This ensures that the set of transactions seen as "running" by the + * current xact will not change after it takes the snapshot. + * + * All running top-level XIDs are included in the snapshot. + * + * We also update the following global variables: + * RecentGlobalXmin: the global xmin (oldest TransactionXmin across all + * running transactions + * + * Note: this function should probably not be called with an argument that's + * not statically allocated (see xip allocation below). 
+ */ +GTM_Snapshot +GTM_GetTransactionSnapshot(GTM_TransactionHandle handle[], int txn_count, int *status) +{ + GlobalTransactionId xmin; + GlobalTransactionId xmax; + GlobalTransactionId globalxmin; + int count = 0; + ListCell *elem = NULL; + int ii; + + /* + * Instead of allocating memory for a snapshot, we use the snapshot of the + * first transaction in the given array. The same snapshot will later be + * copied to other transaction info structures. + */ + GTM_TransactionInfo *mygtm_txninfo = NULL; + GTM_Snapshot snapshot = NULL; + + memset(status, 0, sizeof (int) * txn_count); + + for (ii = 0; ii < txn_count; ii++) + { + mygtm_txninfo = GTM_HandleToTransactionInfo(handle[ii]); + + /* + * If the transaction does not exist, just mark the status field with + * a STATUS_ERROR code + */ + if (mygtm_txninfo == NULL) + status[ii] = STATUS_ERROR; + else if (snapshot == NULL) + snapshot = &mygtm_txninfo->gti_current_snapshot; + } + + /* + * If no valid transaction exists in the array, send an error message back. + * Otherwise, we should still get the snapshot and send it back. The + * invalid transaction ids are marked separately in the status array. + */ + if (snapshot == NULL) + return NULL; + + Assert(snapshot != NULL); + + if (snapshot->sn_xip == NULL) + { + /* + * First call for this snapshot + */ + snapshot->sn_xip = (GlobalTransactionId *) + palloc(GTM_MAX_GLOBAL_TRANSACTIONS * sizeof(GlobalTransactionId)); + if (snapshot->sn_xip == NULL) + ereport(ERROR, + (ENOMEM, + errmsg("out of memory"))); + } + + /* + * It is sufficient to get shared lock on ProcArrayLock, even if we are + * going to set MyProc->xmin. 
+ */ + GTM_RWLockAcquire(>MTransactions.gt_TransArrayLock, GTM_LOCKMODE_READ); + + /* xmax is always latestCompletedXid + 1 */ + xmax = GTMTransactions.gt_latestCompletedXid; + Assert(GlobalTransactionIdIsNormal(xmax)); + GlobalTransactionIdAdvance(xmax); + + /* initialize xmin calculation with xmax */ + globalxmin = xmin = xmax; + + /* + * Spin over transaction list checking xid, xmin, and subxids. The goal is to + * gather all active xids and find the lowest xmin + */ + foreach(elem, GTMTransactions.gt_open_transactions) + { + volatile GTM_TransactionInfo *gtm_txninfo = (GTM_TransactionInfo *)lfirst(elem); + GlobalTransactionId xid; + + /* Don't take into account LAZY VACUUMs */ + if (gtm_txninfo->gti_vacuum) + continue; + + /* Update globalxmin to be the smallest valid xmin */ + xid = gtm_txninfo->gti_xmin; /* fetch just once */ + if (GlobalTransactionIdIsNormal(xid) && + GlobalTransactionIdPrecedes(xid, globalxmin)) + globalxmin = xid; + + /* Fetch xid just once - see GetNewTransactionId */ + xid = gtm_txninfo->gti_gxid; + + /* + * If the transaction has been assigned an xid < xmax we add it to the + * snapshot, and update xmin if necessary. There's no need to store + * XIDs >= xmax, since we'll treat them as running anyway. We don't + * bother to examine their subxids either. + * + * We don't include our own XID (if any) in the snapshot, but we must + * include it into xmin. + */ + if (GlobalTransactionIdIsNormal(xid)) + { + /* + * Unlike Postgres, we include the GXID of the current transaction + * as well in the snapshot. This is necessary because the same + * snapshot is shared by multiple backends through GTM proxy and + * the GXID will vary for each backend. 
+ * + * XXX We should confirm that this does not have any adverse effect + * on the MVCC visibility and check if any changes are related to + * the MVCC checks because of the change + */ + if (GlobalTransactionIdFollowsOrEquals(xid, xmax)) + continue; + if (GlobalTransactionIdPrecedes(xid, xmin)) + xmin = xid; + snapshot->sn_xip[count++] = xid; + } + } + + /* + * Update globalxmin to include actual process xids. This is a slightly + * different way of computing it than GetOldestXmin uses, but should give + * the same result. + */ + if (GlobalTransactionIdPrecedes(xmin, globalxmin)) + globalxmin = xmin; + + GTMTransactions.gt_recent_global_xmin = globalxmin; + + snapshot->sn_xmin = xmin; + snapshot->sn_xmax = xmax; + snapshot->sn_xcnt = count; + snapshot->sn_recent_global_xmin = globalxmin; + + /* + * Now, before the proc array lock is released, set the xmin in the txninfo + * structures of all the transactions. + */ + for (ii = 0; ii < txn_count; ii++) + { + GTM_Snapshot mysnap = NULL; + + /* + * We have already gone through all the transaction handles above and + * marked the invalid handles with STATUS_ERROR + */ + if (status[ii] == STATUS_ERROR) + continue; + + mygtm_txninfo = GTM_HandleToTransactionInfo(handle[ii]); + mysnap = &mygtm_txninfo->gti_current_snapshot; + + if (GTM_IsTransSerializable(mygtm_txninfo)) + { + if ((mygtm_txninfo->gti_snapshot_set) && (txn_count > 1)) + elog(ERROR, "Grouped snapshot can only include first snapshot in Serializable transaction"); + + if (!mygtm_txninfo->gti_snapshot_set) + { + /* + * For the first transaction in the array, the snapshot is + * already set. 
+ */ + if (snapshot != mysnap) + { + if (mysnap->sn_xip == NULL) + { + /* + * First call for this snapshot + */ + mysnap->sn_xip = (GlobalTransactionId *) + palloc(GTM_MAX_GLOBAL_TRANSACTIONS * sizeof(GlobalTransactionId)); + if (mysnap->sn_xip == NULL) + ereport(ERROR, (ENOMEM, errmsg("out of memory"))); + } + mysnap->sn_xmin = snapshot->sn_xmin; + mysnap->sn_xmax = snapshot->sn_xmax; + mysnap->sn_xcnt = snapshot->sn_xcnt; + mysnap->sn_recent_global_xmin = snapshot->sn_recent_global_xmin; + memcpy(mysnap->sn_xip, snapshot->sn_xip, + sizeof (GlobalTransactionId) * snapshot->sn_xcnt); + } + mygtm_txninfo->gti_snapshot_set = true; + } + } + else if (snapshot != mysnap) + { + if (mysnap->sn_xip == NULL) + { + /* + * First call for this snapshot + */ + mysnap->sn_xip = (GlobalTransactionId *) + palloc(GTM_MAX_GLOBAL_TRANSACTIONS * sizeof(GlobalTransactionId)); + if (mysnap->sn_xip == NULL) + ereport(ERROR, (ENOMEM, errmsg("out of memory"))); + } + mysnap->sn_xmin = snapshot->sn_xmin; + mysnap->sn_xmax = snapshot->sn_xmax; + mysnap->sn_xcnt = snapshot->sn_xcnt; + mysnap->sn_recent_global_xmin = snapshot->sn_recent_global_xmin; + memcpy(mysnap->sn_xip, snapshot->sn_xip, + sizeof (GlobalTransactionId) * snapshot->sn_xcnt); + } + + if ((mygtm_txninfo != NULL) && + (!GlobalTransactionIdIsValid(mygtm_txninfo->gti_xmin))) + mygtm_txninfo->gti_xmin = xmin; + } + + GTM_RWLockRelease(>MTransactions.gt_TransArrayLock); + + elog(DEBUG1, "GTM_GetTransactionSnapshot: (%u:%u:%u:%u)", + snapshot->sn_xmin, snapshot->sn_xmax, + snapshot->sn_xcnt, snapshot->sn_recent_global_xmin); + return snapshot; +} + +/* + * Process MSG_SNAPSHOT_GET command + */ +void +ProcessGetSnapshotCommand(Port *myport, StringInfo message, bool get_gxid) +{ + StringInfoData buf; + GTM_TransactionHandle txn; + GlobalTransactionId gxid; + int isgxid = 0; + GTM_Snapshot snapshot; + MemoryContext oldContext; + bool canbe_grouped; + int status; + int txn_count = 1; + + /* + * This is used by the GTM proxy to decide 
whether to group this snapshot + * request with some other snapshot request from some other backend. + * + * This is mostly useless for the GTM server. + */ + canbe_grouped = pq_getmsgbyte(message); + + isgxid = pq_getmsgbyte(message); + + if (isgxid) + { + const char *data = NULL; + Assert(!get_gxid); + data = pq_getmsgbytes(message, sizeof (gxid)); + if (data == NULL) + ereport(ERROR, + (EPROTO, + errmsg("Message does not contain valid GXID"))); + memcpy(&gxid, data, sizeof (gxid)); + txn = GTM_GXIDToHandle(gxid); + } + else + { + const char *data = pq_getmsgbytes(message, sizeof (txn)); + if (data == NULL) + ereport(ERROR, + (EPROTO, + errmsg("Message does not contain valid Transaction Handle"))); + memcpy(&txn, data, sizeof (txn)); + } + pq_getmsgend(message); + + if (get_gxid) + { + Assert(!isgxid); + gxid = GTM_GetGlobalTransactionId(txn); + if (gxid == InvalidGlobalTransactionId) + ereport(ERROR, + (EINVAL, + errmsg("Failed to get a new transaction id"))); + } + + oldContext = MemoryContextSwitchTo(TopMostMemoryContext); + + /* + * Get a fresh snapshot + */ + if ((snapshot = GTM_GetTransactionSnapshot(&txn, 1, &status)) == NULL) + ereport(ERROR, + (EINVAL, + errmsg("Failed to get a snapshot"))); + + MemoryContextSwitchTo(oldContext); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, get_gxid ? 
SNAPSHOT_GXID_GET_RESULT : SNAPSHOT_GET_RESULT, 4); + if (myport->is_proxy) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_sendbytes(&buf, (char *)&gxid, sizeof (GlobalTransactionId)); + pq_sendbytes(&buf, (char *)&txn_count, sizeof(txn_count)); + pq_sendbytes(&buf, (char *)&status, sizeof(int) * txn_count); + pq_sendbytes(&buf, (char *)&snapshot->sn_xmin, sizeof (GlobalTransactionId)); + pq_sendbytes(&buf, (char *)&snapshot->sn_xmax, sizeof (GlobalTransactionId)); + pq_sendbytes(&buf, (char *)&snapshot->sn_recent_global_xmin, sizeof (GlobalTransactionId)); + pq_sendint(&buf, snapshot->sn_xcnt, sizeof (int)); + pq_sendbytes(&buf, (char *)snapshot->sn_xip, + sizeof(GlobalTransactionId) * snapshot->sn_xcnt); + pq_endmessage(myport, &buf); + + if (!myport->is_proxy) + pq_flush(myport); + + return; +} + +/* + * Process MSG_SNAPSHOT_GET_MULTI command + */ +void +ProcessGetSnapshotCommandMulti(Port *myport, StringInfo message) +{ + StringInfoData buf; + GTM_TransactionHandle txn[GTM_MAX_GLOBAL_TRANSACTIONS]; + GlobalTransactionId gxid[GTM_MAX_GLOBAL_TRANSACTIONS]; + int isgxid[GTM_MAX_GLOBAL_TRANSACTIONS]; + GTM_Snapshot snapshot; + MemoryContext oldContext; + int txn_count; + int ii; + int status[GTM_MAX_GLOBAL_TRANSACTIONS]; + + txn_count = pq_getmsgint(message, sizeof (int)); + + for (ii = 0; ii < txn_count; ii++) + { + isgxid[ii] = pq_getmsgbyte(message); + if (isgxid[ii]) + { + const char *data = pq_getmsgbytes(message, sizeof (gxid[ii])); + if (data == NULL) + ereport(ERROR, + (EPROTO, + errmsg("Message does not contain valid GXID"))); + memcpy(&gxid[ii], data, sizeof (gxid[ii])); + txn[ii] = GTM_GXIDToHandle(gxid[ii]); + } + else + { + const char *data = pq_getmsgbytes(message, sizeof (txn[ii])); + if (data == NULL) + ereport(ERROR, + (EPROTO, + errmsg("Message does not contain valid Transaction Handle"))); + memcpy(&txn[ii], data, sizeof (txn[ii])); + } + } + 
+ pq_getmsgend(message); + + oldContext = MemoryContextSwitchTo(TopMostMemoryContext); + + /* + * Get a fresh snapshot + */ + if ((snapshot = GTM_GetTransactionSnapshot(txn, txn_count, status)) == NULL) + ereport(ERROR, + (EINVAL, + errmsg("Failed to get a snapshot"))); + + MemoryContextSwitchTo(oldContext); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, SNAPSHOT_GET_MULTI_RESULT, 4); + if (myport->is_proxy) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_sendbytes(&buf, (char *)&txn_count, sizeof(txn_count)); + pq_sendbytes(&buf, (char *)status, sizeof(int) * txn_count); + pq_sendbytes(&buf, (char *)&snapshot->sn_xmin, sizeof (GlobalTransactionId)); + pq_sendbytes(&buf, (char *)&snapshot->sn_xmax, sizeof (GlobalTransactionId)); + pq_sendbytes(&buf, (char *)&snapshot->sn_recent_global_xmin, sizeof (GlobalTransactionId)); + pq_sendint(&buf, snapshot->sn_xcnt, sizeof (int)); + pq_sendbytes(&buf, (char *)snapshot->sn_xip, + sizeof(GlobalTransactionId) * snapshot->sn_xcnt); + pq_endmessage(myport, &buf); + + if (!myport->is_proxy) + pq_flush(myport); + + return; +} + +/* + * Free the snapshot data. 
The snapshot itself is not freed though + */ +void +GTM_FreeSnapshotData(GTM_Snapshot snapshot) +{ + if (snapshot == NULL) + return; + + if (snapshot->sn_xip != NULL) + { + Assert(snapshot->sn_xcnt); + pfree(snapshot->sn_xip); + snapshot->sn_xip = NULL; + } +} diff --git a/src/gtm/main/gtm_stat.c b/src/gtm/main/gtm_stat.c new file mode 100644 index 0000000000..fac6b64c24 --- /dev/null +++ b/src/gtm/main/gtm_stat.c @@ -0,0 +1,37 @@ +/*------------------------------------------------------------------------- + * + * gtm_stat.c + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * + * IDENTIFICATION + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ +#include "gtm/gtm_c.h" +#include "gtm/gtm.h" + +uint32 GTM_Message_Stats[MSG_MAX_MESSAGE_TYPE]; +uint32 GTM_Result_Stats[GTM_MAX_RESULT_TYPE]; + +void +gtm_msgstat_increment(int type) +{ + GTM_Message_Stats[type]++; +} + +void +gtm_resultstat_increment(int type) +{ + GTM_Result_Stats[type]++; +} + +void +gtm_print_stats(void) +{ + +} diff --git a/src/gtm/main/gtm_stats.c b/src/gtm/main/gtm_stats.c new file mode 100644 index 0000000000..aba1a219fb --- /dev/null +++ b/src/gtm/main/gtm_stats.c @@ -0,0 +1,23 @@ +/*------------------------------------------------------------------------- + * + * gtm_stats.c + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * + * IDENTIFICATION + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ +typedef struct GTM_Stats +{ + int GTM_RecvMessages[GTM_MAX_MESSAGE_TYPE]; + int GTM_SentMessages[GTM_MAX_MESSAGE_TYPE]; + float GTM_RecvBytes; + 
float GTM_SentBytes; +} GTM_Stats; + + diff --git a/src/gtm/main/gtm_thread.c b/src/gtm/main/gtm_thread.c new file mode 100644 index 0000000000..61ea640ab5 --- /dev/null +++ b/src/gtm/main/gtm_thread.c @@ -0,0 +1,336 @@ +/*------------------------------------------------------------------------- + * + * gtm_thread.c + * Thread handling + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * + * IDENTIFICATION + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ +#include <pthread.h> +#include "gtm/gtm.h" +#include "gtm/memutils.h" +#include "gtm/gtm_txn.h" +#include "gtm/libpq.h" + +static void *GTM_ThreadMainWrapper(void *argp); +static void GTM_ThreadCleanup(void *argp); + +GTM_Threads GTMThreadsData; +GTM_Threads *GTMThreads = >MThreadsData; + +#define GTM_MIN_THREADS 32 /* Provision for minimum threads */ +#define GTM_MAX_THREADS 1024 /* Max threads allowed in the GTM */ +#define GTMThreadsFull (GTMThreads->gt_thread_count == GTMThreads->gt_array_size) + +/* + * Add the given thrinfo structure to the global array, expanding it if + * necessary + */ +int +GTM_ThreadAdd(GTM_ThreadInfo *thrinfo) +{ + int ii; + + GTM_RWLockAcquire(>MThreads->gt_lock, GTM_LOCKMODE_WRITE); + + if (GTMThreadsFull) + { + uint32 newsize; + + /* + * TODO Optimize lock management by not holding any locks during memory + * allocation + */ + if (GTMThreads->gt_array_size == GTM_MAX_THREADS) + elog(ERROR, "Too many threads active"); + + if (GTMThreads->gt_array_size == 0) + newsize = GTM_MIN_THREADS; + else + { + /* + * We ran out of the array size. 
Just double the size, bound by the + * upper limit + */ + newsize = GTMThreads->gt_array_size * 2; + } + + /* Can't have more than GTM_MAX_THREADS */ + if (newsize > GTM_MAX_THREADS) + newsize = GTM_MAX_THREADS; + + if (GTMThreads->gt_threads == NULL) + GTMThreads->gt_threads = (GTM_ThreadInfo **)palloc0(sizeof (GTM_ThreadInfo *) * newsize); + else + { + void *old_ptr = GTMThreads->gt_threads; + GTMThreads->gt_threads = (GTM_ThreadInfo **)palloc0(sizeof (GTM_ThreadInfo *) * newsize); + memcpy(GTMThreads->gt_threads, old_ptr, + GTMThreads->gt_array_size * sizeof (GTM_ThreadInfo *)); + pfree(old_ptr); + } + + GTMThreads->gt_array_size = newsize; + } + + /* + * Now that we have free entries in the array, find a free slot and add the + * thrinfo pointer to it. + * + * TODO Optimize this later by tracking few free slots and reusing them. + * The free slots can be updated when a thread exits and reused when a new + * thread is added to the pool. + */ + for (ii = 0; ii < GTMThreads->gt_array_size; ii++) + { + if (GTMThreads->gt_threads[ii] == NULL) + { + GTMThreads->gt_threads[ii] = thrinfo; + GTMThreads->gt_thread_count++; + break; + } + } + GTM_RWLockRelease(>MThreads->gt_lock); + + /* + * Track the slot information in the thrinfo. This is useful to quickly + * find the slot given the thrinfo structure. + */ + thrinfo->thr_localid = ii; + return ii; +} + +int +GTM_ThreadRemove(GTM_ThreadInfo *thrinfo) +{ + int ii; + GTM_RWLockAcquire(>MThreads->gt_lock, GTM_LOCKMODE_WRITE); + + for (ii = 0; ii < GTMThreads->gt_array_size; ii++) + { + if (GTMThreads->gt_threads[ii] == thrinfo) + break; + } + + if (ii == GTMThreads->gt_array_size) + elog(ERROR, "Thread (%p) not found ", thrinfo); + + GTMThreads->gt_threads[ii] = NULL; + GTMThreads->gt_thread_count--; + GTM_RWLockRelease(>MThreads->gt_lock); + + pfree(thrinfo); + + return 0; +} + +/* + * Create a new thread and assign the given connection to it. 
+ * + * This function is responsible for setting up the various memory contextes for + * the thread as well as registering this thread with the Thread Manager. + * + * Upon successful creation, the thread will start running the given + * "startroutine". The thread information is returned to the calling process. + */ +GTM_ThreadInfo * +GTM_ThreadCreate(GTM_ConnectionInfo *conninfo, + void *(* startroutine)(void *)) +{ + GTM_ThreadInfo *thrinfo; + int err; + + /* + * We are still running in the context of the main thread. So the + * allocation below would last as long as the main thread exists or the + * memory is explicitely freed. + */ + thrinfo = (GTM_ThreadInfo *)palloc0(sizeof (GTM_ThreadInfo)); + + thrinfo->thr_conn = conninfo; + GTM_RWLockInit(&thrinfo->thr_lock); + + /* + * The thread status is set to GTM_THREAD_STARTING and will be changed by + * the thread itself when it actually starts executing + */ + thrinfo->thr_status = GTM_THREAD_STARTING; + + /* + * Install the ThreadInfo structure in the global array. 
We do this before + * starting the thread + */ + if (GTM_ThreadAdd(thrinfo) == -1) + elog(ERROR, "Error starting a new thread"); + + /* + * Set up memory contextes before actually starting the threads + * + * The TopThreadContext is a child of TopMemoryContext and it will last as + * long as the main process or this thread lives + * + * Thread context is not shared between other threads + */ + thrinfo->thr_thread_context = AllocSetContextCreate(TopMemoryContext, + "TopMemoryContext", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE, + false); + + /* + * Since the thread is not yes started, TopMemoryContext still points to + * the context of the calling thread + */ + thrinfo->thr_parent_context = TopMemoryContext; + + /* + * Each thread gets its own ErrorContext and its a child of ErrorContext of + * the main process + * + * This is a thread-specific context and is not shared between other + * threads + */ + thrinfo->thr_error_context = AllocSetContextCreate(ErrorContext, + "ErrorContext", + 8 * 1024, + 8 * 1024, + 8 * 1024, + false); + + thrinfo->thr_startroutine = startroutine; + + /* + * Now start the thread. The thread will start executing the given + * "startroutine". The thrinfo structure is also passed to the thread. Any + * additional parameters should be passed via the thrinfo strcuture. 
+ * + * Return the thrinfo structure to the caller + */ + if ((err = pthread_create(&thrinfo->thr_id, NULL, GTM_ThreadMainWrapper, + thrinfo))) + ereport(ERROR, + (err, + errmsg("Failed to create a new thread: error %d", err))); + + return thrinfo; +} + +/* + * Exit the current thread + */ +void +GTM_ThreadExit(void) +{ + /* XXX To be implemented */ +} + +int +GTM_ThreadJoin(GTM_ThreadInfo *thrinfo) +{ + int error; + void *data; + + error = pthread_join(thrinfo->thr_id, &data); + + return error; +} + +/* + * Get thread information for the given thread, identified by the + * thread_id + */ +GTM_ThreadInfo * +GTM_GetThreadInfo(GTM_ThreadID thrid) +{ + + return NULL; +} + +/* + * Cleanup routine for the thread + */ +static void +GTM_ThreadCleanup(void *argp) +{ + GTM_ThreadInfo *thrinfo = (GTM_ThreadInfo *)argp; + + elog(LOG, "Cleaning up thread state"); + + /* + * TODO Close the open connection. + */ + StreamClose(thrinfo->thr_conn->con_port->sock); + + /* Free the port */ + ConnFree(thrinfo->thr_conn->con_port); + thrinfo->thr_conn->con_port = NULL; + + /* Free the connection info structure */ + pfree(thrinfo->thr_conn); + thrinfo->thr_conn = NULL; + + /* + * Switch to the memory context of the main process so that we can free up + * our memory contextes easily. + * + * XXX We don't setup cleanup handlers for the main process. So this + * routine would never be called for the main process/thread + */ + MemoryContextSwitchTo(thrinfo->thr_parent_context); + + MemoryContextDelete(thrinfo->thr_message_context); + thrinfo->thr_message_context = NULL; + + MemoryContextDelete(thrinfo->thr_error_context); + thrinfo->thr_error_context = NULL; + + MemoryContextDelete(thrinfo->thr_thread_context); + thrinfo->thr_thread_context = NULL; + + /* + * TODO Now cleanup the thrinfo structure itself and remove it from the global + * array. + */ + GTM_ThreadRemove(thrinfo); + + /* + * Reset the thread-specific information. 
This should be done only after we + * are sure that memory contextes are not required + * + * Note: elog calls need memory contextes, so no elog calls beyond this + * point. + */ + SetMyThreadInfo(NULL); + + return; +} + +/* + * A wrapper around the start routine of the thread. This helps us doing any + * initialization and setting up cleanup handlers before the main routine is + * started + */ +void * +GTM_ThreadMainWrapper(void *argp) +{ + GTM_ThreadInfo *thrinfo = (GTM_ThreadInfo *)argp; + + pthread_detach(thrinfo->thr_id); + + SetMyThreadInfo(thrinfo); + MemoryContextSwitchTo(TopMemoryContext); + + pthread_cleanup_push(GTM_ThreadCleanup, thrinfo); + thrinfo->thr_startroutine(thrinfo); + pthread_cleanup_pop(1); + + return thrinfo; +} diff --git a/src/gtm/main/gtm_txn.c b/src/gtm/main/gtm_txn.c new file mode 100644 index 0000000000..6090ae10fb --- /dev/null +++ b/src/gtm/main/gtm_txn.c @@ -0,0 +1,1521 @@ +/*------------------------------------------------------------------------- + * + * gtm_txn.c + * Transaction handling + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * + * IDENTIFICATION + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ +#include "gtm/gtm_c.h" +#include "gtm/elog.h" +#include "gtm/palloc.h" +#include "gtm/gtm.h" +#include "gtm/gtm_txn.h" +#include "gtm/assert.h" +#include "gtm/stringinfo.h" +#include "gtm/libpq.h" +#include "gtm/pqformat.h" +#include "gtm/gtm_msg.h" +#include <unistd.h> + +/* Local functions */ +static XidStatus GlobalTransactionIdGetStatus(GlobalTransactionId transactionId); +static bool GTM_SetDoVacuum(GTM_TransactionHandle handle); + +GTM_Transactions GTMTransactions; + +void +GTM_InitTxnManager(void) +{ + int ii; + + memset(>MTransactions, 0, sizeof (GTM_Transactions)); + + for (ii = 0; ii < 
GTM_MAX_GLOBAL_TRANSACTIONS; ii++) + { + GTM_TransactionInfo *gtm_txninfo = >MTransactions.gt_transactions_array[ii]; + gtm_txninfo->gti_in_use = false; + GTM_RWLockInit(>m_txninfo->gti_lock); + } + + /* + * XXX When GTM is stopped and restarted, it must start assinging GXIDs + * greater than the previously assgined values. If it was a clean shutdown, + * the GTM can store the last assigned value at a known location on + * permanent storage and read it back when it's restarted. It will get + * trickier for GTM failures. + * + * TODO We skip thia part for the prototype. + */ + GTMTransactions.gt_nextXid = FirstNormalGlobalTransactionId; + + /* + * XXX The gt_oldestXid is the cluster level oldest Xid + */ + GTMTransactions.gt_oldestXid = FirstNormalGlobalTransactionId; + + /* + * XXX Compute various xid limits to avoid wrap-around related database + * corruptions. Again, this is not implemeneted for the prototype + */ + GTMTransactions.gt_xidVacLimit = InvalidGlobalTransactionId; + GTMTransactions.gt_xidWarnLimit = InvalidGlobalTransactionId; + GTMTransactions.gt_xidStopLimit = InvalidGlobalTransactionId; + GTMTransactions.gt_xidWrapLimit = InvalidGlobalTransactionId; + + /* + * XXX Newest XID that is committed or aborted + */ + GTMTransactions.gt_latestCompletedXid = FirstNormalGlobalTransactionId; + + /* + * Initialize the locks to protect various XID fields as well as the linked + * list of transactions + */ + GTM_RWLockInit(>MTransactions.gt_XidGenLock); + GTM_RWLockInit(>MTransactions.gt_TransArrayLock); + + /* + * Initialize the list + */ + GTMTransactions.gt_open_transactions = NIL; + GTMTransactions.gt_lastslot = -1; + + GTMTransactions.gt_gtm_state = GTM_STARTING; + + return; +} + +/* + * Get the status of current or past transaction. + */ +static XidStatus +GlobalTransactionIdGetStatus(GlobalTransactionId transactionId) +{ + XidStatus xidstatus; + + /* + * Also, check to see if the transaction ID is a permanent one. 
+ */ + if (!GlobalTransactionIdIsNormal(transactionId)) + { + if (GlobalTransactionIdEquals(transactionId, BootstrapGlobalTransactionId)) + return TRANSACTION_STATUS_COMMITTED; + if (GlobalTransactionIdEquals(transactionId, FrozenGlobalTransactionId)) + return TRANSACTION_STATUS_COMMITTED; + return TRANSACTION_STATUS_ABORTED; + } + + /* + * TODO To be implemeneted + */ + return xidstatus; +} + +/* + * Given the GXID, find the corresponding transaction handle. + */ +GTM_TransactionHandle +GTM_GXIDToHandle(GlobalTransactionId gxid) +{ + ListCell *elem = NULL; + GTM_TransactionInfo *gtm_txninfo = NULL; + + GTM_RWLockAcquire(>MTransactions.gt_TransArrayLock, GTM_LOCKMODE_READ); + + foreach(elem, GTMTransactions.gt_open_transactions) + { + gtm_txninfo = (GTM_TransactionInfo *)lfirst(elem); + if (GlobalTransactionIdEquals(gtm_txninfo->gti_gxid, gxid)) + break; + gtm_txninfo = NULL; + } + + GTM_RWLockRelease(>MTransactions.gt_TransArrayLock); + + if (gtm_txninfo != NULL) + return gtm_txninfo->gti_handle; + else + return InvalidTransactionHandle; +} + +/* + * Given the transaction handle, find the corresponding transaction info + * structure + * + * Note: Since a transaction handle is just an index into the global array, + * this function should be very quick. We should turn into an inline future for + * fast path. + */ +GTM_TransactionInfo * +GTM_HandleToTransactionInfo(GTM_TransactionHandle handle) +{ + GTM_TransactionInfo *gtm_txninfo = NULL; + + if ((handle < 0) || (handle > GTM_MAX_GLOBAL_TRANSACTIONS)) + { + ereport(WARNING, + (ERANGE, errmsg("Invalid transaction handle: %d", handle))); + return NULL; + } + + gtm_txninfo = >MTransactions.gt_transactions_array[handle]; + + if (!gtm_txninfo->gti_in_use) + { + ereport(WARNING, + (ERANGE, errmsg("Invalid transaction handle, txn_info not in use"))); + return NULL; + } + + return gtm_txninfo; +} + +/* + * Remove the given transaction info structures from the global array. 
If the + * calling thread does not have enough cached structures, we in fact keep the + * structure in the global array and also add it to the list of cached + * structures for this thread. This ensures that the next transaction starting + * in this thread can quickly get a free slot in the array of transactions and + * also avoid repeated malloc/free of the structures. + * + * Also compute the latestCompletedXid. + */ +static void +GTM_RemoveTransInfoMulti(GTM_TransactionInfo *gtm_txninfo[], int txn_count) +{ + int ii; + + /* + * Remove the transaction structure from the global list of open + * transactions + */ + GTM_RWLockAcquire(>MTransactions.gt_TransArrayLock, GTM_LOCKMODE_WRITE); + + for (ii = 0; ii < txn_count; ii++) + { + if (gtm_txninfo[ii] == NULL) + continue; + + GTMTransactions.gt_open_transactions = list_delete(GTMTransactions.gt_open_transactions, gtm_txninfo[ii]); + + if (GlobalTransactionIdIsNormal(gtm_txninfo[ii]->gti_gxid) && + GlobalTransactionIdFollowsOrEquals(gtm_txninfo[ii]->gti_gxid, + GTMTransactions.gt_latestCompletedXid)) + GTMTransactions.gt_latestCompletedXid = gtm_txninfo[ii]->gti_gxid; + + + elog(DEBUG1, "GTM_RemoveTransInfoMulti: removing transaction id %u, %lu", + gtm_txninfo[ii]->gti_gxid, gtm_txninfo[ii]->gti_thread_id); + /* + * Now mark the transaction as aborted and mark the structure as not-in-use + */ + gtm_txninfo[ii]->gti_state = GTM_TXN_ABORTED; + gtm_txninfo[ii]->gti_nodecount = 0; + gtm_txninfo[ii]->gti_in_use = false; + gtm_txninfo[ii]->gti_snapshot_set = false; + } + + GTM_RWLockRelease(>MTransactions.gt_TransArrayLock); + return; +} + +/* + * Remove all transaction infos associated with the caller thread and the given + * backend + * + * Also compute the latestCompletedXid. 
+ */ +void +GTM_RemoveAllTransInfos(int backend_id) +{ + ListCell *cell, *prev; + GTM_ThreadID thread_id; + + thread_id = pthread_self(); + + /* + * Scan the global list of open transactions + */ + GTM_RWLockAcquire(>MTransactions.gt_TransArrayLock, GTM_LOCKMODE_WRITE); + prev = NULL; + cell = list_head(GTMTransactions.gt_open_transactions); + while (cell != NULL) + { + GTM_TransactionInfo *gtm_txninfo = lfirst(cell); + /* check if current entry is associated with the thread */ + if ((gtm_txninfo->gti_in_use) && + (gtm_txninfo->gti_thread_id == thread_id) && + ((gtm_txninfo->gti_backend_id == backend_id) || (backend_id == -1))) + { + /* remove the entry */ + GTMTransactions.gt_open_transactions = list_delete_cell(GTMTransactions.gt_open_transactions, cell, prev); + + /* update the latestComletedXid */ + if (GlobalTransactionIdIsNormal(gtm_txninfo->gti_gxid) && + GlobalTransactionIdFollowsOrEquals(gtm_txninfo->gti_gxid, + GTMTransactions.gt_latestCompletedXid)) + GTMTransactions.gt_latestCompletedXid = gtm_txninfo->gti_gxid; + + elog(DEBUG1, "GTM_RemoveAllTransInfos: removing transaction id %u, %lu:%lu", + gtm_txninfo->gti_gxid, gtm_txninfo->gti_thread_id, thread_id); + /* + * Now mark the transaction as aborted and mark the structure as not-in-use + */ + gtm_txninfo->gti_state = GTM_TXN_ABORTED; + gtm_txninfo->gti_nodecount = 0; + gtm_txninfo->gti_in_use = false; + gtm_txninfo->gti_snapshot_set = false; + + /* move to next cell in the list */ + if (prev) + cell = lnext(prev); + else + cell = list_head(GTMTransactions.gt_open_transactions); + } + else + { + prev = cell; + cell = lnext(cell); + } + } + + GTM_RWLockRelease(>MTransactions.gt_TransArrayLock); + return; +} +/* + * GlobalTransactionIdDidCommit + * True iff transaction associated with the identifier did commit. + * + * Note: + * Assumes transaction identifier is valid. 
+ */ +bool /* true if given transaction committed */ +GlobalTransactionIdDidCommit(GlobalTransactionId transactionId) +{ + XidStatus xidstatus; + + xidstatus = GlobalTransactionIdGetStatus(transactionId); + + /* + * If it's marked committed, it's committed. + */ + if (xidstatus == TRANSACTION_STATUS_COMMITTED) + return true; + + /* + * It's not committed. + */ + return false; +} + +/* + * GlobalTransactionIdDidAbort + * True iff transaction associated with the identifier did abort. + * + * Note: + * Assumes transaction identifier is valid. + */ +bool /* true if given transaction aborted */ +GlobalTransactionIdDidAbort(GlobalTransactionId transactionId) +{ + XidStatus xidstatus; + + xidstatus = GlobalTransactionIdGetStatus(transactionId); + + /* + * If it's marked aborted, it's aborted. + */ + if (xidstatus == TRANSACTION_STATUS_ABORTED) + return true; + + /* + * It's not aborted. + */ + return false; +} + +/* + * GlobalTransactionIdPrecedes --- is id1 logically < id2? + */ +bool +GlobalTransactionIdPrecedes(GlobalTransactionId id1, GlobalTransactionId id2) +{ + /* + * If either ID is a permanent XID then we can just do unsigned + * comparison. If both are normal, do a modulo-2^31 comparison. + */ + int32 diff; + + if (!GlobalTransactionIdIsNormal(id1) || !GlobalTransactionIdIsNormal(id2)) + return (id1 < id2); + + diff = (int32) (id1 - id2); + return (diff < 0); +} + +/* + * GlobalTransactionIdPrecedesOrEquals --- is id1 logically <= id2? + */ +bool +GlobalTransactionIdPrecedesOrEquals(GlobalTransactionId id1, GlobalTransactionId id2) +{ + int32 diff; + + if (!GlobalTransactionIdIsNormal(id1) || !GlobalTransactionIdIsNormal(id2)) + return (id1 <= id2); + + diff = (int32) (id1 - id2); + return (diff <= 0); +} + +/* + * GlobalTransactionIdFollows --- is id1 logically > id2? 
+ */ +bool +GlobalTransactionIdFollows(GlobalTransactionId id1, GlobalTransactionId id2) +{ + int32 diff; + + if (!GlobalTransactionIdIsNormal(id1) || !GlobalTransactionIdIsNormal(id2)) + return (id1 > id2); + + diff = (int32) (id1 - id2); + return (diff > 0); +} + +/* + * GlobalTransactionIdFollowsOrEquals --- is id1 logically >= id2? + */ +bool +GlobalTransactionIdFollowsOrEquals(GlobalTransactionId id1, GlobalTransactionId id2) +{ + int32 diff; + + if (!GlobalTransactionIdIsNormal(id1) || !GlobalTransactionIdIsNormal(id2)) + return (id1 >= id2); + + diff = (int32) (id1 - id2); + return (diff >= 0); +} + + +/* + * Set that the transaction is doing vacuum + * + */ +static bool +GTM_SetDoVacuum(GTM_TransactionHandle handle) +{ + GTM_TransactionInfo *gtm_txninfo = GTM_HandleToTransactionInfo(handle); + + if (gtm_txninfo == NULL) + ereport(ERROR, (EINVAL, errmsg("Invalid transaction handle"))); + + gtm_txninfo->gti_vacuum = true; + return true; +} + +/* + * Allocate the next XID for my new transaction + * + * The new XID is also stored into the transaction info structure of the given + * transaction before returning. + */ +GlobalTransactionId +GTM_GetGlobalTransactionIdMulti(GTM_TransactionHandle handle[], int txn_count) +{ + GlobalTransactionId xid, start_xid = InvalidGlobalTransactionId; + GTM_TransactionInfo *gtm_txninfo = NULL; + int ii; + + GTM_RWLockAcquire(>MTransactions.gt_XidGenLock, GTM_LOCKMODE_WRITE); + + if (GTMTransactions.gt_gtm_state == GTM_SHUTTING_DOWN) + { + GTM_RWLockRelease(>MTransactions.gt_XidGenLock); + ereport(ERROR, (EINVAL, errmsg("GTM shutting down -- can not issue new transaction ids"))); + return InvalidGlobalTransactionId; + } + + + /* + * If we are allocating the first XID of a new page of the commit log, + * zero out that commit-log page before returning. We must do this while + * holding XidGenLock, else another xact could acquire and commit a later + * XID before we zero the page. 
Fortunately, a page of the commit log + * holds 32K or more transactions, so we don't have to do this very often. + * + ExtendCLOG(xid); + */ + + /* + * Now advance the nextXid counter. This must not happen until after we + * have successfully completed ExtendCLOG() --- if that routine fails, we + * want the next incoming transaction to try it again. We cannot assign + * more XIDs until there is CLOG space for them. + */ + for (ii = 0; ii < txn_count; ii++) + { + xid = GTMTransactions.gt_nextXid; + + if (!GlobalTransactionIdIsValid(start_xid)) + start_xid = xid; + + /*---------- + * Check to see if it's safe to assign another XID. This protects against + * catastrophic data loss due to XID wraparound. The basic rules are: + * + * If we're past xidVacLimit, start trying to force autovacuum cycles. + * If we're past xidWarnLimit, start issuing warnings. + * If we're past xidStopLimit, refuse to execute transactions, unless + * we are running in a standalone backend (which gives an escape hatch + * to the DBA who somehow got past the earlier defenses). + * + * Test is coded to fall out as fast as possible during normal operation, + * ie, when the vac limit is set and we haven't violated it. 
+ *---------- + */ + if (GlobalTransactionIdFollowsOrEquals(xid, GTMTransactions.gt_xidVacLimit) && + GlobalTransactionIdIsValid(GTMTransactions.gt_xidVacLimit)) + { + if (GlobalTransactionIdFollowsOrEquals(xid, GTMTransactions.gt_xidStopLimit)) + { + GTM_RWLockRelease(>MTransactions.gt_XidGenLock); + ereport(ERROR, + (ERANGE, + errmsg("database is not accepting commands to avoid wraparound data loss in database "))); + } + else if (GlobalTransactionIdFollowsOrEquals(xid, GTMTransactions.gt_xidWarnLimit)) + ereport(WARNING, + (errmsg("database must be vacuumed within %u transactions", + GTMTransactions.gt_xidWrapLimit - xid))); + } + + GlobalTransactionIdAdvance(GTMTransactions.gt_nextXid); + gtm_txninfo = GTM_HandleToTransactionInfo(handle[ii]); + Assert(gtm_txninfo); + gtm_txninfo->gti_gxid = xid; + } + + GTM_RWLockRelease(>MTransactions.gt_XidGenLock); + + return start_xid; +} + +/* + * Allocate the next XID for my new transaction + * + * The new XID is also stored into the transaction info structure of the given + * transaction before returning. + */ +GlobalTransactionId +GTM_GetGlobalTransactionId(GTM_TransactionHandle handle) +{ + return GTM_GetGlobalTransactionIdMulti(&handle, 1); +} + +/* + * Read nextXid but don't allocate it. + */ +GlobalTransactionId +ReadNewGlobalTransactionId(void) +{ + GlobalTransactionId xid; + + GTM_RWLockAcquire(>MTransactions.gt_XidGenLock, GTM_LOCKMODE_READ); + xid = GTMTransactions.gt_nextXid; + GTM_RWLockRelease(>MTransactions.gt_XidGenLock); + + return xid; +} + +/* + * Set the nextXid. + * + * The GXID is usually read from a control file and set when the GTM is + * started. When the GTM is finally shutdown, the next to-be-assigned GXID is + * stroed in the control file. + * + * XXX We don't yet handle any crash recovery. 
So if the GTM is shutdown + */ +void +SetNextGlobalTransactionId(GlobalTransactionId gxid) +{ + GTM_RWLockAcquire(>MTransactions.gt_XidGenLock, GTM_LOCKMODE_WRITE); + GTMTransactions.gt_nextXid = gxid; + GTMTransactions.gt_gtm_state = GTM_RUNNING; + GTM_RWLockRelease(>MTransactions.gt_XidGenLock); + return; +} + + +/* Transaction Control */ +int +GTM_BeginTransactionMulti(GTM_CoordinatorId coord_id, + GTM_IsolationLevel isolevel[], + bool readonly[], + GTMProxy_ConnID connid[], + int txn_count, + GTM_TransactionHandle txns[]) +{ + GTM_TransactionInfo *gtm_txninfo[txn_count]; + MemoryContext oldContext; + int kk; + + memset(gtm_txninfo, 0, sizeof (gtm_txninfo)); + + /* + * XXX We should allocate the transaction info structure in the + * top-most memory context instead of a thread context. This is + * necessary because the transaction may outlive the thread which + * started the transaction. Also, since the structures are stored in + * the global array, it's dangerous to free the structures themselves + * without removing the corresponding references from the global array + */ + oldContext = MemoryContextSwitchTo(TopMostMemoryContext); + + for (kk = 0; kk < txn_count; kk++) + { + int ii, jj, startslot; + + /* + * We had no cached slots. 
Now find a free slot in the transation array + * and store the transaction info structure there + */ + GTM_RWLockAcquire(>MTransactions.gt_TransArrayLock, GTM_LOCKMODE_WRITE); + + startslot = GTMTransactions.gt_lastslot + 1; + if (startslot >= GTM_MAX_GLOBAL_TRANSACTIONS) + startslot = 0; + + for (ii = startslot, jj = 0; + jj < GTM_MAX_GLOBAL_TRANSACTIONS; + ii = (ii + 1) % GTM_MAX_GLOBAL_TRANSACTIONS, jj++) + { + if (GTMTransactions.gt_transactions_array[ii].gti_in_use == false) + { + gtm_txninfo[kk] = >MTransactions.gt_transactions_array[ii]; + break; + } + + if (ii == GTMTransactions.gt_lastslot) + { + GTM_RWLockRelease(>MTransactions.gt_TransArrayLock); + ereport(ERROR, + (ERANGE, errmsg("Max transaction limit reached"))); + } + } + + + gtm_txninfo[kk]->gti_gxid = InvalidGlobalTransactionId; + gtm_txninfo[kk]->gti_xmin = InvalidGlobalTransactionId; + gtm_txninfo[kk]->gti_state = GTM_TXN_STARTING; + gtm_txninfo[kk]->gti_coordid = coord_id; + + gtm_txninfo[kk]->gti_isolevel = isolevel[kk]; + gtm_txninfo[kk]->gti_readonly = readonly[kk]; + gtm_txninfo[kk]->gti_backend_id = connid[kk]; + gtm_txninfo[kk]->gti_in_use = true; + + gtm_txninfo[kk]->gti_handle = ii; + gtm_txninfo[kk]->gti_vacuum = false; + gtm_txninfo[kk]->gti_thread_id = pthread_self(); + GTMTransactions.gt_lastslot = ii; + + txns[kk] = ii; + + /* + * Add the structure to the global list of open transactions. 
We should + * call add the element to the list in the context of TopMostMemoryContext + * because the list is global and any memory allocation must outlive the + * thread context + */ + GTMTransactions.gt_open_transactions = lappend(GTMTransactions.gt_open_transactions, gtm_txninfo[kk]); + } + + GTM_RWLockRelease(>MTransactions.gt_TransArrayLock); + + MemoryContextSwitchTo(oldContext); + + return txn_count; +} + +/* Transaction Control */ +GTM_TransactionHandle +GTM_BeginTransaction(GTM_CoordinatorId coord_id, + GTM_IsolationLevel isolevel, + bool readonly) +{ + GTM_TransactionHandle txn; + GTMProxy_ConnID connid = -1; + + GTM_BeginTransactionMulti(coord_id, &isolevel, &readonly, &connid, 1, &txn); + return txn; +} + +/* + * Same as GTM_RollbackTransaction, but takes GXID as input + */ +int +GTM_RollbackTransactionGXID(GlobalTransactionId gxid) +{ + GTM_TransactionHandle txn = GTM_GXIDToHandle(gxid); + return GTM_RollbackTransaction(txn); +} + +/* + * Rollback multiple transactions in one go + */ +int +GTM_RollbackTransactionMulti(GTM_TransactionHandle txn[], int txn_count, int status[]) +{ + GTM_TransactionInfo *gtm_txninfo[txn_count]; + int ii; + + for (ii = 0; ii < txn_count; ii++) + { + gtm_txninfo[ii] = GTM_HandleToTransactionInfo(txn[ii]); + + if (gtm_txninfo[ii] == NULL) + { + status[ii] = STATUS_ERROR; + continue; + } + + /* + * Mark the transaction as being aborted + */ + GTM_RWLockAcquire(>m_txninfo[ii]->gti_lock, GTM_LOCKMODE_WRITE); + gtm_txninfo[ii]->gti_state = GTM_TXN_ABORT_IN_PROGRESS; + GTM_RWLockRelease(>m_txninfo[ii]->gti_lock); + status[ii] = STATUS_OK; + } + + GTM_RemoveTransInfoMulti(gtm_txninfo, txn_count); + + return txn_count; +} + +/* + * Rollback a transaction + */ +int +GTM_RollbackTransaction(GTM_TransactionHandle txn) +{ + int status; + GTM_RollbackTransactionMulti(&txn, 1, &status); + return status; +} + + +/* + * Same as GTM_CommitTransaction but takes GXID as input + */ +int +GTM_CommitTransactionGXID(GlobalTransactionId gxid) +{ + 
GTM_TransactionHandle txn = GTM_GXIDToHandle(gxid); + return GTM_CommitTransaction(txn); +} + +/* + * Commit multiple transactions in one go + */ +int +GTM_CommitTransactionMulti(GTM_TransactionHandle txn[], int txn_count, int status[]) +{ + GTM_TransactionInfo *gtm_txninfo[txn_count]; + int ii; + + for (ii = 0; ii < txn_count; ii++) + { + gtm_txninfo[ii] = GTM_HandleToTransactionInfo(txn[ii]); + + if (gtm_txninfo[ii] == NULL) + { + status[ii] = STATUS_ERROR; + continue; + } + /* + * Mark the transaction as being aborted + */ + GTM_RWLockAcquire(>m_txninfo[ii]->gti_lock, GTM_LOCKMODE_WRITE); + gtm_txninfo[ii]->gti_state = GTM_TXN_COMMIT_IN_PROGRESS; + GTM_RWLockRelease(>m_txninfo[ii]->gti_lock); + status[ii] = STATUS_OK; + } + + GTM_RemoveTransInfoMulti(gtm_txninfo, txn_count); + + return txn_count; +} + +/* + * Commit a transaction + */ +int +GTM_CommitTransaction(GTM_TransactionHandle txn) +{ + int status; + GTM_CommitTransactionMulti(&txn, 1, &status); + return status; +} + +/* + * Prepare a transaction + */ +int +GTM_PrepareTransaction(GTM_TransactionHandle txn, + uint32 nodecnt, + PGXC_NodeId nodes[]) +{ + GTM_TransactionInfo *gtm_txninfo = GTM_HandleToTransactionInfo(txn); + + if (gtm_txninfo == NULL) + return STATUS_ERROR; + + /* + * Mark the transaction as being aborted + */ + GTM_RWLockAcquire(>m_txninfo->gti_lock, GTM_LOCKMODE_WRITE); + + gtm_txninfo->gti_state = GTM_TXN_PREPARE_IN_PROGRESS; + gtm_txninfo->gti_nodecount = nodecnt; + if (gtm_txninfo->gti_nodes == NULL) + gtm_txninfo->gti_nodes = (PGXC_NodeId *)MemoryContextAlloc(TopMostMemoryContext, sizeof (PGXC_NodeId) * GTM_MAX_2PC_NODES); + memcpy(gtm_txninfo->gti_nodes, nodes, sizeof (PGXC_NodeId) * nodecnt); + + GTM_RWLockRelease(>m_txninfo->gti_lock); + + return STATUS_OK; +} + +/* + * Same as GTM_PrepareTransaction but takes GXID as input + */ +int +GTM_PrepareTransactionGXID(GlobalTransactionId gxid, + uint32 nodecnt, + PGXC_NodeId nodes[]) +{ + GTM_TransactionHandle txn = GTM_GXIDToHandle(gxid); 
+ return GTM_PrepareTransaction(txn, nodecnt, nodes); +} + +/* + * Get status of the given transaction + */ +GTM_TransactionStates +GTM_GetStatus(GTM_TransactionHandle txn) +{ + GTM_TransactionInfo *gtm_txninfo = GTM_HandleToTransactionInfo(txn); + return gtm_txninfo->gti_state; +} + +/* + * Same as GTM_GetStatus but takes GXID as input + */ +GTM_TransactionStates +GTM_GetStatusGXID(GlobalTransactionId gxid) +{ + GTM_TransactionHandle txn = GTM_GXIDToHandle(gxid); + return GTM_GetStatus(txn); +} + +/* + * Process MSG_TXN_BEGIN message + */ +void +ProcessBeginTransactionCommand(Port *myport, StringInfo message) +{ + GTM_IsolationLevel txn_isolation_level; + bool txn_read_only; + StringInfoData buf; + GTM_TransactionHandle txn; + MemoryContext oldContext; + + txn_isolation_level = pq_getmsgint(message, sizeof (GTM_IsolationLevel)); + txn_read_only = pq_getmsgbyte(message); + + oldContext = MemoryContextSwitchTo(TopMemoryContext); + + /* + * Start a new transaction + * + * XXX Port should contain Coordinator Id - replace 0 with that + */ + txn = GTM_BeginTransaction(0, txn_isolation_level, txn_read_only); + if (txn == InvalidTransactionHandle) + ereport(ERROR, + (EINVAL, + errmsg("Failed to start a new transaction"))); + + MemoryContextSwitchTo(oldContext); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, TXN_BEGIN_RESULT, 4); + if (myport->is_proxy) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_sendbytes(&buf, (char *)&txn, sizeof(txn)); + pq_endmessage(myport, &buf); + + if (!myport->is_proxy) + pq_flush(myport); + return; +} + +/* + * Process MSG_TXN_BEGIN_GETGXID message + */ +void +ProcessBeginTransactionGetGXIDCommand(Port *myport, StringInfo message) +{ + GTM_IsolationLevel txn_isolation_level; + bool txn_read_only; + StringInfoData buf; + GTM_TransactionHandle txn; + GlobalTransactionId gxid; + MemoryContext oldContext; + + txn_isolation_level = 
pq_getmsgint(message, sizeof (GTM_IsolationLevel)); + txn_read_only = pq_getmsgbyte(message); + + oldContext = MemoryContextSwitchTo(TopMemoryContext); + + /* + * Start a new transaction + * + * XXX Port should contain Coordinator Id - replace 0 with that + */ + txn = GTM_BeginTransaction(0, txn_isolation_level, txn_read_only); + if (txn == InvalidTransactionHandle) + ereport(ERROR, + (EINVAL, + errmsg("Failed to start a new transaction"))); + + gxid = GTM_GetGlobalTransactionId(txn); + if (gxid == InvalidGlobalTransactionId) + ereport(ERROR, + (EINVAL, + errmsg("Failed to get a new transaction id"))); + + MemoryContextSwitchTo(oldContext); + + elog(LOG, "Sending transaction id %u", gxid); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, TXN_BEGIN_GETGXID_RESULT, 4); + if (myport->is_proxy) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_sendbytes(&buf, (char *)&gxid, sizeof(gxid)); + pq_endmessage(myport, &buf); + + if (!myport->is_proxy) + pq_flush(myport); + return; +} + +/* + * Process MSG_TXN_BEGIN_GETGXID_AUTOVACUUM message + */ +void +ProcessBeginTransactionGetGXIDAutovacuumCommand(Port *myport, StringInfo message) +{ + GTM_IsolationLevel txn_isolation_level; + bool txn_read_only; + StringInfoData buf; + GTM_TransactionHandle txn; + GlobalTransactionId gxid; + MemoryContext oldContext; + + elog(DEBUG3, "Inside ProcessBeginTransactionGetGXIDAutovacuumCommand"); + + txn_isolation_level = pq_getmsgint(message, sizeof (GTM_IsolationLevel)); + txn_read_only = pq_getmsgbyte(message); + + oldContext = MemoryContextSwitchTo(TopMemoryContext); + + /* + * Start a new transaction + * + * XXX Port should contain Coordinator Id - replace 0 with that + */ + txn = GTM_BeginTransaction(0, txn_isolation_level, txn_read_only); + if (txn == InvalidTransactionHandle) + ereport(ERROR, + (EINVAL, + errmsg("Failed to start a new transaction"))); + + gxid = 
GTM_GetGlobalTransactionId(txn); + if (gxid == InvalidGlobalTransactionId) + ereport(ERROR, + (EINVAL, + errmsg("Failed to get a new transaction id"))); + + /* Indicate that it is for autovacuum */ + GTM_SetDoVacuum(txn); + + MemoryContextSwitchTo(oldContext); + + elog(DEBUG3, "Sending transaction id %d", gxid); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, TXN_BEGIN_GETGXID_AUTOVACUUM_RESULT, 4); + if (myport->is_proxy) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_sendbytes(&buf, (char *)&gxid, sizeof(gxid)); + pq_endmessage(myport, &buf); + + if (!myport->is_proxy) + pq_flush(myport); + return; +} + +/* + * Process MSG_TXN_BEGIN_GETGXID_MULTI message + */ +void +ProcessBeginTransactionGetGXIDCommandMulti(Port *myport, StringInfo message) +{ + GTM_IsolationLevel txn_isolation_level[GTM_MAX_GLOBAL_TRANSACTIONS]; + bool txn_read_only[GTM_MAX_GLOBAL_TRANSACTIONS]; + int txn_count; + StringInfoData buf; + GTM_TransactionHandle txn[GTM_MAX_GLOBAL_TRANSACTIONS]; + GlobalTransactionId gxid, end_gxid; + GTMProxy_ConnID txn_connid[GTM_MAX_GLOBAL_TRANSACTIONS]; + MemoryContext oldContext; + int count; + int ii; + + txn_count = pq_getmsgint(message, sizeof (int)); + + if (txn_count <= 0) + elog(PANIC, "Zero or less transaction count"); + + for (ii = 0; ii < txn_count; ii++) + { + txn_isolation_level[ii] = pq_getmsgint(message, sizeof (GTM_IsolationLevel)); + txn_read_only[ii] = pq_getmsgbyte(message); + txn_connid[ii] = pq_getmsgint(message, sizeof (GTMProxy_ConnID)); + } + + oldContext = MemoryContextSwitchTo(TopMemoryContext); + + /* + * Start a new transaction + * + * XXX Port should contain Coordinator Id - replace 0 with that + */ + count = GTM_BeginTransactionMulti(0, txn_isolation_level, txn_read_only, txn_connid, + txn_count, txn); + if (count != txn_count) + ereport(ERROR, + (EINVAL, + errmsg("Failed to start %d new transactions", txn_count))); + + gxid = 
GTM_GetGlobalTransactionIdMulti(txn, txn_count); + if (gxid == InvalidGlobalTransactionId) + ereport(ERROR, + (EINVAL, + errmsg("Failed to get a new transaction id"))); + + MemoryContextSwitchTo(oldContext); + + end_gxid = gxid + txn_count; + if (end_gxid < gxid) + end_gxid += FirstNormalGlobalTransactionId; + + elog(LOG, "Sending transaction ids from %u to %u", gxid, end_gxid); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, TXN_BEGIN_GETGXID_MULTI_RESULT, 4); + if (myport->is_proxy) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_sendbytes(&buf, (char *)&txn_count, sizeof(txn_count)); + pq_sendbytes(&buf, (char *)&gxid, sizeof(gxid)); + pq_endmessage(myport, &buf); + + if (!myport->is_proxy) + pq_flush(myport); + return; +} + +/* + * Process MSG_TXN_COMMIT message + */ +void +ProcessCommitTransactionCommand(Port *myport, StringInfo message) +{ + StringInfoData buf; + GTM_TransactionHandle txn; + GlobalTransactionId gxid; + int isgxid = 0; + MemoryContext oldContext; + int status = STATUS_OK; + + isgxid = pq_getmsgbyte(message); + + if (isgxid) + { + const char *data = pq_getmsgbytes(message, sizeof (gxid)); + if (data == NULL) + ereport(ERROR, + (EPROTO, + errmsg("Message does not contain valid GXID"))); + memcpy(&gxid, data, sizeof (gxid)); + txn = GTM_GXIDToHandle(gxid); + } + else + { + const char *data = pq_getmsgbytes(message, sizeof (txn)); + if (data == NULL) + ereport(ERROR, + (EPROTO, + errmsg("Message does not contain valid Transaction Handle"))); + memcpy(&txn, data, sizeof (txn)); + } + + pq_getmsgend(message); + + oldContext = MemoryContextSwitchTo(TopMemoryContext); + + /* + * Commit the transaction + */ + status = GTM_CommitTransaction(txn); + + MemoryContextSwitchTo(oldContext); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, TXN_COMMIT_RESULT, 4); + if (myport->is_proxy) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = 
myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_sendbytes(&buf, (char *)&gxid, sizeof(gxid)); + pq_sendint(&buf, status, sizeof(status)); + pq_endmessage(myport, &buf); + + if (!myport->is_proxy) + pq_flush(myport); + return; +} + +/* + * Process MSG_TXN_ROLLBACK message + */ +void +ProcessRollbackTransactionCommand(Port *myport, StringInfo message) +{ + StringInfoData buf; + GTM_TransactionHandle txn; + GlobalTransactionId gxid; + int isgxid = 0; + MemoryContext oldContext; + int status = STATUS_OK; + + isgxid = pq_getmsgbyte(message); + + if (isgxid) + { + const char *data = pq_getmsgbytes(message, sizeof (gxid)); + if (data == NULL) + ereport(ERROR, + (EPROTO, + errmsg("Message does not contain valid GXID"))); + memcpy(&gxid, data, sizeof (gxid)); + txn = GTM_GXIDToHandle(gxid); + } + else + { + const char *data = pq_getmsgbytes(message, sizeof (txn)); + if (data == NULL) + ereport(ERROR, + (EPROTO, + errmsg("Message does not contain valid Transaction Handle"))); + memcpy(&txn, data, sizeof (txn)); + } + + pq_getmsgend(message); + + oldContext = MemoryContextSwitchTo(TopMemoryContext); + + /* + * Commit the transaction + */ + status = GTM_RollbackTransaction(txn); + + MemoryContextSwitchTo(oldContext); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, TXN_ROLLBACK_RESULT, 4); + if (myport->is_proxy) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_sendbytes(&buf, (char *)&gxid, sizeof(gxid)); + pq_sendint(&buf, status, sizeof(status)); + pq_endmessage(myport, &buf); + + if (!myport->is_proxy) + pq_flush(myport); + return; +} + + +/* + * Process MSG_TXN_COMMIT_MULTI message + */ +void +ProcessCommitTransactionCommandMulti(Port *myport, StringInfo message) +{ + StringInfoData buf; + GTM_TransactionHandle txn[GTM_MAX_GLOBAL_TRANSACTIONS]; + GlobalTransactionId gxid[GTM_MAX_GLOBAL_TRANSACTIONS]; + int 
isgxid[GTM_MAX_GLOBAL_TRANSACTIONS]; + MemoryContext oldContext; + int status[GTM_MAX_GLOBAL_TRANSACTIONS]; + int txn_count, count; + int ii; + + txn_count = pq_getmsgint(message, sizeof (int)); + + for (ii = 0; ii < txn_count; ii++) + { + isgxid[ii] = pq_getmsgbyte(message); + if (isgxid[ii]) + { + const char *data = pq_getmsgbytes(message, sizeof (gxid[ii])); + if (data == NULL) + ereport(ERROR, + (EPROTO, + errmsg("Message does not contain valid GXID"))); + memcpy(&gxid[ii], data, sizeof (gxid[ii])); + txn[ii] = GTM_GXIDToHandle(gxid[ii]); + elog(DEBUG1, "ProcessCommitTransactionCommandMulti: gxid(%u), handle(%u)", gxid[ii], txn[ii]); + } + else + { + const char *data = pq_getmsgbytes(message, sizeof (txn[ii])); + if (data == NULL) + ereport(ERROR, + (EPROTO, + errmsg("Message does not contain valid Transaction Handle"))); + memcpy(&txn[ii], data, sizeof (txn[ii])); + elog(DEBUG1, "ProcessCommitTransactionCommandMulti: handle(%u)", txn[ii]); + } + } + + pq_getmsgend(message); + + oldContext = MemoryContextSwitchTo(TopMemoryContext); + + /* + * Commit the transaction + */ + count = GTM_CommitTransactionMulti(txn, txn_count, status); + + MemoryContextSwitchTo(oldContext); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, TXN_COMMIT_MULTI_RESULT, 4); + if (myport->is_proxy) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_sendbytes(&buf, (char *)&txn_count, sizeof(txn_count)); + pq_sendbytes(&buf, (char *)status, sizeof(int) * txn_count); + pq_endmessage(myport, &buf); + + if (!myport->is_proxy) + pq_flush(myport); + return; +} + +/* + * Process MSG_TXN_ROLLBACK_MULTI message + */ +void +ProcessRollbackTransactionCommandMulti(Port *myport, StringInfo message) +{ + StringInfoData buf; + GTM_TransactionHandle txn[GTM_MAX_GLOBAL_TRANSACTIONS]; + GlobalTransactionId gxid[GTM_MAX_GLOBAL_TRANSACTIONS]; + int isgxid[GTM_MAX_GLOBAL_TRANSACTIONS]; + MemoryContext 
oldContext; + int status[GTM_MAX_GLOBAL_TRANSACTIONS]; + int txn_count, count; + int ii; + + txn_count = pq_getmsgint(message, sizeof (int)); + + for (ii = 0; ii < txn_count; ii++) + { + isgxid[ii] = pq_getmsgbyte(message); + if (isgxid[ii]) + { + const char *data = pq_getmsgbytes(message, sizeof (gxid[ii])); + if (data == NULL) + ereport(ERROR, + (EPROTO, + errmsg("Message does not contain valid GXID"))); + memcpy(&gxid[ii], data, sizeof (gxid[ii])); + txn[ii] = GTM_GXIDToHandle(gxid[ii]); + elog(DEBUG1, "ProcessRollbackTransactionCommandMulti: gxid(%u), handle(%u)", gxid[ii], txn[ii]); + } + else + { + const char *data = pq_getmsgbytes(message, sizeof (txn[ii])); + if (data == NULL) + ereport(ERROR, + (EPROTO, + errmsg("Message does not contain valid Transaction Handle"))); + memcpy(&txn[ii], data, sizeof (txn[ii])); + elog(DEBUG1, "ProcessRollbackTransactionCommandMulti: handle(%u)", txn[ii]); + } + } + + pq_getmsgend(message); + + oldContext = MemoryContextSwitchTo(TopMemoryContext); + + /* + * Commit the transaction + */ + count = GTM_RollbackTransactionMulti(txn, txn_count, status); + + MemoryContextSwitchTo(oldContext); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, TXN_ROLLBACK_MULTI_RESULT, 4); + if (myport->is_proxy) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_sendbytes(&buf, (char *)&txn_count, sizeof(txn_count)); + pq_sendbytes(&buf, (char *)status, sizeof(int) * txn_count); + pq_endmessage(myport, &buf); + + if (!myport->is_proxy) + pq_flush(myport); + return; +} + +/* + * Process MSG_TXN_PREPARE message + */ +void +ProcessPrepareTransactionCommand(Port *myport, StringInfo message) +{ + StringInfoData buf; + GTM_TransactionHandle txn; + GlobalTransactionId gxid; + int isgxid = 0; + int nodecnt; + PGXC_NodeId *nodes; + MemoryContext oldContext; + + isgxid = pq_getmsgbyte(message); + + if (isgxid) + { + const char *data = 
pq_getmsgbytes(message, sizeof (gxid)); + if (data == NULL) + ereport(ERROR, + (EPROTO, + errmsg("Message does not contain valid GXID"))); + memcpy(&gxid, data, sizeof (gxid)); + txn = GTM_GXIDToHandle(gxid); + } + else + { + const char *data = pq_getmsgbytes(message, sizeof (txn)); + if (data == NULL) + ereport(ERROR, + (EPROTO, + errmsg("Message does not contain valid Transaction Handle"))); + memcpy(&txn, data, sizeof (txn)); + } + + nodecnt = pq_getmsgint(message, sizeof (nodecnt)); + nodes = (PGXC_NodeId *) palloc(sizeof (PGXC_NodeId) * nodecnt); + memcpy(nodes, pq_getmsgbytes(message, sizeof (PGXC_NodeId) * nodecnt), + sizeof (PGXC_NodeId) * nodecnt); + + pq_getmsgend(message); + + oldContext = MemoryContextSwitchTo(TopMemoryContext); + + /* + * Prepare the transaction + */ + if (GTM_PrepareTransaction(txn, nodecnt, nodes) != STATUS_OK) + ereport(ERROR, + (EINVAL, + errmsg("Failed to commit the transaction"))); + + MemoryContextSwitchTo(oldContext); + + pfree(nodes); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, TXN_PREPARE_RESULT, 4); + if (myport->is_proxy) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_sendbytes(&buf, (char *)&gxid, sizeof(gxid)); + pq_endmessage(myport, &buf); + + if (!myport->is_proxy) + pq_flush(myport); + return; +} + +/* + * Process MSG_TXN_GET_GXID message + */ +void +ProcessGetGXIDTransactionCommand(Port *myport, StringInfo message) +{ + StringInfoData buf; + GTM_TransactionHandle txn; + GlobalTransactionId gxid; + const char *data; + MemoryContext oldContext; + + elog(DEBUG3, "Inside ProcessGetGXIDTransactionCommand"); + + data = pq_getmsgbytes(message, sizeof (txn)); + if (data == NULL) + ereport(ERROR, + (EPROTO, + errmsg("Message does not contain valid Transaction Handle"))); + memcpy(&txn, data, sizeof (txn)); + + pq_getmsgend(message); + + oldContext = MemoryContextSwitchTo(TopMemoryContext); + + /* + * Get the 
transaction id for the given global transaction + */ + gxid = GTM_GetGlobalTransactionId(txn); + if (GlobalTransactionIdIsValid(gxid)) + ereport(ERROR, + (EINVAL, + errmsg("Failed to get the transaction id"))); + + MemoryContextSwitchTo(oldContext); + + elog(DEBUG3, "Sending transaction id %d", gxid); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, TXN_GET_GXID_RESULT, 4); + if (myport->is_proxy) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_sendbytes(&buf, (char *)&txn, sizeof(txn)); + pq_sendbytes(&buf, (char *)&gxid, sizeof(gxid)); + pq_endmessage(myport, &buf); + + if (!myport->is_proxy) + pq_flush(myport); + return; +} + + +/* + * Mark GTM as shutting down. This point onwards no new GXID are issued to + * ensure that the last GXID recorded in the control file remains sane + */ +void +GTM_SetShuttingDown(void) +{ + GTM_RWLockAcquire(>MTransactions.gt_XidGenLock, GTM_LOCKMODE_WRITE); + GTMTransactions.gt_gtm_state = GTM_SHUTTING_DOWN; + GTM_RWLockRelease(>MTransactions.gt_XidGenLock); +} + +void +GTM_RestoreTxnInfo(int ctlfd, GlobalTransactionId next_gxid) +{ + GlobalTransactionId saved_gxid; + + if (ctlfd != -1) + { + if ((read(ctlfd, &saved_gxid, sizeof (saved_gxid)) != sizeof (saved_gxid)) && + (!GlobalTransactionIdIsValid(next_gxid))) + return; + if (!GlobalTransactionIdIsValid(next_gxid)) + next_gxid = saved_gxid; + } + + elog(LOG, "Restoring last GXID to %u\n", next_gxid); + + if (GlobalTransactionIdIsValid(next_gxid)) + SetNextGlobalTransactionId(next_gxid); + /* Set this otherwise a strange snapshot might be returned for the first one */ + GTMTransactions.gt_latestCompletedXid = next_gxid - 1; + return; +} + +void +GTM_SaveTxnInfo(int ctlfd) +{ + GlobalTransactionId next_gxid; + + next_gxid = ReadNewGlobalTransactionId(); + + elog(LOG, "Saving transaction info - next_gxid: %u", next_gxid); + + write(ctlfd, &next_gxid, sizeof (next_gxid)); +} +/* 
+ * TODO + */ +int GTM_GetAllTransactions(GTM_TransactionInfo txninfo[], uint32 txncnt); + +/* + * TODO + */ +uint32 GTM_GetAllPrepared(GlobalTransactionId gxids[], uint32 gxidcnt); + diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c new file mode 100644 index 0000000000..0ef09c436a --- /dev/null +++ b/src/gtm/main/main.c @@ -0,0 +1,1370 @@ +/*------------------------------------------------------------------------- + * + * main.c + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * + * IDENTIFICATION + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ +#include <time.h> +#include <unistd.h> +#include <signal.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <sys/select.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/resource.h> +#include <getopt.h> +#include <stdio.h> + +#include "gtm/gtm_c.h" +#include "gtm/gtm.h" +#include "gtm/elog.h" +#include "gtm/memutils.h" +#include "gtm/gtm_list.h" +#include "gtm/libpq.h" +#include "gtm/libpq-be.h" +#include "gtm/pqsignal.h" +#include "gtm/pqformat.h" +#include "gtm/assert.h" +#include "gtm/gtm_txn.h" +#include "gtm/gtm_seq.h" +#include "gtm/gtm_msg.h" + +extern int optind; +extern char *optarg; + +#define GTM_MAX_PATH 1024 +#define GTM_DEFAULT_HOSTNAME "*" +#define GTM_DEFAULT_PORT 6666 +#define GTM_CONTROL_FILE "gtm.control" +#define GTM_PID_FILE "gtm.pid" +#define GTM_LOG_FILE "gtm.log" + +static char *progname = "gtm"; +char *ListenAddresses; +int GTMPortNumber; +char GTMControlFile[GTM_MAX_PATH]; +char *GTMDataDir; + +/* The socket(s) we're listening to. 
*/ +#define MAXLISTEN 64 +static int ListenSocket[MAXLISTEN]; + +pthread_key_t threadinfo_key; +static bool GTMAbortPending = false; + +static Port *ConnCreate(int serverFd); +static int ServerLoop(void); +static int initMasks(fd_set *rmask); +void *GTM_ThreadMain(void *argp); +static int GTMAddConnection(Port *port); +static int ReadCommand(Port *myport, StringInfo inBuf); + +static void ProcessCommand(Port *myport, StringInfo input_message); +static void ProcessCoordinatorCommand(Port *myport, GTM_MessageType mtype, StringInfo message); +static void ProcessTransactionCommand(Port *myport, GTM_MessageType mtype, StringInfo message); +static void ProcessSnapshotCommand(Port *myport, GTM_MessageType mtype, StringInfo message); +static void ProcessSeqeunceCommand(Port *myport, GTM_MessageType mtype, StringInfo message); +static void ProcessQueryCommand(Port *myport, GTM_MessageType mtype, StringInfo message); + +static void GTM_RegisterCoordinator(Port *myport, GTM_CoordinatorId coordinator_id); +static void GTM_UnregisterCoordinator(Port *myport, GTM_CoordinatorId coordinator_id); + +static bool CreateOptsFile(int argc, char *argv[]); +static void CreateDataDirLockFile(void); +static void CreateLockFile(const char *filename, const char *refName); +static void ChangeToDataDir(void); +static void checkDataDir(void); +static void DeleteLockFile(const char *filename); + +/* + * One-time initialization. It's called immediately after the main process + * starts + */ +static GTM_ThreadInfo * +MainThreadInit() +{ + GTM_ThreadInfo *thrinfo; + + pthread_key_create(&threadinfo_key, NULL); + + /* + * Initialize the lock protecting the global threads info + */ + GTM_RWLockInit(>MThreads->gt_lock); + + /* + * We are called even before memory context management is setup. 
We must + * use malloc + */ + thrinfo = (GTM_ThreadInfo *)malloc(sizeof (GTM_ThreadInfo)); + + if (thrinfo == NULL) + { + fprintf(stderr, "malloc failed: %d", errno); + fflush(stdout); + fflush(stderr); + } + + if (SetMyThreadInfo(thrinfo)) + { + fprintf(stderr, "SetMyThreadInfo failed: %d", errno); + fflush(stdout); + fflush(stderr); + } + + return thrinfo; +} + +static void +BaseInit() +{ + GTM_ThreadInfo *thrinfo; + + thrinfo = MainThreadInit(); + + MyThreadID = pthread_self(); + + MemoryContextInit(); + + checkDataDir(); + ChangeToDataDir(); + CreateDataDirLockFile(); + + sprintf(GTMControlFile, "%s/%s", GTMDataDir, GTM_CONTROL_FILE); + if (GTMLogFile == NULL) + { + GTMLogFile = (char *) malloc(GTM_MAX_PATH); + sprintf(GTMLogFile, "%s/%s", GTMDataDir, GTM_LOG_FILE); + } + + DebugFileOpen(); + + GTM_InitTxnManager(); + GTM_InitSeqManager(); + + /* + * The memory context is now set up. + * Add the thrinfo structure in the global array + */ + if (GTM_ThreadAdd(thrinfo) == -1) + { + fprintf(stderr, "GTM_ThreadAdd for main thread failed: %d", errno); + fflush(stdout); + fflush(stderr); + } +} + +static void +GTM_SigleHandler(int signal) +{ + fprintf(stderr, "Received signal %d", signal); + + switch (signal) + { + case SIGKILL: + case SIGTERM: + case SIGQUIT: + case SIGINT: + case SIGHUP: + break; + + default: + fprintf(stderr, "Unknown signal %d\n", signal); + return; + } + + /* + * XXX We should do a clean shutdown here. 
+ */ + /* Delete pid file before shutting down */ + DeleteLockFile(GTM_PID_FILE); + + PG_SETMASK(&BlockSig); + GTMAbortPending = true; + + return; +} + +/* + * Help display should match + */ +static void +help(const char *progname) +{ + printf(_("This is the GTM server.\n\n")); + printf(_("Usage:\n %s [OPTION]...\n\n"), progname); + printf(_("Options:\n")); + printf(_(" -h hostname GTM server hostname/IP\n")); + printf(_(" -p port GTM server port number\n")); + printf(_(" -x xid Starting GXID \n")); + printf(_(" -D directory GTM working directory\n")); + printf(_(" -l filename GTM server log file name \n")); + printf(_(" --help show this help, then exit\n")); +} + +int +main(int argc, char *argv[]) +{ + int opt; + int status; + int i; + GlobalTransactionId next_gxid = InvalidGlobalTransactionId; + int ctlfd; + + /* + * Catch standard options before doing much else + */ + if (argc > 1) + { + if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) + { + help(argv[0]); + exit(0); + } + } + + ListenAddresses = GTM_DEFAULT_HOSTNAME; + GTMPortNumber = GTM_DEFAULT_PORT; + + /* + * Parse the command like options and set variables + */ + while ((opt = getopt(argc, argv, "h:p:x:D:l:")) != -1) + { + switch (opt) + { + case 'h': + ListenAddresses = strdup(optarg); + break; + + case 'p': + GTMPortNumber = atoi(optarg); + break; + + case 'x': + next_gxid = (GlobalTransactionId )atoll(optarg); + break; + + case 'D': + GTMDataDir = strdup(optarg); + canonicalize_path(GTMDataDir); + break; + + case 'l': + GTMLogFile = strdup(optarg); + break; + + default: + write_stderr("Try \"%s --help\" for more information.\n", + progname); + } + } + + if (GTMDataDir == NULL) + { + write_stderr("GTM data directory must be specified\n"); + write_stderr("Try \"%s --help\" for more information.\n", + progname); + exit(1); + } + /* + * GTM accepts no non-option switch arguments. 
+ */ + if (optind < argc) + { + write_stderr("%s: invalid argument: \"%s\"\n", + progname, argv[optind]); + write_stderr("Try \"%s --help\" for more information.\n", + progname); + exit(1); + } + + /* + * Some basic initialization must happen before we do anything + * useful + */ + BaseInit(); + + elog(DEBUG3, "Starting GTM server at (%s:%d) -- control file %s", ListenAddresses, GTMPortNumber, GTMControlFile); + + /* + * Read the last GXID and start from there + */ + + ctlfd = open(GTMControlFile, O_RDONLY); + + GTM_RestoreTxnInfo(ctlfd, next_gxid); + GTM_RestoreSeqInfo(ctlfd); + + close(ctlfd); + /* + * Establish input sockets. + */ + for (i = 0; i < MAXLISTEN; i++) + ListenSocket[i] = -1; + + if (ListenAddresses) + { + int success = 0; + + status = StreamServerPort(AF_UNSPEC, ListenAddresses, + (unsigned short) GTMPortNumber, + ListenSocket, MAXLISTEN); + if (status == STATUS_OK) + success++; + else + ereport(FATAL, + (errmsg("could not create listen socket for \"%s\"", + ListenAddresses))); + } + + /* + * check that we have some socket to listen on + */ + if (ListenSocket[0] == -1) + ereport(FATAL, + (errmsg("no socket created for listening"))); + + /* + * Record gtm options. We delay this till now to avoid recording + * bogus options + */ + if (!CreateOptsFile(argc, argv)) + exit(1); + + pqsignal(SIGHUP, GTM_SigleHandler); + pqsignal(SIGKILL, GTM_SigleHandler); + pqsignal(SIGQUIT, GTM_SigleHandler); + pqsignal(SIGTERM, GTM_SigleHandler); + pqsignal(SIGINT, GTM_SigleHandler); + + pqinitmask(); + + /* + * Accept any new connections. Fork a new thread for each incoming + * connection + */ + status = ServerLoop(); + + /* + * ServerLoop probably shouldn't ever return, but if it does, close down. 
+ */ + exit(status != STATUS_OK); + + return 0; /* not reached */ +} + +/* + * ConnCreate -- create a local connection data structure + */ +static Port * +ConnCreate(int serverFd) +{ + Port *port; + + if (!(port = (Port *) calloc(1, sizeof(Port)))) + { + ereport(LOG, + (ENOMEM, + errmsg("out of memory"))); + exit(1); + } + + if (StreamConnection(serverFd, port) != STATUS_OK) + { + if (port->sock >= 0) + StreamClose(port->sock); + ConnFree(port); + port = NULL; + } + + port->conn_id = InvalidGTMProxyConnID; + return port; +} + +/* + * ConnFree -- free a local connection data structure + */ +void +ConnFree(Port *conn) +{ + free(conn); +} + +/* + * Main idle loop of postmaster + */ +static int +ServerLoop(void) +{ + fd_set readmask; + int nSockets; + + nSockets = initMasks(&readmask); + + for (;;) + { + fd_set rmask; + int selres; + + //MemoryContextStats(TopMostMemoryContext); + + /* + * Wait for a connection request to arrive. + * + * We wait at most one minute, to ensure that the other background + * tasks handled below get done even when no requests are arriving. + */ + memcpy((char *) &rmask, (char *) &readmask, sizeof(fd_set)); + + PG_SETMASK(&UnBlockSig); + + if (GTMAbortPending) + { + int ctlfd; + + /* + * XXX We should do a clean shutdown here. For the time being, just + * write the next GXID to be issued in the control file and exit + * gracefully + */ + + /* + * Tell GTM that we are shutting down so that no new GXIDs are + * issued this point onwards + */ + GTM_SetShuttingDown(); + + ctlfd = open(GTMControlFile, O_WRONLY | O_TRUNC | O_CREAT, + S_IRUSR | S_IWUSR); + if (ctlfd == -1) + { + fprintf(stderr, "Failed to create/open the control file\n"); + exit(2); + } + + GTM_SaveTxnInfo(ctlfd); + GTM_SaveSeqInfo(ctlfd); + + close(ctlfd); + + exit(1); + } + + { + /* must set timeout each time; some OSes change it! 
*/ + struct timeval timeout; + + timeout.tv_sec = 60; + timeout.tv_usec = 0; + + selres = select(nSockets, &rmask, NULL, NULL, &timeout); + } + + /* + * Block all signals until we wait again. (This makes it safe for our + * signal handlers to do nontrivial work.) + */ + PG_SETMASK(&BlockSig); + + /* Now check the select() result */ + if (selres < 0) + { + if (errno != EINTR && errno != EWOULDBLOCK) + { + ereport(LOG, + (EACCES, + errmsg("select() failed in postmaster: %m"))); + return STATUS_ERROR; + } + } + + /* + * New connection pending on any of our sockets? If so, fork a child + * process to deal with it. + */ + if (selres > 0) + { + int i; + + for (i = 0; i < MAXLISTEN; i++) + { + if (ListenSocket[i] == -1) + break; + if (FD_ISSET(ListenSocket[i], &rmask)) + { + Port *port; + + port = ConnCreate(ListenSocket[i]); + if (port) + { + if (GTMAddConnection(port) != STATUS_OK) + { + elog(ERROR, "Too many connections"); + StreamClose(port->sock); + ConnFree(port); + } + } + } + } + } + } +} + +/* + * Initialise the masks for select() for the ports we are listening on. + * Return the number of sockets to listen on. + */ +static int +initMasks(fd_set *rmask) +{ + int maxsock = -1; + int i; + + FD_ZERO(rmask); + + for (i = 0; i < MAXLISTEN; i++) + { + int fd = ListenSocket[i]; + + if (fd == -1) + break; + FD_SET(fd, rmask); + if (fd > maxsock) + maxsock = fd; + } + + return maxsock + 1; +} + + +void * +GTM_ThreadMain(void *argp) +{ + GTM_ThreadInfo *thrinfo = (GTM_ThreadInfo *)argp; + int qtype; + StringInfoData input_message; + sigjmp_buf local_sigjmp_buf; + + elog(DEBUG3, "Starting the connection helper thread"); + + + /* + * Create the memory context we will use in the main loop. + * + * MessageContext is reset once per iteration of the main loop, ie, upon + * completion of processing of each command message from the client. 
+ * + * This context is thread-specific + */ + MessageContext = AllocSetContextCreate(TopMemoryContext, + "MessageContext", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE, + false); + + + { + /* + * We expect a startup message at the very start. The message type is + * REGISTER_COORD, followed by the 4 byte coordinator ID + */ + char startup_type; + GTM_StartupPacket sp; + StringInfoData inBuf; + + startup_type = pq_getbyte(thrinfo->thr_conn->con_port); + + if (startup_type != 'A') + ereport(ERROR, + (EPROTO, + errmsg("Expecting a startup message, but received %c", + startup_type))); + + initStringInfo(&inBuf); + + /* + * All frontend messages have a length word next + * after the type code; we can read the message contents independently of + * the type. + */ + if (pq_getmessage(thrinfo->thr_conn->con_port, &inBuf, 0)) + ereport(ERROR, + (EPROTO, + errmsg("Expecting coordinator ID, but received EOF"))); + + memcpy(&sp, + pq_getmsgbytes(&inBuf, sizeof (GTM_StartupPacket)), + sizeof (GTM_StartupPacket)); + pq_getmsgend(&inBuf); + + GTM_RegisterCoordinator(thrinfo->thr_conn->con_port, sp.sp_cid); + thrinfo->thr_conn->con_port->is_proxy = sp.sp_isproxy; + } + + { + /* + * Send a dummy authentication request message 'R' as the client + * expects that in the current protocol + */ + StringInfoData buf; + pq_beginmessage(&buf, 'R'); + pq_endmessage(thrinfo->thr_conn->con_port, &buf); + pq_flush(thrinfo->thr_conn->con_port); + + elog(DEBUG3, "Sent connection authentication message to the client"); + } + + /* + * Get the input_message in the TopMemoryContext so that we don't need to + * free/palloc it for every incoming message. Unlike Postgres, we don't + * expect the incoming messages to be of arbitrary sizes + */ + + initStringInfo(&input_message); + + /* + * POSTGRES main processing loop begins here + * + * If an exception is encountered, processing resumes here so we abort the + * current transaction and start a new one. 
+ * + * You might wonder why this isn't coded as an infinite loop around a + * PG_TRY construct. The reason is that this is the bottom of the + * exception stack, and so with PG_TRY there would be no exception handler + * in force at all during the CATCH part. By leaving the outermost setjmp + * always active, we have at least some chance of recovering from an error + * during error recovery. (If we get into an infinite loop thereby, it + * will soon be stopped by overflow of elog.c's internal state stack.) + */ + + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { + /* + * NOTE: if you are tempted to add more code in this if-block, + * consider the high probability that it should be in + * AbortTransaction() instead. The only stuff done directly here + * should be stuff that is guaranteed to apply *only* for outer-level + * error recovery, such as adjusting the FE/BE protocol status. + */ + + /* Report the error to the client and/or server log */ + if (thrinfo->thr_conn) + EmitErrorReport(thrinfo->thr_conn->con_port); + else + EmitErrorReport(NULL); + + /* + * Now return to normal top-level context and clear ErrorContext for + * next time. + */ + MemoryContextSwitchTo(TopMemoryContext); + FlushErrorState(); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + + for (;;) + { + /* + * Release storage left over from prior query cycle, and create a new + * query input buffer in the cleared MessageContext. 
+ */ + MemoryContextSwitchTo(MessageContext); + MemoryContextResetAndDeleteChildren(MessageContext); + + /* + * Just reset the input buffer to avoid repeated palloc/pfrees + * + * XXX We should consider resetting the MessageContext periodically to + * handle any memory leaks + */ + resetStringInfo(&input_message); + + /* + * (3) read a command (loop blocks here) + */ + qtype = ReadCommand(thrinfo->thr_conn->con_port, &input_message); + + switch(qtype) + { + case 'C': + ProcessCommand(thrinfo->thr_conn->con_port, &input_message); + break; + + case 'X': + case EOF: + /* + * Connection termination request + * Remove all transactions opened within the thread + */ + GTM_RemoveAllTransInfos(-1); + pthread_exit(thrinfo); + break; + + case 'F': + /* + * Flush all the outgoing data on the wire. Consume the message + * type field for sanity + */ + pq_getmsgint(&input_message, sizeof (GTM_MessageType)); + pq_getmsgend(&input_message); + pq_flush(thrinfo->thr_conn->con_port); + break; + + default: + /* + * Remove all transactions opened within the thread + */ + GTM_RemoveAllTransInfos(-1); + + ereport(FATAL, + (EPROTO, + errmsg("invalid frontend message type %d", + qtype))); + break; + } + + } + + /* can't get here because the above loop never exits */ + Assert(false); + + return thrinfo; +} + +void +ProcessCommand(Port *myport, StringInfo input_message) +{ + GTM_MessageType mtype; + GTM_ProxyMsgHeader proxyhdr; + + if (myport->is_proxy) + pq_copymsgbytes(input_message, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + else + proxyhdr.ph_conid = InvalidGTMProxyConnID; + + myport->conn_id = proxyhdr.ph_conid; + mtype = pq_getmsgint(input_message, sizeof (GTM_MessageType)); + + switch (mtype) + { + case MSG_UNREGISTER_COORD: + ProcessCoordinatorCommand(myport, mtype, input_message); + break; + + case MSG_TXN_BEGIN: + case MSG_TXN_BEGIN_GETGXID: + case MSG_TXN_BEGIN_GETGXID_AUTOVACUUM: + case MSG_TXN_PREPARE: + case MSG_TXN_COMMIT: + case MSG_TXN_ROLLBACK: + case 
MSG_TXN_GET_GXID: + case MSG_TXN_BEGIN_GETGXID_MULTI: + case MSG_TXN_COMMIT_MULTI: + case MSG_TXN_ROLLBACK_MULTI: + ProcessTransactionCommand(myport, mtype, input_message); + break; + + case MSG_SNAPSHOT_GET: + case MSG_SNAPSHOT_GXID_GET: + case MSG_SNAPSHOT_GET_MULTI: + ProcessSnapshotCommand(myport, mtype, input_message); + break; + + case MSG_SEQUENCE_INIT: + case MSG_SEQUENCE_GET_CURRENT: + case MSG_SEQUENCE_GET_NEXT: + case MSG_SEQUENCE_RESET: + case MSG_SEQUENCE_CLOSE: + ProcessSeqeunceCommand(myport, mtype, input_message); + break; + + case MSG_TXN_GET_STATUS: + case MSG_TXN_GET_ALL_PREPARED: + ProcessQueryCommand(myport, mtype, input_message); + break; + + case MSG_BACKEND_DISCONNECT: + GTM_RemoveAllTransInfos(proxyhdr.ph_conid); + break; + + default: + ereport(FATAL, + (EPROTO, + errmsg("invalid frontend message type %d", + mtype))); + } +} + +static int +GTMAddConnection(Port *port) +{ + GTM_ConnectionInfo *conninfo = NULL; + + conninfo = (GTM_ConnectionInfo *)palloc(sizeof (GTM_ConnectionInfo)); + + if (conninfo == NULL) + { + ereport(ERROR, + (ENOMEM, + errmsg("Out of memory"))); + return STATUS_ERROR; + } + + elog(DEBUG3, "Started new connection"); + conninfo->con_port = port; + + /* + * XXX Start the thread + */ + if (GTM_ThreadCreate(conninfo, GTM_ThreadMain) == NULL) + { + elog(ERROR, "failed to create a new thread"); + return STATUS_ERROR; + } + + return STATUS_OK; +} + +/* ---------------- + * ReadCommand reads a command from either the frontend or + * standard input, places it in inBuf, and returns the + * message type code (first byte of the message). + * EOF is returned if end of file. + * ---------------- + */ +static int +ReadCommand(Port *myport, StringInfo inBuf) +{ + int qtype; + + /* + * Get message type code from the frontend. 
+ */ + qtype = pq_getbyte(myport); + + if (qtype == EOF) /* frontend disconnected */ + { + ereport(COMMERROR, + (EPROTO, + errmsg("unexpected EOF on client connection"))); + return EOF; + } + + /* + * Validate message type code before trying to read body; if we have lost + * sync, better to say "command unknown" than to run out of memory because + * we used garbage as a length word. + * + * This also gives us a place to set the doing_extended_query_message flag + * as soon as possible. + */ + switch (qtype) + { + case 'C': + break; + + case 'X': + break; + + case 'F': + break; + + default: + + /* + * Otherwise we got garbage from the frontend. We treat this as + * fatal because we have probably lost message boundary sync, and + * there's no good way to recover. + */ + ereport(ERROR, + (EPROTO, + errmsg("invalid frontend message type %d", qtype))); + + break; + } + + /* + * In protocol version 3, all frontend messages have a length word next + * after the type code; we can read the message contents independently of + * the type. + */ + if (pq_getmessage(myport, inBuf, 0)) + return EOF; /* suitable message already logged */ + + return qtype; +} + +static void +ProcessCoordinatorCommand(Port *myport, GTM_MessageType mtype, StringInfo message) +{ + GTM_CoordinatorId cid; + + cid = pq_getmsgint(message, sizeof (GTM_CoordinatorId)); + + switch (mtype) + { + case MSG_UNREGISTER_COORD: + GTM_UnregisterCoordinator(myport, cid); + break; + + default: + Assert(0); /* Shouldn't come here.. 
keep compiler quite */ + } + pq_getmsgend(message); +} + +static void +ProcessTransactionCommand(Port *myport, GTM_MessageType mtype, StringInfo message) +{ + elog(DEBUG1, "ProcessTransactionCommand: mtype:%d", mtype); + + switch (mtype) + { + case MSG_TXN_BEGIN: + ProcessBeginTransactionCommand(myport, message); + break; + + case MSG_TXN_BEGIN_GETGXID: + ProcessBeginTransactionGetGXIDCommand(myport, message); + break; + + case MSG_TXN_BEGIN_GETGXID_AUTOVACUUM: + ProcessBeginTransactionGetGXIDAutovacuumCommand(myport, message); + break; + + case MSG_TXN_BEGIN_GETGXID_MULTI: + ProcessBeginTransactionGetGXIDCommandMulti(myport, message); + break; + + case MSG_TXN_PREPARE: + ProcessPrepareTransactionCommand(myport, message); + break; + + case MSG_TXN_COMMIT: + ProcessCommitTransactionCommand(myport, message); + break; + + case MSG_TXN_ROLLBACK: + ProcessRollbackTransactionCommand(myport, message); + break; + + case MSG_TXN_COMMIT_MULTI: + ProcessCommitTransactionCommandMulti(myport, message); + break; + + case MSG_TXN_ROLLBACK_MULTI: + ProcessRollbackTransactionCommandMulti(myport, message); + break; + + case MSG_TXN_GET_GXID: + ProcessGetGXIDTransactionCommand(myport, message); + break; + + default: + Assert(0); /* Shouldn't come here.. keep compiler quite */ + } +} + +static void +ProcessSnapshotCommand(Port *myport, GTM_MessageType mtype, StringInfo message) +{ + switch (mtype) + { + case MSG_SNAPSHOT_GET: + ProcessGetSnapshotCommand(myport, message, false); + break; + + case MSG_SNAPSHOT_GET_MULTI: + ProcessGetSnapshotCommandMulti(myport, message); + break; + + case MSG_SNAPSHOT_GXID_GET: + ProcessGetSnapshotCommand(myport, message, true); + break; + + default: + Assert(0); /* Shouldn't come here.. 
keep compiler quite */ + } + +} + +static void +ProcessSeqeunceCommand(Port *myport, GTM_MessageType mtype, StringInfo message) +{ + switch (mtype) + { + case MSG_SEQUENCE_INIT: + ProcessSequenceInitCommand(myport, message); + break; + + case MSG_SEQUENCE_GET_CURRENT: + ProcessSequenceGetCurrentCommand(myport, message); + break; + + case MSG_SEQUENCE_GET_NEXT: + ProcessSequenceGetNextCommand(myport, message); + break; + + case MSG_SEQUENCE_RESET: + ProcessSequenceResetCommand(myport, message); + break; + + case MSG_SEQUENCE_CLOSE: + ProcessSequenceCloseCommand(myport, message); + break; + + default: + Assert(0); /* Shouldn't come here.. keep compiler quite */ + } + +} + +static void +ProcessQueryCommand(Port *myport, GTM_MessageType mtype, StringInfo message) +{ + switch (mtype) + { + case MSG_TXN_GET_STATUS: + case MSG_TXN_GET_ALL_PREPARED: + break; + + default: + Assert(0); /* Shouldn't come here.. keep compiler quite */ + } + +} + +static void +GTM_RegisterCoordinator(Port *myport, GTM_CoordinatorId cid) +{ + elog(DEBUG3, "Registering coordinator with cid %d", cid); + myport->coordinator_id = cid; +} + + +static void +GTM_UnregisterCoordinator(Port *myport, GTM_CoordinatorId cid) +{ + /* + * Do a clean shutdown + */ + return; +} + +/* + * Validate the proposed data directory + */ +static void +checkDataDir(void) +{ + struct stat stat_buf; + + Assert(GTMDataDir); + +retry: + if (stat(GTMDataDir, &stat_buf) != 0) + { + if (errno == ENOENT) + { + if (mkdir(GTMDataDir, 0700) != 0) + { + ereport(FATAL, + (errno, + errmsg("failed to create the directory \"%s\"", + GTMDataDir))); + } + goto retry; + } + else + ereport(FATAL, + (EPERM, + errmsg("could not read permissions of directory \"%s\": %m", + GTMDataDir))); + } + + /* eventual chdir would fail anyway, but let's test ... 
*/ + if (!S_ISDIR(stat_buf.st_mode)) + ereport(FATAL, + (EINVAL, + errmsg("specified data directory \"%s\" is not a directory", + GTMDataDir))); + + /* + * Check that the directory belongs to my userid; if not, reject. + * + * This check is an essential part of the interlock that prevents two + * postmasters from starting in the same directory (see CreateLockFile()). + * Do not remove or weaken it. + * + * XXX can we safely enable this check on Windows? + */ +#if !defined(WIN32) && !defined(__CYGWIN__) + if (stat_buf.st_uid != geteuid()) + ereport(FATAL, + (EINVAL, + errmsg("data directory \"%s\" has wrong ownership", + GTMDataDir), + errhint("The server must be started by the user that owns the data directory."))); +#endif +} + +/* + * Change working directory to DataDir. Most of the postmaster and backend + * code assumes that we are in DataDir so it can use relative paths to access + * stuff in and under the data directory. For convenience during path + * setup, however, we don't force the chdir to occur during SetDataDir. + */ +static void +ChangeToDataDir(void) +{ + if (chdir(GTMDataDir) < 0) + ereport(FATAL, + (EINVAL, + errmsg("could not change directory to \"%s\": %m", + GTMDataDir))); +} + +/* + * Create the data directory lockfile. + * + * When this is called, we must have already switched the working + * directory to DataDir, so we can just use a relative path. This + * helps ensure that we are locking the directory we should be. + */ +static void +CreateDataDirLockFile() +{ + CreateLockFile(GTM_PID_FILE, GTMDataDir); +} + +/* + * Create a lockfile. + * + * filename is the name of the lockfile to create. + * amPostmaster is used to determine how to encode the output PID. + * isDDLock and refName are used to determine what error message to produce. 
+ */ +static void +CreateLockFile(const char *filename, const char *refName) +{ + int fd; + char buffer[MAXPGPATH + 100]; + int ntries; + int len; + int encoded_pid; + pid_t other_pid; + pid_t my_pid = getpid(); + + /* + * We need a loop here because of race conditions. But don't loop forever + * (for example, a non-writable $PGDATA directory might cause a failure + * that won't go away). 100 tries seems like plenty. + */ + for (ntries = 0;; ntries++) + { + /* + * Try to create the lock file --- O_EXCL makes this atomic. + * + * Think not to make the file protection weaker than 0600. See + * comments below. + */ + fd = open(filename, O_RDWR | O_CREAT | O_EXCL, 0600); + if (fd >= 0) + break; /* Success; exit the retry loop */ + + /* + * Couldn't create the pid file. Probably it already exists. + */ + if ((errno != EEXIST && errno != EACCES) || ntries > 100) + ereport(FATAL, + (EINVAL, + errmsg("could not create lock file \"%s\": %m", + filename))); + + /* + * Read the file to get the old owner's PID. Note race condition + * here: file might have been deleted since we tried to create it. + */ + fd = open(filename, O_RDONLY, 0600); + if (fd < 0) + { + if (errno == ENOENT) + continue; /* race condition; try again */ + ereport(FATAL, + (EINVAL, + errmsg("could not open lock file \"%s\": %m", + filename))); + } + if ((len = read(fd, buffer, sizeof(buffer) - 1)) < 0) + ereport(FATAL, + (EINVAL, + errmsg("could not read lock file \"%s\": %m", + filename))); + close(fd); + + buffer[len] = '\0'; + encoded_pid = atoi(buffer); + other_pid = (pid_t) encoded_pid; + + if (other_pid <= 0) + elog(FATAL, "bogus data in lock file \"%s\": \"%s\"", + filename, buffer); + + /* + * Check to see if the other process still exists + * + * If the PID in the lockfile is our own PID or our parent's PID, then + * the file must be stale (probably left over from a previous system + * boot cycle). 
We need this test because of the likelihood that a + * reboot will assign exactly the same PID as we had in the previous + * reboot. Also, if there is just one more process launch in this + * reboot than in the previous one, the lockfile might mention our + * parent's PID. We can reject that since we'd never be launched + * directly by a competing postmaster. We can't detect grandparent + * processes unfortunately, but if the init script is written + * carefully then all but the immediate parent shell will be + * root-owned processes and so the kill test will fail with EPERM. + * + * We can treat the EPERM-error case as okay because that error + * implies that the existing process has a different userid than we + * do, which means it cannot be a competing postmaster. A postmaster + * cannot successfully attach to a data directory owned by a userid + * other than its own. (This is now checked directly in + * checkDataDir(), but has been true for a long time because of the + * restriction that the data directory isn't group- or + * world-accessible.) Also, since we create the lockfiles mode 600, + * we'd have failed above if the lockfile belonged to another userid + * --- which means that whatever process kill() is reporting about + * isn't the one that made the lockfile. (NOTE: this last + * consideration is the only one that keeps us from blowing away a + * Unix socket file belonging to an instance of Postgres being run by + * someone else, at least on machines where /tmp hasn't got a + * stickybit.) + * + * Windows hasn't got getppid(), but doesn't need it since it's not + * using real kill() either... + * + * Normally kill() will fail with ESRCH if the given PID doesn't + * exist. 
+ */ + if (other_pid != my_pid +#ifndef WIN32 + && other_pid != getppid() +#endif + ) + { + if (kill(other_pid, 0) == 0 || + (errno != ESRCH && errno != EPERM)) + { + /* lockfile belongs to a live process */ + ereport(FATAL, + (EINVAL, + errmsg("lock file \"%s\" already exists", + filename), + errhint("Is another GTM (PID %d) running in data directory \"%s\"?", + (int) other_pid, refName))); + } + } + + /* + * Looks like nobody's home. Unlink the file and try again to create + * it. Need a loop because of possible race condition against other + * would-be creators. + */ + if (unlink(filename) < 0) + ereport(FATAL, + (EACCES, + errmsg("could not remove old lock file \"%s\": %m", + filename), + errhint("The file seems accidentally left over, but " + "it could not be removed. Please remove the file " + "by hand and try again."))); + } + + /* + * Successfully created the file, now fill it. + */ + snprintf(buffer, sizeof(buffer), "%d\n%s\n", + (int) my_pid, GTMDataDir); + errno = 0; + if (write(fd, buffer, strlen(buffer)) != strlen(buffer)) + { + int save_errno = errno; + + close(fd); + unlink(filename); + /* if write didn't set errno, assume problem is no disk space */ + errno = save_errno ? 
save_errno : ENOSPC; + ereport(FATAL, + (EACCES, + errmsg("could not write lock file \"%s\": %m", filename))); + } + if (close(fd)) + { + int save_errno = errno; + + unlink(filename); + errno = save_errno; + ereport(FATAL, + (EACCES, + errmsg("could not write lock file \"%s\": %m", filename))); + } +} + +/* + * Create the opts file + */ +static bool +CreateOptsFile(int argc, char *argv[]) +{ + FILE *fp; + int i; + +#define OPTS_FILE "gtm.opts" + + if ((fp = fopen(OPTS_FILE, "w")) == NULL) + { + elog(LOG, "could not create file \"%s\": %m", OPTS_FILE); + return false; + } + + for (i = 1; i < argc; i++) + fprintf(fp, " \"%s\"", argv[i]); + fputs("\n", fp); + + if (fclose(fp)) + { + elog(LOG, "could not write file \"%s\": %m", OPTS_FILE); + return false; + } + + return true; +} + +/* delete pid file */ +static void +DeleteLockFile(const char *filename) +{ + if (unlink(filename) < 0) + ereport(FATAL, + (EACCES, + errmsg("could not remove old lock file \"%s\": %m", + filename), + errhint("The file seems accidentally left over, but " + "it could not be removed. Please remove the file " + "by hand and try again."))); +} diff --git a/src/gtm/path/Makefile b/src/gtm/path/Makefile new file mode 100644 index 0000000000..802ae3b9f9 --- /dev/null +++ b/src/gtm/path/Makefile @@ -0,0 +1,21 @@ +top_build_dir=../.. 
+include $(top_build_dir)/gtm/Makefile.global + +NAME=gtmpath +SO_MAJOR_VERSION= 1 +SO_MINOR_VERSION= 0 + +OBJS=path.o + +all:all-lib + +include $(top_build_dir)/Makefile.shlib + +clean: + rm -f $(OBJS) + rm -f libgtmpath.so libgtmpath.so.1 libgtmpath.so.1.0 + +distclean: clean + +maintainer-clean: distclean + diff --git a/src/gtm/path/path.c b/src/gtm/path/path.c new file mode 100644 index 0000000000..ea0eb6dbf2 --- /dev/null +++ b/src/gtm/path/path.c @@ -0,0 +1,177 @@ +/*------------------------------------------------------------------------- + * + * path.c + * portable path handling routines + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * + * IDENTIFICATION + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ + +#include "gtm/gtm_c.h" + +#include <ctype.h> +#include <sys/stat.h> +#include <string.h> +#include <stdio.h> + +#include <gtm/path.h> + +#define IS_DIR_SEP(ch) ((ch) == '/' || (ch) == '\\') + +#define skip_drive(path) (path) + +static void trim_directory(char *path); +static void trim_trailing_separator(char *path); + +/* + * Clean up path by: + * o remove trailing slash + * o remove duplicate adjacent separators + * o remove trailing '.' + * o process trailing '..' ourselves + */ +void +canonicalize_path(char *path) +{ + char *p, + *to_p; + char *spath; + bool was_sep = false; + int pending_strips; + + /* + * Removing the trailing slash on a path means we never get ugly double + * trailing slashes. Also, Win32 can't stat() a directory with a trailing + * slash. Don't remove a leading slash, though. 
+ */ + trim_trailing_separator(path); + + /* + * Remove duplicate adjacent separators + */ + p = path; + + to_p = p; + for (; *p; p++, to_p++) + { + /* Handle many adjacent slashes, like "/a///b" */ + while (*p == '/' && was_sep) + p++; + if (to_p != p) + *to_p = *p; + was_sep = (*p == '/'); + } + *to_p = '\0'; + + /* + * Remove any trailing uses of "." and process ".." ourselves + * + * Note that "/../.." should reduce to just "/", while "../.." has to be + * kept as-is. In the latter case we put back mistakenly trimmed ".." + * components below. Also note that we want a Windows drive spec to be + * visible to trim_directory(), but it's not part of the logic that's + * looking at the name components; hence distinction between path and + * spath. + */ + spath = skip_drive(path); + pending_strips = 0; + for (;;) + { + int len = strlen(spath); + + if (len >= 2 && strcmp(spath + len - 2, "/.") == 0) + trim_directory(path); + else if (strcmp(spath, ".") == 0) + { + /* Want to leave "." alone, but "./.." has to become ".." */ + if (pending_strips > 0) + *spath = '\0'; + break; + } + else if ((len >= 3 && strcmp(spath + len - 3, "/..") == 0) || + strcmp(spath, "..") == 0) + { + trim_directory(path); + pending_strips++; + } + else if (pending_strips > 0 && *spath != '\0') + { + /* trim a regular directory name cancelled by ".." */ + trim_directory(path); + pending_strips--; + /* foo/.. should become ".", not empty */ + if (*spath == '\0') + strcpy(spath, "."); + } + else + break; + } + + if (pending_strips > 0) + { + /* + * We could only get here if path is now totally empty (other than a + * possible drive specifier on Windows). We have to put back one or + * more ".."'s that we took off. 
+ */ + while (--pending_strips > 0) + strcat(path, "../"); + strcat(path, ".."); + } +} + +/* + * trim_directory + * + * Trim trailing directory from path, that is, remove any trailing slashes, + * the last pathname component, and the slash just ahead of it --- but never + * remove a leading slash. + */ +static void +trim_directory(char *path) +{ + char *p; + + path = skip_drive(path); + + if (path[0] == '\0') + return; + + /* back up over trailing slash(es) */ + for (p = path + strlen(path) - 1; IS_DIR_SEP(*p) && p > path; p--) + ; + /* back up over directory name */ + for (; !IS_DIR_SEP(*p) && p > path; p--) + ; + /* if multiple slashes before directory name, remove 'em all */ + for (; p > path && IS_DIR_SEP(*(p - 1)); p--) + ; + /* don't erase a leading slash */ + if (p == path && IS_DIR_SEP(*p)) + p++; + *p = '\0'; +} + +/* + * trim_trailing_separator + * + * trim off trailing slashes, but not a leading slash + */ +static void +trim_trailing_separator(char *path) +{ + char *p; + + path = skip_drive(path); + p = path + strlen(path); + if (p > path) + for (p--; p > path && IS_DIR_SEP(*p); p--) + *p = '\0'; +} diff --git a/src/gtm/proxy/Makefile b/src/gtm/proxy/Makefile new file mode 100644 index 0000000000..3ed6ccce13 --- /dev/null +++ b/src/gtm/proxy/Makefile @@ -0,0 +1,22 @@ +# Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + +top_build_dir=../.. 
+include $(top_build_dir)/gtm/Makefile.global
+
+OBJS=proxy_main.o proxy_thread.o ../common/libgtm.a ../libpq/libpqcomm.a ../client/libgtmclient.a ../path/libgtmpath.a
+LDFLAGS=-L$(top_build_dir)/common -L$(top_build_dir)/libpq
+
+LIBS=-lpthread
+
+# These targets are commands, not files; keep them working even if a file
+# with the same name appears in this directory.
+.PHONY: all clean distclean maintainer-clean
+
+# Link the proxy. Libraries are listed after the objects and archives that
+# reference them: the linker resolves symbols left to right, so a -l option
+# placed first can be discarded before anything needs it.
+gtm_proxy:$(OBJS)
+	$(CC) $(CFLAGS) $(LDFLAGS) $^ $(LIBS) -o $@
+
+all:gtm_proxy
+
+clean:
+	rm -f $(OBJS)
+	rm -f gtm_proxy
+
+distclean: clean
+
+maintainer-clean: distclean
diff --git a/src/gtm/proxy/proxy_main.c b/src/gtm/proxy/proxy_main.c
new file mode 100644
index 0000000000..75c7baf063
--- /dev/null
+++ b/src/gtm/proxy/proxy_main.c
@@ -0,0 +1,2016 @@
+/*-------------------------------------------------------------------------
+ *
+ * proxy_main.c
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <time.h>
+#include <unistd.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <sys/select.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <getopt.h>
+
+#include "gtm/gtm_c.h"
+#include "gtm/gtm_proxy.h"
+#include "gtm/elog.h"
+#include "gtm/memutils.h"
+#include "gtm/gtm_list.h"
+#include "gtm/libpq.h"
+#include "gtm/libpq-be.h"
+#include "gtm/libpq-fe.h"
+#include "gtm/pqsignal.h"
+#include "gtm/pqformat.h"
+#include "gtm/assert.h"
+#include "gtm/gtm_txn.h"
+#include "gtm/gtm_seq.h"
+#include "gtm/gtm_msg.h"
+#include "gtm/libpq-int.h"
+
+extern int	optind;
+extern char *optarg;
+
+#define GTM_MAX_PATH			1024
+#define GTM_PROXY_DEFAULT_HOSTNAME	"*"
+#define GTM_PROXY_DEFAULT_PORT		6666
+#define GTM_PROXY_DEFAULT_WORKERS	2
+#define GTM_PID_FILE			"gtm_proxy.pid"
+#define GTM_LOG_FILE			"gtm_proxy.log"
+
+static char *progname = "gtm_proxy";
+char
*ListenAddresses; +int GTMProxyPortNumber; +int GTMProxyWorkerThreads; +char *GTMProxyDataDir; + +char *GTMServerHost; +int GTMServerPortNumber; + +/* The socket(s) we're listening to. */ +#define MAXLISTEN 64 +static int ListenSocket[MAXLISTEN]; + +pthread_key_t threadinfo_key; +static bool GTMProxyAbortPending = false; + +static Port *ConnCreate(int serverFd); +static void ConnFree(Port *conn); +static int ServerLoop(void); +static int initMasks(fd_set *rmask); +void *GTMProxy_ThreadMain(void *argp); +static int GTMProxyAddConnection(Port *port); +static int ReadCommand(GTMProxy_ConnectionInfo *conninfo, StringInfo inBuf); +static void GTMProxy_HandshakeConnection(GTMProxy_ConnectionInfo *conninfo); +static void GTMProxy_HandleDisconnect(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn); + +static void GTMProxy_ProxyCommand(GTMProxy_ConnectionInfo *conninfo, + GTM_Conn *gtm_conn, GTM_MessageType mtype, StringInfo message); + +static void ProcessCommand(GTMProxy_ConnectionInfo *conninfo, + GTM_Conn *gtm_conn, StringInfo input_message); +static void ProcessCoordinatorCommand(GTMProxy_ConnectionInfo *conninfo, + GTM_Conn *gtm_conn, GTM_MessageType mtype, StringInfo message); +static void ProcessTransactionCommand(GTMProxy_ConnectionInfo *conninfo, + GTM_Conn *gtm_conn, GTM_MessageType mtype, StringInfo message); +static void ProcessSnapshotCommand(GTMProxy_ConnectionInfo *conninfo, + GTM_Conn *gtm_conn, GTM_MessageType mtype, StringInfo message); +static void ProcessSeqeunceCommand(GTMProxy_ConnectionInfo *conninfo, + GTM_Conn *gtm_conn, GTM_MessageType mtype, StringInfo message); + +static void GTMProxy_RegisterCoordinator(GTMProxy_ConnectionInfo *conninfo, + GTM_CoordinatorId coordinator_id); +static void GTMProxy_UnregisterCoordinator(GTMProxy_ConnectionInfo *conninfo, + GTM_CoordinatorId coordinator_id); + +static void ProcessResponse(GTMProxy_ThreadInfo *thrinfo, + GTMProxy_CommandInfo *cmdinfo, GTM_Result *res); + +static void 
GTMProxy_ProcessPendingCommands(GTMProxy_ThreadInfo *thrinfo); +static void GTMProxy_CommandPending(GTMProxy_ConnectionInfo *conninfo, + GTM_MessageType mtype, GTMProxy_CommandData cmd_data); + +static bool CreateOptsFile(int argc, char *argv[]); +static void CreateDataDirLockFile(void); +static void CreateLockFile(const char *filename, const char *refName); +static void ChangeToDataDir(void); +static void checkDataDir(void); +static void DeleteLockFile(const char *filename); + +/* + * One-time initialization. It's called immediately after the main process + * starts + */ +static GTMProxy_ThreadInfo * +MainThreadInit() +{ + GTMProxy_ThreadInfo *thrinfo; + + pthread_key_create(&threadinfo_key, NULL); + + /* + * Initialize the lock protecting the global threads info + */ + GTM_RWLockInit(>MProxyThreads->gt_lock); + + /* + * We are called even before memory context management is setup. We must + * use malloc + */ + thrinfo = (GTMProxy_ThreadInfo *)malloc(sizeof (GTMProxy_ThreadInfo)); + + if (thrinfo == NULL) + { + fprintf(stderr, "malloc failed: %d", errno); + fflush(stdout); + fflush(stderr); + } + + if (SetMyThreadInfo(thrinfo)) + { + fprintf(stderr, "SetMyThreadInfo failed: %d", errno); + fflush(stdout); + fflush(stderr); + } + + return thrinfo; +} + +static void +BaseInit() +{ + GTMProxy_ThreadInfo *thrinfo; + + thrinfo = MainThreadInit(); + + MyThreadID = pthread_self(); + + MemoryContextInit(); + + checkDataDir(); + ChangeToDataDir(); + CreateDataDirLockFile(); + + if (GTMLogFile == NULL) + { + GTMLogFile = (char *) malloc(GTM_MAX_PATH); + sprintf(GTMLogFile, "%s/%s", GTMProxyDataDir, GTM_LOG_FILE); + } + + DebugFileOpen(); + + /* + * The memory context is now set up. 
+ * Add the thrinfo structure in the global array + */ + if (GTMProxy_ThreadAdd(thrinfo) == -1) + { + fprintf(stderr, "GTMProxy_ThreadAdd for main thread failed: %d", errno); + fflush(stdout); + fflush(stderr); + } +} + +static void +GTMProxy_SigleHandler(int signal) +{ + fprintf(stderr, "Received signal %d", signal); + + switch (signal) + { + case SIGKILL: + case SIGTERM: + case SIGQUIT: + case SIGINT: + case SIGHUP: + break; + + default: + fprintf(stderr, "Unknown signal %d\n", signal); + return; + } + + /* + * XXX We should do a clean shutdown here. + */ + /* Delete pid file before shutting down */ + DeleteLockFile(GTM_PID_FILE); + + PG_SETMASK(&BlockSig); + GTMProxyAbortPending = true; + + return; +} + +/* + * Help display should match + */ +static void +help(const char *progname) +{ + printf(_("This is the GTM proxy.\n\n")); + printf(_("Usage:\n %s [OPTION]...\n\n"), progname); + printf(_("Options:\n")); + printf(_(" -h hostname GTM proxy hostname/IP\n")); + printf(_(" -p port GTM proxy port number\n")); + printf(_(" -s hostname GTM server hostname/IP \n")); + printf(_(" -t port GTM server port number\n")); + printf(_(" -n count Number of worker threads\n")); + printf(_(" -D directory GTM proxy working directory\n")); + printf(_(" -l filename GTM proxy log file name \n")); + printf(_(" --help show this help, then exit\n")); +} + + +int +main(int argc, char *argv[]) +{ + int opt; + int status; + int i; + + /* + * Catch standard options before doing much else + */ + if (argc > 1) + { + if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) + { + help(argv[0]); + exit(0); + } + } + + ListenAddresses = GTM_PROXY_DEFAULT_HOSTNAME; + GTMProxyPortNumber = GTM_PROXY_DEFAULT_PORT; + GTMProxyWorkerThreads = GTM_PROXY_DEFAULT_WORKERS; + + /* + * Parse the command like options and set variables + */ + while ((opt = getopt(argc, argv, "h:p:n:D:l:s:t:")) != -1) + { + switch (opt) + { + case 'h': + /* Listen address of the proxy */ + ListenAddresses = 
strdup(optarg); + break; + + case 'p': + /* Port number for the proxy to listen on */ + GTMProxyPortNumber = atoi(optarg); + break; + + case 'n': + /* Number of worker threads */ + GTMProxyWorkerThreads = atoi(optarg); + break; + + case 'D': + GTMProxyDataDir = strdup(optarg); + canonicalize_path(GTMProxyDataDir); + break; + + case 'l': + /* The log file */ + GTMLogFile = strdup(optarg); + break; + + case 's': + /* GTM server host name */ + GTMServerHost = strdup(optarg); + break; + + case 't': + /* GTM server port number */ + GTMServerPortNumber = atoi(optarg); + break; + + default: + write_stderr("Try \"%s --help\" for more information.\n", + progname); + } + } + + if (GTMProxyDataDir == NULL) + { + write_stderr("GTM Proxy data directory must be specified\n"); + write_stderr("Try \"%s --help\" for more information.\n", + progname); + exit(1); + } + /* + * GTM accepts no non-option switch arguments. + */ + if (optind < argc) + { + write_stderr("%s: invalid argument: \"%s\"\n", + progname, argv[optind]); + write_stderr("Try \"%s --help\" for more information.\n", + progname); + exit(1); + } + + /* + * Some basic initialization must happen before we do anything + * useful + */ + BaseInit(); + + elog(DEBUG3, "Starting GTM proxy at (%s:%d)", ListenAddresses, GTMProxyPortNumber); + + /* + * Establish input sockets. + */ + for (i = 0; i < MAXLISTEN; i++) + ListenSocket[i] = -1; + + if (ListenAddresses) + { + int success = 0; + + status = StreamServerPort(AF_UNSPEC, ListenAddresses, + (unsigned short) GTMProxyPortNumber, + ListenSocket, MAXLISTEN); + if (status == STATUS_OK) + success++; + else + ereport(FATAL, + (errmsg("could not create listen socket for \"%s\"", + ListenAddresses))); + } + + /* + * check that we have some socket to listen on + */ + if (ListenSocket[0] == -1) + ereport(FATAL, + (errmsg("no socket created for listening"))); + + /* + * Record gtm proxy options. 
We delay this till now to avoid recording + * bogus options + */ + if (!CreateOptsFile(argc, argv)) + exit(1); + + pqsignal(SIGHUP, GTMProxy_SigleHandler); + pqsignal(SIGKILL, GTMProxy_SigleHandler); + pqsignal(SIGQUIT, GTMProxy_SigleHandler); + pqsignal(SIGTERM, GTMProxy_SigleHandler); + pqsignal(SIGINT, GTMProxy_SigleHandler); + + pqinitmask(); + + /* + * Pre-fork so many worker threads + */ + + for (i = 0; i < GTMProxyWorkerThreads; i++) + { + /* + * XXX Start the worker thread + */ + if (GTMProxy_ThreadCreate(GTMProxy_ThreadMain) == NULL) + { + elog(ERROR, "failed to create a new thread"); + return STATUS_ERROR; + } + } + + /* + * Accept any new connections. Add for each incoming connection to one of + * the pre-forked threads. + */ + status = ServerLoop(); + + /* + * ServerLoop probably shouldn't ever return, but if it does, close down. + */ + exit(status != STATUS_OK); + + return 0; /* not reached */ +} + +/* + * ConnCreate -- create a local connection data structure + */ +static Port * +ConnCreate(int serverFd) +{ + Port *port; + + if (!(port = (Port *) calloc(1, sizeof(Port)))) + { + ereport(LOG, + (ENOMEM, + errmsg("out of memory"))); + exit(1); + } + + if (StreamConnection(serverFd, port) != STATUS_OK) + { + if (port->sock >= 0) + StreamClose(port->sock); + ConnFree(port); + port = NULL; + } + + port->conn_id = InvalidGTMProxyConnID; + + return port; +} + +/* + * ConnFree -- free a local connection data structure + */ +static void +ConnFree(Port *conn) +{ + free(conn); +} + +/* + * Main idle loop of postmaster + */ +static int +ServerLoop(void) +{ + fd_set readmask; + int nSockets; + + nSockets = initMasks(&readmask); + + for (;;) + { + fd_set rmask; + int selres; + + /* + * Wait for a connection request to arrive. + * + * We wait at most one minute, to ensure that the other background + * tasks handled below get done even when no requests are arriving. 
+ */ + memcpy((char *) &rmask, (char *) &readmask, sizeof(fd_set)); + + PG_SETMASK(&UnBlockSig); + + if (GTMProxyAbortPending) + { + /* + * Tell everybody that we are shutting down + * + * !! TODO + */ + exit(1); + } + + { + /* must set timeout each time; some OSes change it! */ + struct timeval timeout; + + timeout.tv_sec = 60; + timeout.tv_usec = 0; + + selres = select(nSockets, &rmask, NULL, NULL, &timeout); + } + + /* + * Block all signals until we wait again. (This makes it safe for our + * signal handlers to do nontrivial work.) + */ + PG_SETMASK(&BlockSig); + + /* Now check the select() result */ + if (selres < 0) + { + if (errno != EINTR && errno != EWOULDBLOCK) + { + ereport(LOG, + (EACCES, + errmsg("select() failed in postmaster: %m"))); + return STATUS_ERROR; + } + } + + /* + * New connection pending on any of our sockets? If so, accept the + * connection and add it to one of the worker threads. + */ + if (selres > 0) + { + int i; + + for (i = 0; i < MAXLISTEN; i++) + { + if (ListenSocket[i] == -1) + break; + if (FD_ISSET(ListenSocket[i], &rmask)) + { + Port *port; + + port = ConnCreate(ListenSocket[i]); + if (port) + { + if (GTMProxyAddConnection(port) != STATUS_OK) + { + elog(ERROR, "Too many connections"); + StreamClose(port->sock); + ConnFree(port); + } + } + } + } + } + } +} + +/* + * Initialise the masks for select() for the ports we are listening on. + * Return the number of sockets to listen on. 
+ */ +static int +initMasks(fd_set *rmask) +{ + int maxsock = -1; + int i; + + FD_ZERO(rmask); + + for (i = 0; i < MAXLISTEN; i++) + { + int fd = ListenSocket[i]; + + if (fd == -1) + break; + FD_SET(fd, rmask); + if (fd > maxsock) + maxsock = fd; + } + + return maxsock + 1; +} + +/* + * The main worker thread routine + */ +void * +GTMProxy_ThreadMain(void *argp) +{ + GTMProxy_ThreadInfo *thrinfo = (GTMProxy_ThreadInfo *)argp; + int qtype; + StringInfoData input_message; + sigjmp_buf local_sigjmp_buf; + int32 saved_seqno = -1; + int ii, nrfds; + char gtm_connect_string[1024]; + + elog(DEBUG3, "Starting the connection helper thread"); + + + /* + * Create the memory context we will use in the main loop. + * + * MessageContext is reset once per iteration of the main loop, ie, upon + * completion of processing of each command message from the client. + * + * This context is thread-specific + */ + MessageContext = AllocSetContextCreate(TopMemoryContext, + "MessageContext", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE, + false); + + /* + * Set up connection with the GTM server + */ + sprintf(gtm_connect_string, "host=%s port=%d coordinator_id=1 proxy=1", + GTMServerHost, GTMServerPortNumber); + + thrinfo->thr_gtm_conn = PQconnectGTM(gtm_connect_string); + + if (thrinfo->thr_gtm_conn == NULL) + elog(FATAL, "GTM connection failed"); + + /* + * Get the input_message in the TopMemoryContext so that we don't need to + * free/palloc it for every incoming message. Unlike Postgres, we don't + * expect the incoming messages to be of arbitrary sizes + */ + + initStringInfo(&input_message); + + /* + * If an exception is encountered, processing resumes here so we abort the + * current transaction and start a new one. + * + * You might wonder why this isn't coded as an infinite loop around a + * PG_TRY construct. 
The reason is that this is the bottom of the + * exception stack, and so with PG_TRY there would be no exception handler + * in force at all during the CATCH part. By leaving the outermost setjmp + * always active, we have at least some chance of recovering from an error + * during error recovery. (If we get into an infinite loop thereby, it + * will soon be stopped by overflow of elog.c's internal state stack.) + */ + + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { + /* + * NOTE: if you are tempted to add more code in this if-block, + * consider the high probability that it should be in + * AbortTransaction() instead. The only stuff done directly here + * should be stuff that is guaranteed to apply *only* for outer-level + * error recovery, such as adjusting the FE/BE protocol status. + */ + + /* Report the error to the client and/or server log */ + if (thrinfo->thr_conn_count > 0) + { + for (ii = 0; ii < thrinfo->thr_conn_count; ii++) + { + GTMProxy_ConnectionInfo *conninfo = thrinfo->thr_all_conns[ii]; + /* + * Now clean up disconnected connections + */ + if (conninfo->con_disconnected) + { + GTMProxy_ThreadRemoveConnection(thrinfo, conninfo); + pfree(conninfo); + ii--; + } + else + { + /* + * Consume all the pending data on this connection and send + * error report + */ + if (conninfo->con_pending_msg != MSG_TYPE_INVALID) + { + conninfo->con_port->PqRecvPointer = conninfo->con_port->PqRecvLength = 0; + conninfo->con_pending_msg = MSG_TYPE_INVALID; + EmitErrorReport(conninfo->con_port); + } + } + } + } + else + EmitErrorReport(NULL); + + /* + * Now return to normal top-level context and clear ErrorContext for + * next time. 
+ */ + MemoryContextSwitchTo(TopMemoryContext); + FlushErrorState(); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + for (;;) + { + ListCell *elem = NULL; + GTM_Result *res = NULL; + + /* + * Release storage left over from prior query cycle, and create a new + * query input buffer in the cleared MessageContext. + */ + MemoryContextSwitchTo(MessageContext); + MemoryContextResetAndDeleteChildren(MessageContext); + + /* + * Just reset the input buffer to avoid repeated palloc/pfrees + * + * XXX We should consider resetting the MessageContext periodically to + * handle any memory leaks + */ + resetStringInfo(&input_message); + + /* + * Check if there are any changes to the connection array assigned to + * this thread. If so, we need to rebuild the fd array. + */ + GTM_MutexLockAcquire(&thrinfo->thr_lock); + if (saved_seqno != thrinfo->thr_seqno) + { + saved_seqno = thrinfo->thr_seqno; + + while (thrinfo->thr_conn_count <= 0) + { + /* + * No connections assigned to the thread. Wait for at least one + * connection to be assgined to us + */ + GTM_CVWait(&thrinfo->thr_cv, &thrinfo->thr_lock); + } + + memset(thrinfo->thr_poll_fds, 0, sizeof (thrinfo->thr_poll_fds)); + + /* + * Now grab all the open connections. We are holding the lock so no + * new connections can be added. + */ + for (ii = 0; ii < thrinfo->thr_conn_count; ii++) + { + GTMProxy_ConnectionInfo *conninfo = thrinfo->thr_all_conns[ii]; + + /* We detect if the connection has been dropped to avoid + * a segmentation fault. 
+ */ + if (conninfo->con_port == NULL) + { + conninfo->con_disconnected = true; + continue; + } + + /* + * If this is a newly added connection, complete the handshake + */ + if (!conninfo->con_authenticated) + GTMProxy_HandshakeConnection(conninfo); + + thrinfo->thr_poll_fds[ii].fd = conninfo->con_port->sock; + thrinfo->thr_poll_fds[ii].events = POLLIN; + thrinfo->thr_poll_fds[ii].revents = 0; + } + } + GTM_MutexLockRelease(&thrinfo->thr_lock); + + while (true) + { + nrfds = poll(thrinfo->thr_poll_fds, thrinfo->thr_conn_count, 1000); + + if (nrfds < 0) + { + if (errno == EINTR) + continue; + elog(FATAL, "poll returned with error %d", nrfds); + } + else + break; + } + + if (nrfds == 0) + continue; + + /* + * Initialize the lists + */ + thrinfo->thr_processed_commands = NIL; + memset(thrinfo->thr_pending_commands, 0, sizeof (thrinfo->thr_pending_commands)); + + /* + * Now, read command from each of the connections that has some data to + * be read. + */ + for (ii = 0; ii < thrinfo->thr_conn_count; ii++) + { + GTMProxy_ConnectionInfo *conninfo = thrinfo->thr_all_conns[ii]; + thrinfo->thr_conn = conninfo; + + if (thrinfo->thr_poll_fds[ii].revents & POLLHUP) + { + /* + * The fd has become invalid. The connection is broken. Add it + * to the remove_list and cleanup at the end of this round of + * cleanup. + */ + GTMProxy_HandleDisconnect(thrinfo->thr_conn, thrinfo->thr_gtm_conn); + continue; + } + + if (thrinfo->thr_poll_fds[ii].revents & POLLIN) + { + /* + * (3) read a command (loop blocks here) + */ + qtype = ReadCommand(thrinfo->thr_conn, &input_message); + + switch(qtype) + { + case 'C': + ProcessCommand(thrinfo->thr_conn, thrinfo->thr_gtm_conn, + &input_message); + break; + + case 'X': + case EOF: + /* + * Connection termination request + * + * Close the socket and remember the connection + * as disconnected. All such connections will be + * removed after the command processing is over. 
We + * can't remove it just yet because we pass the slot id + * to the server to quickly find the backend connection + * while processing proxied messages. + */ + GTMProxy_HandleDisconnect(thrinfo->thr_conn, thrinfo->thr_gtm_conn); + break; + default: + /* + * Also disconnect if protocol error + */ + GTMProxy_HandleDisconnect(thrinfo->thr_conn, thrinfo->thr_gtm_conn); + elog(ERROR, "Unexpected message, or client disconnected abruptly."); + break; + } + + } + } + + /* + * Ok. All the commands are processed. Commands which can be proxied + * directly have been already sent to the GTM server. Now, group the + * remaining commands, send them to the server and flush the data. + */ + GTMProxy_ProcessPendingCommands(thrinfo); + + /* + * Add a special marker to tell the GTM server that we are done with + * one round of messages and the GTM server should flush all the + * pending responses after seeing this message. + */ + if (gtmpqPutMsgStart('F', true, thrinfo->thr_gtm_conn) || + gtmpqPutInt(MSG_DATA_FLUSH, sizeof (GTM_MessageType), thrinfo->thr_gtm_conn) || + gtmpqPutMsgEnd(thrinfo->thr_gtm_conn)) + elog(ERROR, "Error sending flush message"); + + /* + * Make sure everything is on wire now + */ + gtmpqFlush(thrinfo->thr_gtm_conn); + + /* + * Read back the responses and put them on to the right backend + * connection. + */ + foreach(elem, thrinfo->thr_processed_commands) + { + GTMProxy_CommandInfo *cmdinfo = (GTMProxy_CommandInfo *)lfirst(elem); + + /* + * If this is a continuation of a multi-part command response, we + * don't need to read another result from the stream. The previous + * result contains our response and we should just read from it. 
+ */ + if (cmdinfo->ci_res_index == 0) + { + if ((res = GTMPQgetResult(thrinfo->thr_gtm_conn)) == NULL) + elog(ERROR, "GTMPQgetResult failed"); + } + + ProcessResponse(thrinfo, cmdinfo, res); + } + + list_free_deep(thrinfo->thr_processed_commands); + thrinfo->thr_processed_commands = NIL; + + /* + * Now clean up disconnected connections + */ + for (ii = 0; ii < thrinfo->thr_conn_count; ii++) + { + GTMProxy_ConnectionInfo *conninfo = thrinfo->thr_all_conns[ii]; + if (conninfo->con_disconnected) + { + GTMProxy_ThreadRemoveConnection(thrinfo, conninfo); + pfree(conninfo); + ii--; + } + } + } + + /* can't get here because the above loop never exits */ + Assert(false); + + return thrinfo; +} + +/* + * Add the accepted connection to the pool + */ +static int +GTMProxyAddConnection(Port *port) +{ + GTMProxy_ConnectionInfo *conninfo = NULL; + + conninfo = (GTMProxy_ConnectionInfo *)palloc0(sizeof (GTMProxy_ConnectionInfo)); + + if (conninfo == NULL) + { + ereport(ERROR, + (ENOMEM, + errmsg("Out of memory"))); + return STATUS_ERROR; + } + + elog(DEBUG3, "Started new connection"); + conninfo->con_port = port; + + /* + * Add the conninfo struct to the next worker thread in round-robin manner + */ + GTMProxy_ThreadAddConnection(conninfo); + + return STATUS_OK; +} + +void +ProcessCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn, + StringInfo input_message) +{ + GTM_MessageType mtype; + + mtype = pq_getmsgint(input_message, sizeof (GTM_MessageType)); + + switch (mtype) + { + case MSG_UNREGISTER_COORD: + ProcessCoordinatorCommand(conninfo, gtm_conn, mtype, input_message); + break; + + case MSG_TXN_BEGIN: + case MSG_TXN_BEGIN_GETGXID: + case MSG_TXN_BEGIN_GETGXID_AUTOVACUUM: + case MSG_TXN_PREPARE: + case MSG_TXN_COMMIT: + case MSG_TXN_ROLLBACK: + case MSG_TXN_GET_GXID: + ProcessTransactionCommand(conninfo, gtm_conn, mtype, input_message); + break; + + case MSG_SNAPSHOT_GET: + case MSG_SNAPSHOT_GXID_GET: + ProcessSnapshotCommand(conninfo, gtm_conn, mtype, 
input_message); + break; + + case MSG_SEQUENCE_INIT: + case MSG_SEQUENCE_GET_CURRENT: + case MSG_SEQUENCE_GET_NEXT: + case MSG_SEQUENCE_RESET: + case MSG_SEQUENCE_CLOSE: + ProcessSeqeunceCommand(conninfo, gtm_conn, mtype, input_message); + break; + + default: + ereport(FATAL, + (EPROTO, + errmsg("invalid frontend message type %d", + mtype))); + } + + conninfo->con_pending_msg = mtype; +} + +static void +ProcessResponse(GTMProxy_ThreadInfo *thrinfo, GTMProxy_CommandInfo *cmdinfo, + GTM_Result *res) +{ + StringInfoData buf; + GlobalTransactionId gxid; + + switch (cmdinfo->ci_mtype) + { + case MSG_TXN_BEGIN_GETGXID: + /* + * This is a grouped command. We send just the transaction count to + * the GTM server which responds back with the start GXID. We + * derive our GXID from the start GXID and the our position in the + * command queue + */ + if (res->gr_status == 0) + { + if (res->gr_type != TXN_BEGIN_GETGXID_MULTI_RESULT) + elog(ERROR, "Wrong result"); + if (cmdinfo->ci_res_index >= res->gr_resdata.grd_txn_get_multi.txn_count) + elog(ERROR, "Too few GXIDs"); + + gxid = res->gr_resdata.grd_txn_get_multi.start_gxid + cmdinfo->ci_res_index; + + /* Handle wraparound */ + if (gxid < res->gr_resdata.grd_txn_get_multi.start_gxid) + gxid += FirstNormalGlobalTransactionId; + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, TXN_BEGIN_GETGXID_RESULT, 4); + pq_sendbytes(&buf, (char *)&gxid, sizeof (GlobalTransactionId)); + pq_endmessage(cmdinfo->ci_conn->con_port, &buf); + pq_flush(cmdinfo->ci_conn->con_port); + } + else + { + pq_beginmessage(&buf, 'E'); + pq_sendbytes(&buf, res->gr_proxy_data, res->gr_msglen); + pq_endmessage(cmdinfo->ci_conn->con_port, &buf); + pq_flush(cmdinfo->ci_conn->con_port); + } + cmdinfo->ci_conn->con_pending_msg = MSG_TYPE_INVALID; + break; + + case MSG_TXN_COMMIT: + if (res->gr_type != TXN_COMMIT_MULTI_RESULT) + elog(ERROR, "Wrong result"); + /* + * These are grouped messages. 
We send an array of GXIDs to commit + * or rollback and the server sends us back an array of status + * codes. + */ + if (cmdinfo->ci_res_index >= res->gr_resdata.grd_txn_rc_multi.txn_count) + elog(ERROR, "Too few GXIDs"); + + if (res->gr_resdata.grd_txn_rc_multi.status[cmdinfo->ci_res_index] == STATUS_OK) + { + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, TXN_COMMIT_RESULT, 4); + pq_sendbytes(&buf, (char *)&cmdinfo->ci_data.cd_rc.gxid, sizeof (GlobalTransactionId)); + pq_endmessage(cmdinfo->ci_conn->con_port, &buf); + pq_flush(cmdinfo->ci_conn->con_port); + } + else + ereport(ERROR2, (EINVAL, errmsg("Transaction commit failed"))); + cmdinfo->ci_conn->con_pending_msg = MSG_TYPE_INVALID; + break; + + case MSG_TXN_ROLLBACK: + if (res->gr_type != TXN_ROLLBACK_MULTI_RESULT) + elog(ERROR, "Wrong result"); + /* + * These are grouped messages. We send an array of GXIDs to commit + * or rollback and the server sends us back an array of status + * codes. + */ + if (cmdinfo->ci_res_index >= res->gr_resdata.grd_txn_rc_multi.txn_count) + elog(ERROR, "Too few GXIDs"); + + if (res->gr_resdata.grd_txn_rc_multi.status[cmdinfo->ci_res_index] == STATUS_OK) + { + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, TXN_ROLLBACK_RESULT, 4); + pq_sendbytes(&buf, (char *)&cmdinfo->ci_data.cd_rc.gxid, sizeof (GlobalTransactionId)); + pq_endmessage(cmdinfo->ci_conn->con_port, &buf); + pq_flush(cmdinfo->ci_conn->con_port); + } + else + ereport(ERROR2, (EINVAL, errmsg("Transaction commit failed"))); + cmdinfo->ci_conn->con_pending_msg = MSG_TYPE_INVALID; + break; + + case MSG_SNAPSHOT_GET: + if ((res->gr_type != SNAPSHOT_GET_RESULT) && + (res->gr_type != SNAPSHOT_GET_MULTI_RESULT)) + elog(ERROR, "Wrong result"); + + if (cmdinfo->ci_res_index >= res->gr_resdata.grd_txn_snap_multi.txn_count) + elog(ERROR, "Too few GXIDs"); + + if (res->gr_resdata.grd_txn_snap_multi.status[cmdinfo->ci_res_index] == STATUS_OK) + { + int txn_count = 1; + int status = STATUS_OK; + + pq_beginmessage(&buf, 'S'); + 
pq_sendint(&buf, SNAPSHOT_GET_RESULT, 4); + pq_sendbytes(&buf, (char *)&cmdinfo->ci_data.cd_snap.gxid, sizeof (GlobalTransactionId)); + pq_sendbytes(&buf, (char *)&txn_count, sizeof (txn_count)); + pq_sendbytes(&buf, (char *)&status, sizeof (status)); + pq_sendbytes(&buf, (char *)&res->gr_snapshot.sn_xmin, sizeof (GlobalTransactionId)); + pq_sendbytes(&buf, (char *)&res->gr_snapshot.sn_xmax, sizeof (GlobalTransactionId)); + pq_sendbytes(&buf, (char *)&res->gr_snapshot.sn_recent_global_xmin, sizeof (GlobalTransactionId)); + pq_sendint(&buf, res->gr_snapshot.sn_xcnt, sizeof (int)); + pq_sendbytes(&buf, (char *)res->gr_snapshot.sn_xip, + sizeof(GlobalTransactionId) * res->gr_snapshot.sn_xcnt); + pq_endmessage(cmdinfo->ci_conn->con_port, &buf); + pq_flush(cmdinfo->ci_conn->con_port); + } + else + ereport(ERROR2, (EINVAL, errmsg("snapshot request failed"))); + cmdinfo->ci_conn->con_pending_msg = MSG_TYPE_INVALID; + break; + + case MSG_TXN_BEGIN: + case MSG_TXN_BEGIN_GETGXID_AUTOVACUUM: + case MSG_TXN_PREPARE: + case MSG_TXN_GET_GXID: + case MSG_SNAPSHOT_GXID_GET: + case MSG_SEQUENCE_INIT: + case MSG_SEQUENCE_GET_CURRENT: + case MSG_SEQUENCE_GET_NEXT: + case MSG_SEQUENCE_RESET: + case MSG_SEQUENCE_CLOSE: + if ((res->gr_proxyhdr.ph_conid == InvalidGTMProxyConnID) || + (res->gr_proxyhdr.ph_conid >= GTM_PROXY_MAX_CONNECTIONS) || + (thrinfo->thr_all_conns[res->gr_proxyhdr.ph_conid] != cmdinfo->ci_conn)) + elog(PANIC, "Invalid response or synchronization loss"); + + /* + * These are just proxied messages.. so just forward the response + * back after stripping the conid part. + * + * !!TODO As we start adding support for message grouping for + * messages, those message types would be removed from the above + * and handled separately. 
+ */ + switch (res->gr_status) + { + case 0: + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, res->gr_type, 4); + pq_sendbytes(&buf, res->gr_proxy_data, res->gr_msglen); + pq_endmessage(cmdinfo->ci_conn->con_port, &buf); + pq_flush(cmdinfo->ci_conn->con_port); + break; + + default: + pq_beginmessage(&buf, 'E'); + pq_sendbytes(&buf, res->gr_proxy_data, res->gr_msglen); + pq_endmessage(cmdinfo->ci_conn->con_port, &buf); + pq_flush(cmdinfo->ci_conn->con_port); + break; + } + cmdinfo->ci_conn->con_pending_msg = MSG_TYPE_INVALID; + break; + + default: + ereport(FATAL, + (EPROTO, + errmsg("invalid frontend message type %d", + cmdinfo->ci_mtype))); + } + + +} + +/* ---------------- + * ReadCommand reads a command from either the frontend or + * standard input, places it in inBuf, and returns the + * message type code (first byte of the message). + * EOF is returned if end of file. + * ---------------- + */ +static int +ReadCommand(GTMProxy_ConnectionInfo *conninfo, StringInfo inBuf) +{ + int qtype; + + /* + * Get message type code from the frontend. + */ + qtype = pq_getbyte(conninfo->con_port); + + if (qtype == EOF) /* frontend disconnected */ + { + ereport(COMMERROR, + (EPROTO, + errmsg("unexpected EOF on client connection"))); + return qtype; + } + + /* + * Validate message type code before trying to read body; if we have lost + * sync, better to say "command unknown" than to run out of memory because + * we used garbage as a length word. + * + * This also gives us a place to set the doing_extended_query_message flag + * as soon as possible. + */ + switch (qtype) + { + case 'C': + break; + + case 'X': + break; + + default: + + /* + * Otherwise we got garbage from the frontend. We treat this as + * fatal because we have probably lost message boundary sync, and + * there's no good way to recover. 
+ */ + ereport(ERROR, + (EPROTO, + errmsg("invalid frontend message type %d", qtype))); + + break; + } + + /* + * In protocol version 3, all frontend messages have a length word next + * after the type code; we can read the message contents independently of + * the type. + */ + if (pq_getmessage(conninfo->con_port, inBuf, 0)) + return EOF; /* suitable message already logged */ + + return qtype; +} + +static void +ProcessCoordinatorCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn, + GTM_MessageType mtype, StringInfo message) +{ + GTM_CoordinatorId cid; + + cid = pq_getmsgint(message, sizeof (GTM_CoordinatorId)); + + switch (mtype) + { + case MSG_UNREGISTER_COORD: + GTMProxy_UnregisterCoordinator(conninfo, cid); + break; + + default: + Assert(0); /* Shouldn't come here.. keep compiler quite */ + } + pq_getmsgend(message); +} + +static void +ProcessTransactionCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn, + GTM_MessageType mtype, StringInfo message) +{ + GTMProxy_CommandData cmd_data; + + switch (mtype) + { + case MSG_TXN_BEGIN_GETGXID: + cmd_data.cd_beg.iso_level = pq_getmsgint(message, sizeof (GTM_IsolationLevel)); + cmd_data.cd_beg.rdonly = pq_getmsgbyte(message); + GTMProxy_CommandPending(conninfo, mtype, cmd_data); + break; + + case MSG_TXN_COMMIT: + case MSG_TXN_ROLLBACK: + cmd_data.cd_rc.isgxid = pq_getmsgbyte(message); + if (cmd_data.cd_rc.isgxid) + { + const char *data = pq_getmsgbytes(message, + sizeof (GlobalTransactionId)); + if (data == NULL) + ereport(ERROR, + (EPROTO, + errmsg("Message does not contain valid GXID"))); + memcpy(&cmd_data.cd_rc.gxid, data, sizeof (GlobalTransactionId)); + } + else + { + const char *data = pq_getmsgbytes(message, + sizeof (GTM_TransactionHandle)); + if (data == NULL) + ereport(ERROR, + (EPROTO, + errmsg("Message does not contain valid Transaction Handle"))); + memcpy(&cmd_data.cd_rc.handle, data, sizeof (GTM_TransactionHandle)); + } + pq_getmsgend(message); + GTMProxy_CommandPending(conninfo, 
mtype, cmd_data); + break; + + case MSG_TXN_BEGIN: + case MSG_TXN_GET_GXID: + elog(FATAL, "Support not yet added for these message types"); + break; + + case MSG_TXN_BEGIN_GETGXID_AUTOVACUUM: + case MSG_TXN_PREPARE: + GTMProxy_ProxyCommand(conninfo, gtm_conn, mtype, message); + break; + + default: + Assert(0); /* Shouldn't come here.. keep compiler quite */ + } +} + +static void +ProcessSnapshotCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn, + GTM_MessageType mtype, StringInfo message) +{ + bool canbe_grouped = false; + GTMProxy_CommandData cmd_data; + + switch (mtype) + { + case MSG_SNAPSHOT_GET: + canbe_grouped = pq_getmsgbyte(message); + if (!canbe_grouped) + GTMProxy_ProxyCommand(conninfo, gtm_conn, mtype, message); + else + { + cmd_data.cd_snap.isgxid = pq_getmsgbyte(message); + if (cmd_data.cd_snap.isgxid) + { + const char *data = pq_getmsgbytes(message, + sizeof (GlobalTransactionId)); + if (data == NULL) + ereport(ERROR, + (EPROTO, + errmsg("Message does not contain valid GXID"))); + memcpy(&cmd_data.cd_snap.gxid, data, sizeof (GlobalTransactionId)); + } + else + { + const char *data = pq_getmsgbytes(message, + sizeof (GTM_TransactionHandle)); + if (data == NULL) + ereport(ERROR, + (EPROTO, + errmsg("Message does not contain valid Transaction Handle"))); + memcpy(&cmd_data.cd_snap.handle, data, sizeof (GTM_TransactionHandle)); + } + pq_getmsgend(message); + GTMProxy_CommandPending(conninfo, mtype, cmd_data); + } + break; + + case MSG_SNAPSHOT_GXID_GET: + elog(ERROR, "Message not yet support"); + break; + + default: + Assert(0); /* Shouldn't come here.. keep compiler quite */ + } + +} + +static void +ProcessSeqeunceCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn, + GTM_MessageType mtype, StringInfo message) +{ + /* + * We proxy the Sequence messages as they are. Just add the connection + * identifier to it so that the response can be quickly sent back to the + * right backend. 
+ * + * Write the message, but don't flush it just yet. + */ + return GTMProxy_ProxyCommand(conninfo, gtm_conn, mtype, message); +} + +/* + * Proxy the incoming message to the GTM server after adding our own identifier + * to it. The rest of the message is forwarded as it is without even reading + * its contents. + */ +static void +GTMProxy_ProxyCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn, + GTM_MessageType mtype, StringInfo message) +{ + GTMProxy_CommandInfo *cmdinfo; + GTMProxy_ThreadInfo *thrinfo = GetMyThreadInfo; + GTM_ProxyMsgHeader proxyhdr; + + proxyhdr.ph_conid = conninfo->con_id; + + /* Start the message. */ + if (gtmpqPutMsgStart('C', true, gtm_conn) || + gtmpqPutnchar((char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader), gtm_conn) || + gtmpqPutInt(mtype, sizeof (GTM_MessageType), gtm_conn) || + gtmpqPutnchar(pq_getmsgbytes(message, pq_getmsgunreadlen(message)), + pq_getmsgunreadlen(message), gtm_conn)) + elog(ERROR, "Error proxing data"); + + /* + * Add the message to the pending command list + */ + cmdinfo = palloc0(sizeof (GTMProxy_CommandInfo)); + cmdinfo->ci_mtype = mtype; + cmdinfo->ci_conn = conninfo; + cmdinfo->ci_res_index = 0; + thrinfo->thr_processed_commands = lappend(thrinfo->thr_processed_commands, cmdinfo); + + /* Finish the message. */ + if (gtmpqPutMsgEnd(gtm_conn)) + elog(ERROR, "Error finishing the message"); + + return; +} + + +/* + * Record the incoming message as per its type. After all messages of this type + * are collected, they will be sent in a single message to the GTM server. 
+ */ +static void +GTMProxy_CommandPending(GTMProxy_ConnectionInfo *conninfo, GTM_MessageType mtype, + GTMProxy_CommandData cmd_data) +{ + GTMProxy_CommandInfo *cmdinfo; + GTMProxy_ThreadInfo *thrinfo = GetMyThreadInfo; + + /* + * Add the message to the pending command list + */ + cmdinfo = palloc0(sizeof (GTMProxy_CommandInfo)); + cmdinfo->ci_mtype = mtype; + cmdinfo->ci_conn = conninfo; + cmdinfo->ci_res_index = 0; + cmdinfo->ci_data = cmd_data; + thrinfo->thr_pending_commands[mtype] = lappend(thrinfo->thr_pending_commands[mtype], cmdinfo); + + return; +} +static void +GTMProxy_RegisterCoordinator(GTMProxy_ConnectionInfo *conninfo, GTM_CoordinatorId cid) +{ + elog(DEBUG3, "Registering coordinator with cid %d", cid); + conninfo->con_port->coordinator_id = cid; +} + + +static void +GTMProxy_UnregisterCoordinator(GTMProxy_ConnectionInfo *conninfo, GTM_CoordinatorId cid) +{ + /* + * Do a clean shutdown + */ + return; +} + + +static void +GTMProxy_HandshakeConnection(GTMProxy_ConnectionInfo *conninfo) +{ + /* + * We expect a startup message at the very start. The message type is + * REGISTER_COORD, followed by the 4 byte coordinator ID + */ + char startup_type; + GTM_StartupPacket sp; + StringInfoData inBuf; + StringInfoData buf; + + startup_type = pq_getbyte(conninfo->con_port); + + if (startup_type != 'A') + ereport(ERROR, + (EPROTO, + errmsg("Expecting a startup message, but received %c", + startup_type))); + + initStringInfo(&inBuf); + + /* + * All frontend messages have a length word next + * after the type code; we can read the message contents independently of + * the type. 
+ */ + if (pq_getmessage(conninfo->con_port, &inBuf, 0)) + ereport(ERROR, + (EPROTO, + errmsg("Expecting coordinator ID, but received EOF"))); + + memcpy(&sp, + pq_getmsgbytes(&inBuf, sizeof (GTM_StartupPacket)), + sizeof (GTM_StartupPacket)); + pq_getmsgend(&inBuf); + + GTMProxy_RegisterCoordinator(conninfo, sp.sp_cid); + + /* + * Send a dummy authentication request message 'R' as the client + * expects that in the current protocol + */ + pq_beginmessage(&buf, 'R'); + pq_endmessage(conninfo->con_port, &buf); + pq_flush(conninfo->con_port); + + conninfo->con_authenticated = true; + + elog(DEBUG3, "Sent connection authentication message to the client"); +} + +static void +GTMProxy_HandleDisconnect(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn) +{ + GTM_ProxyMsgHeader proxyhdr; + + conninfo->con_disconnected = true; + if (conninfo->con_port->sock > 0) + StreamClose(conninfo->con_port->sock); + ConnFree(conninfo->con_port); + conninfo->con_port = NULL; + + proxyhdr.ph_conid = conninfo->con_id; + + /* Start the message. */ + if (gtmpqPutMsgStart('C', true, gtm_conn) || + gtmpqPutnchar((char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader), gtm_conn) || + gtmpqPutInt(MSG_BACKEND_DISCONNECT, sizeof (GTM_MessageType), gtm_conn)) + elog(ERROR, "Error proxing data"); + + /* Finish the message. */ + if (gtmpqPutMsgEnd(gtm_conn)) + elog(ERROR, "Error finishing the message"); + + return; +} + +/* + * Process all the pending messages now.
+ */ +static void +GTMProxy_ProcessPendingCommands(GTMProxy_ThreadInfo *thrinfo) +{ + int ii; + GTMProxy_CommandInfo *cmdinfo = NULL; + GTM_ProxyMsgHeader proxyhdr; + GTM_Conn *gtm_conn = thrinfo->thr_gtm_conn; + ListCell *elem = NULL; + + for (ii = 0; ii < MSG_TYPE_COUNT; ii++) + { + int res_index = 0; + + if (list_length(thrinfo->thr_pending_commands[ii]) == 0) + continue; + + /* + * Start a new group message and fill in the headers + */ + proxyhdr.ph_conid = InvalidGTMProxyConnID; + + if (gtmpqPutMsgStart('C', true, gtm_conn) || + gtmpqPutnchar((char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader), gtm_conn)) + elog(ERROR, "Error proxing data"); + + switch (ii) + { + case MSG_TXN_BEGIN_GETGXID: + if (list_length(thrinfo->thr_pending_commands[ii]) <=0 ) + elog(PANIC, "No pending commands of type %d", ii); + + if (gtmpqPutInt(MSG_TXN_BEGIN_GETGXID_MULTI, sizeof (GTM_MessageType), gtm_conn) || + gtmpqPutInt(list_length(thrinfo->thr_pending_commands[ii]), sizeof(int), gtm_conn)) + elog(ERROR, "Error sending data"); + foreach (elem, thrinfo->thr_pending_commands[ii]) + { + cmdinfo = (GTMProxy_CommandInfo *)lfirst(elem); + Assert(cmdinfo->ci_mtype == ii); + cmdinfo->ci_res_index = res_index++; + if (gtmpqPutInt(cmdinfo->ci_data.cd_beg.iso_level, + sizeof (GTM_IsolationLevel), gtm_conn) || + gtmpqPutc(cmdinfo->ci_data.cd_beg.rdonly, gtm_conn) || + gtmpqPutInt(cmdinfo->ci_conn->con_id, sizeof (GTMProxy_ConnID), gtm_conn)) + elog(ERROR, "Error sending data"); + + } + + /* Finish the message. 
*/ + if (gtmpqPutMsgEnd(gtm_conn)) + elog(ERROR, "Error finishing the message"); + + /* + * Move the entire list to the processed command + */ + thrinfo->thr_processed_commands = list_concat(thrinfo->thr_processed_commands, + thrinfo->thr_pending_commands[ii]); + thrinfo->thr_pending_commands[ii] = NIL; + break; + + case MSG_TXN_COMMIT: + if (gtmpqPutInt(MSG_TXN_COMMIT_MULTI, sizeof (GTM_MessageType), gtm_conn) || + gtmpqPutInt(list_length(thrinfo->thr_pending_commands[ii]), sizeof(int), gtm_conn)) + elog(ERROR, "Error sending data"); + + foreach (elem, thrinfo->thr_pending_commands[ii]) + { + cmdinfo = (GTMProxy_CommandInfo *)lfirst(elem); + Assert(cmdinfo->ci_mtype == ii); + cmdinfo->ci_res_index = res_index++; + if (cmdinfo->ci_data.cd_rc.isgxid) + { + if (gtmpqPutc(true, gtm_conn) || + gtmpqPutnchar((char *)&cmdinfo->ci_data.cd_rc.gxid, + sizeof (GlobalTransactionId), gtm_conn)) + elog(ERROR, "Error sending data"); + } + else + { + if (gtmpqPutc(false, gtm_conn) || + gtmpqPutnchar((char *)&cmdinfo->ci_data.cd_rc.handle, + sizeof (GTM_TransactionHandle), gtm_conn)) + elog(ERROR, "Error sending data"); + } + } + + /* Finish the message. 
*/ + if (gtmpqPutMsgEnd(gtm_conn)) + elog(ERROR, "Error finishing the message"); + + /* + * Move the entire list to the processed command + */ + thrinfo->thr_processed_commands = list_concat(thrinfo->thr_processed_commands, + thrinfo->thr_pending_commands[ii]); + thrinfo->thr_pending_commands[ii] = NIL; + break; + + break; + + case MSG_TXN_ROLLBACK: + if (gtmpqPutInt(MSG_TXN_ROLLBACK_MULTI, sizeof (GTM_MessageType), gtm_conn) || + gtmpqPutInt(list_length(thrinfo->thr_pending_commands[ii]), sizeof(int), gtm_conn)) + elog(ERROR, "Error sending data"); + + foreach (elem, thrinfo->thr_pending_commands[ii]) + { + cmdinfo = (GTMProxy_CommandInfo *)lfirst(elem); + Assert(cmdinfo->ci_mtype == ii); + cmdinfo->ci_res_index = res_index++; + if (cmdinfo->ci_data.cd_rc.isgxid) + { + if (gtmpqPutc(true, gtm_conn) || + gtmpqPutnchar((char *)&cmdinfo->ci_data.cd_rc.gxid, + sizeof (GlobalTransactionId), gtm_conn)) + elog(ERROR, "Error sending data"); + } + else + { + if (gtmpqPutc(false, gtm_conn) || + gtmpqPutnchar((char *)&cmdinfo->ci_data.cd_rc.handle, + sizeof (GTM_TransactionHandle), gtm_conn)) + elog(ERROR, "Error sending data"); + } + } + + /* Finish the message. 
*/ + if (gtmpqPutMsgEnd(gtm_conn)) + elog(ERROR, "Error finishing the message"); + + + /* + * Move the entire list to the processed command + */ + thrinfo->thr_processed_commands = list_concat(thrinfo->thr_processed_commands, + thrinfo->thr_pending_commands[ii]); + thrinfo->thr_pending_commands[ii] = NIL; + break; + + case MSG_SNAPSHOT_GET: + if (gtmpqPutInt(MSG_SNAPSHOT_GET_MULTI, sizeof (GTM_MessageType), gtm_conn) || + gtmpqPutInt(list_length(thrinfo->thr_pending_commands[ii]), sizeof(int), gtm_conn)) + elog(ERROR, "Error sending data"); + + foreach (elem, thrinfo->thr_pending_commands[ii]) + { + cmdinfo = (GTMProxy_CommandInfo *)lfirst(elem); + Assert(cmdinfo->ci_mtype == ii); + cmdinfo->ci_res_index = res_index++; + if (cmdinfo->ci_data.cd_rc.isgxid) + { + if (gtmpqPutc(true, gtm_conn) || + gtmpqPutnchar((char *)&cmdinfo->ci_data.cd_rc.gxid, + sizeof (GlobalTransactionId), gtm_conn)) + elog(ERROR, "Error sending data"); + } + else + { + if (gtmpqPutc(false, gtm_conn) || + gtmpqPutnchar((char *)&cmdinfo->ci_data.cd_rc.handle, + sizeof (GTM_TransactionHandle), gtm_conn)) + elog(ERROR, "Error sending data"); + } + } + + /* Finish the message. 
*/ + if (gtmpqPutMsgEnd(gtm_conn)) + elog(ERROR, "Error finishing the message"); + + /* + * Move the entire list to the processed command + */ + thrinfo->thr_processed_commands = list_concat(thrinfo->thr_processed_commands, + thrinfo->thr_pending_commands[ii]); + thrinfo->thr_pending_commands[ii] = NIL; + break; + + + default: + elog(ERROR, "This message type (%d) can not be grouped together", ii); + } + + } +} + +/* + * Validate the proposed data directory + */ +static void +checkDataDir(void) +{ + struct stat stat_buf; + + Assert(GTMProxyDataDir); + +retry: + if (stat(GTMProxyDataDir, &stat_buf) != 0) + { + if (errno == ENOENT) + { + if (mkdir(GTMProxyDataDir, 0700) != 0) + { + ereport(FATAL, + (errno, + errmsg("failed to create the directory \"%s\"", + GTMProxyDataDir))); + } + goto retry; + } + else + ereport(FATAL, + (EPERM, + errmsg("could not read permissions of directory \"%s\": %m", + GTMProxyDataDir))); + } + + /* eventual chdir would fail anyway, but let's test ... */ + if (!S_ISDIR(stat_buf.st_mode)) + ereport(FATAL, + (EINVAL, + errmsg("specified data directory \"%s\" is not a directory", + GTMProxyDataDir))); + + /* + * Check that the directory belongs to my userid; if not, reject. + * + * This check is an essential part of the interlock that prevents two + * postmasters from starting in the same directory (see CreateLockFile()). + * Do not remove or weaken it. + * + * XXX can we safely enable this check on Windows? + */ +#if !defined(WIN32) && !defined(__CYGWIN__) + if (stat_buf.st_uid != geteuid()) + ereport(FATAL, + (EINVAL, + errmsg("data directory \"%s\" has wrong ownership", + GTMProxyDataDir), + errhint("The server must be started by the user that owns the data directory."))); +#endif +} + +/* + * Change working directory to DataDir. Most of the postmaster and backend + * code assumes that we are in DataDir so it can use relative paths to access + * stuff in and under the data directory. 
For convenience during path + * setup, however, we don't force the chdir to occur during SetDataDir. + */ +static void +ChangeToDataDir(void) +{ + if (chdir(GTMProxyDataDir) < 0) + ereport(FATAL, + (EINVAL, + errmsg("could not change directory to \"%s\": %m", + GTMProxyDataDir))); +} + +/* + * Create the data directory lockfile. + * + * When this is called, we must have already switched the working + * directory to DataDir, so we can just use a relative path. This + * helps ensure that we are locking the directory we should be. + */ +static void +CreateDataDirLockFile() +{ + CreateLockFile(GTM_PID_FILE, GTMProxyDataDir); +} + +/* + * Create a lockfile. + * + * filename is the name of the lockfile to create. + * amPostmaster is used to determine how to encode the output PID. + * isDDLock and refName are used to determine what error message to produce. + */ +static void +CreateLockFile(const char *filename, const char *refName) +{ + int fd; + char buffer[MAXPGPATH + 100]; + int ntries; + int len; + int encoded_pid; + pid_t other_pid; + pid_t my_pid = getpid(); + + /* + * We need a loop here because of race conditions. But don't loop forever + * (for example, a non-writable $PGDATA directory might cause a failure + * that won't go away). 100 tries seems like plenty. + */ + for (ntries = 0;; ntries++) + { + /* + * Try to create the lock file --- O_EXCL makes this atomic. + * + * Think not to make the file protection weaker than 0600. See + * comments below. + */ + fd = open(filename, O_RDWR | O_CREAT | O_EXCL, 0600); + if (fd >= 0) + break; /* Success; exit the retry loop */ + + /* + * Couldn't create the pid file. Probably it already exists. + */ + if ((errno != EEXIST && errno != EACCES) || ntries > 100) + ereport(FATAL, + (EINVAL, + errmsg("could not create lock file \"%s\": %m", + filename))); + + /* + * Read the file to get the old owner's PID. Note race condition + * here: file might have been deleted since we tried to create it. 
+ */ + fd = open(filename, O_RDONLY, 0600); + if (fd < 0) + { + if (errno == ENOENT) + continue; /* race condition; try again */ + ereport(FATAL, + (EINVAL, + errmsg("could not open lock file \"%s\": %m", + filename))); + } + if ((len = read(fd, buffer, sizeof(buffer) - 1)) < 0) + ereport(FATAL, + (EINVAL, + errmsg("could not read lock file \"%s\": %m", + filename))); + close(fd); + + buffer[len] = '\0'; + encoded_pid = atoi(buffer); + other_pid = (pid_t) encoded_pid; + + if (other_pid <= 0) + elog(FATAL, "bogus data in lock file \"%s\": \"%s\"", + filename, buffer); + + /* + * Check to see if the other process still exists + * + * If the PID in the lockfile is our own PID or our parent's PID, then + * the file must be stale (probably left over from a previous system + * boot cycle). We need this test because of the likelihood that a + * reboot will assign exactly the same PID as we had in the previous + * reboot. Also, if there is just one more process launch in this + * reboot than in the previous one, the lockfile might mention our + * parent's PID. We can reject that since we'd never be launched + * directly by a competing postmaster. We can't detect grandparent + * processes unfortunately, but if the init script is written + * carefully then all but the immediate parent shell will be + * root-owned processes and so the kill test will fail with EPERM. + * + * We can treat the EPERM-error case as okay because that error + * implies that the existing process has a different userid than we + * do, which means it cannot be a competing postmaster. A postmaster + * cannot successfully attach to a data directory owned by a userid + * other than its own. (This is now checked directly in + * checkDataDir(), but has been true for a long time because of the + * restriction that the data directory isn't group- or + * world-accessible.) 
Also, since we create the lockfiles mode 600, + * we'd have failed above if the lockfile belonged to another userid + * --- which means that whatever process kill() is reporting about + * isn't the one that made the lockfile. (NOTE: this last + * consideration is the only one that keeps us from blowing away a + * Unix socket file belonging to an instance of Postgres being run by + * someone else, at least on machines where /tmp hasn't got a + * stickybit.) + * + * Windows hasn't got getppid(), but doesn't need it since it's not + * using real kill() either... + * + * Normally kill() will fail with ESRCH if the given PID doesn't + * exist. + */ + if (other_pid != my_pid +#ifndef WIN32 + && other_pid != getppid() +#endif + ) + { + if (kill(other_pid, 0) == 0 || + (errno != ESRCH && errno != EPERM)) + { + /* lockfile belongs to a live process */ + ereport(FATAL, + (EINVAL, + errmsg("lock file \"%s\" already exists", + filename), + errhint("Is another GTM proxy (PID %d) running in data directory \"%s\"?", + (int) other_pid, refName))); + } + } + + /* + * Looks like nobody's home. Unlink the file and try again to create + * it. Need a loop because of possible race condition against other + * would-be creators. + */ + if (unlink(filename) < 0) + ereport(FATAL, + (EACCES, + errmsg("could not remove old lock file \"%s\": %m", + filename), + errhint("The file seems accidentally left over, but " + "it could not be removed. Please remove the file " + "by hand and try again."))); + } + + /* + * Successfully created the file, now fill it. + */ + snprintf(buffer, sizeof(buffer), "%d\n%s\n", + (int) my_pid, GTMProxyDataDir); + errno = 0; + if (write(fd, buffer, strlen(buffer)) != strlen(buffer)) + { + int save_errno = errno; + + close(fd); + unlink(filename); + /* if write didn't set errno, assume problem is no disk space */ + errno = save_errno ? 
save_errno : ENOSPC; + ereport(FATAL, + (EACCES, + errmsg("could not write lock file \"%s\": %m", filename))); + } + if (close(fd)) + { + int save_errno = errno; + + unlink(filename); + errno = save_errno; + ereport(FATAL, + (EACCES, + errmsg("could not write lock file \"%s\": %m", filename))); + } + +} + +/* + * Create the opts file + */ +static bool +CreateOptsFile(int argc, char *argv[]) +{ + FILE *fp; + int i; + +#define OPTS_FILE "gtm_proxy.opts" + + if ((fp = fopen(OPTS_FILE, "w")) == NULL) + { + elog(LOG, "could not create file \"%s\": %m", OPTS_FILE); + return false; + } + + for (i = 1; i < argc; i++) + fprintf(fp, " \"%s\"", argv[i]); + fputs("\n", fp); + + if (fclose(fp)) + { + elog(LOG, "could not write file \"%s\": %m", OPTS_FILE); + return false; + } + + return true; +} + +/* delete pid file */ +static void +DeleteLockFile(const char *filename) +{ + if (unlink(filename) < 0) + ereport(FATAL, + (EACCES, + errmsg("could not remove old lock file \"%s\": %m", + filename), + errhint("The file seems accidentally left over, but " + "it could not be removed. 
Please remove the file " + "by hand and try again."))); +} diff --git a/src/gtm/proxy/proxy_thread.c b/src/gtm/proxy/proxy_thread.c new file mode 100644 index 0000000000..844f2f70b4 --- /dev/null +++ b/src/gtm/proxy/proxy_thread.c @@ -0,0 +1,451 @@ +/*------------------------------------------------------------------------- + * + * proxy_thread.c + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * + * IDENTIFICATION + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ +#include <pthread.h> +#include "gtm/gtm_proxy.h" +#include "gtm/memutils.h" +#include "gtm/libpq.h" + +static void *GTMProxy_ThreadMainWrapper(void *argp); +static void GTMProxy_ThreadCleanup(void *argp); + +GTMProxy_Threads GTMProxyThreadsData; +GTMProxy_Threads *GTMProxyThreads = &GTMProxyThreadsData; + +#define GTM_PROXY_MIN_THREADS 32 /* Provision for minimum threads */ +#define GTM_PROXY_MAX_THREADS 1024 /* Max threads allowed in the GTMProxy */ +#define GTMProxyThreadsFull (GTMProxyThreads->gt_thread_count == GTMProxyThreads->gt_array_size) + +/* + * Add the given thrinfo structure to the global array, expanding it if + * necessary + */ +int +GTMProxy_ThreadAdd(GTMProxy_ThreadInfo *thrinfo) +{ + int ii; + + GTM_RWLockAcquire(&GTMProxyThreads->gt_lock, GTM_LOCKMODE_WRITE); + + if (GTMProxyThreadsFull) + { + GTMProxy_ThreadInfo **threads; + uint32 newsize; + + /* + * TODO Optimize lock management by not holding any locks during memory + * allocation + */ + if (GTMProxyThreads->gt_array_size == GTM_PROXY_MAX_THREADS) + elog(ERROR, "Too many threads active"); + + if (GTMProxyThreads->gt_array_size == 0) + newsize = GTM_PROXY_MIN_THREADS; + else + { + /* + * We ran out of the array size.
Just double the size, bound by the + * upper limit + */ + newsize = GTMProxyThreads->gt_array_size * 2; + } + + /* Can't have more than GTM_PROXY_MAX_THREADS */ + if (newsize > GTM_PROXY_MAX_THREADS) + newsize = GTM_PROXY_MAX_THREADS; + + if (GTMProxyThreads->gt_threads == NULL) + threads = (GTMProxy_ThreadInfo **)palloc0(sizeof (GTMProxy_ThreadInfo *) * newsize); + else + { + void *old_ptr = GTMProxyThreads->gt_threads; + threads = (GTMProxy_ThreadInfo **)palloc0(sizeof (GTMProxy_ThreadInfo *) * newsize); + memcpy(threads, old_ptr, + GTMProxyThreads->gt_array_size * sizeof (GTMProxy_ThreadInfo *)); + pfree(old_ptr); + } + + GTMProxyThreads->gt_threads = threads; + GTMProxyThreads->gt_array_size = newsize; + } + + /* + * Now that we have free entries in the array, find a free slot and add the + * thrinfo pointer to it. + * + * TODO Optimize this later by tracking few free slots and reusing them. + * The free slots can be updated when a thread exits and reused when a new + * thread is added to the pool. + */ + for (ii = 0; ii < GTMProxyThreads->gt_array_size; ii++) + { + if (GTMProxyThreads->gt_threads[ii] == NULL) + { + GTMProxyThreads->gt_threads[ii] = thrinfo; + GTMProxyThreads->gt_thread_count++; + break; + } + } + GTM_RWLockRelease(&GTMProxyThreads->gt_lock); + + /* + * Track the slot information in the thrinfo. This is useful to quickly + * find the slot given the thrinfo structure. + */ + thrinfo->thr_localid = ii; + return ii; +} + +int +GTMProxy_ThreadRemove(GTMProxy_ThreadInfo *thrinfo) +{ + /* + * XXX To be implemented + */ + return 0; +} + +/* + * Create a new thread and assign the given connection to it. + * + * This function is responsible for setting up the various memory contexts for + * the thread as well as registering this thread with the Thread Manager. + * + * Upon successful creation, the thread will start running the given + * "startroutine".
+ */ +GTMProxy_ThreadInfo * +GTMProxy_ThreadCreate(void *(* startroutine)(void *)) +{ + GTMProxy_ThreadInfo *thrinfo; + int err; + + /* + * We are still running in the context of the main thread. So the + * allocation below would last as long as the main thread exists or the + * memory is explicitly freed. + */ + thrinfo = (GTMProxy_ThreadInfo *)palloc0(sizeof (GTMProxy_ThreadInfo)); + + GTM_MutexLockInit(&thrinfo->thr_lock); + GTM_CVInit(&thrinfo->thr_cv); + + /* + * The thread status is set to GTM_PROXY_THREAD_STARTING and will be changed by + * the thread itself when it actually starts executing + */ + thrinfo->thr_status = GTM_PROXY_THREAD_STARTING; + + /* + * Install the ThreadInfo structure in the global array. We do this before + * starting the thread + */ + if (GTMProxy_ThreadAdd(thrinfo) == -1) + elog(ERROR, "Error starting a new thread"); + + /* + * Set up memory contexts before actually starting the threads + * + * The TopThreadContext is a child of TopMemoryContext and it will last as + * long as the main process or this thread lives + * + * Thread context is not shared between other threads + */ + thrinfo->thr_thread_context = AllocSetContextCreate(TopMemoryContext, + "TopMemoryContext", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE, + false); + + /* + * Since the thread is not yet started, TopMemoryContext still points to + * the context of the calling thread + */ + thrinfo->thr_parent_context = TopMemoryContext; + + /* + * Each thread gets its own ErrorContext and it is a child of ErrorContext of + * the main process + * + * This is a thread-specific context and is not shared between other + * threads + */ + thrinfo->thr_error_context = AllocSetContextCreate(ErrorContext, + "ErrorContext", + 8 * 1024, + 8 * 1024, + 8 * 1024, + false); + + thrinfo->thr_startroutine = startroutine; + + /* + * Now start the thread. The thread will start executing the given + * "startroutine".
The thrinfo structure is also passed to the thread. Any + * additional parameters should be passed via the thrinfo structure. + * + * Return the thrinfo structure to the caller + */ + if ((err = pthread_create(&thrinfo->thr_id, NULL, GTMProxy_ThreadMainWrapper, + thrinfo))) + ereport(ERROR, + (err, + errmsg("Failed to create a new thread: error %d", err))); + + return thrinfo; +} + +/* + * Exit the current thread + */ +void +GTMProxy_ThreadExit(void) +{ + /* XXX To be implemented */ +} + +int +GTMProxy_ThreadJoin(GTMProxy_ThreadInfo *thrinfo) +{ + int error; + void *data; + + error = pthread_join(thrinfo->thr_id, &data); + + return error; +} + +/* + * Get thread information for the given thread, identified by the + * thread_id + */ +GTMProxy_ThreadInfo * +GTMProxy_GetThreadInfo(GTM_ThreadID thrid) +{ + + return NULL; +} + +/* + * Cleanup routine for the thread + */ +static void +GTMProxy_ThreadCleanup(void *argp) +{ + GTMProxy_ThreadInfo *thrinfo = (GTMProxy_ThreadInfo *)argp; + + elog(LOG, "Cleaning up thread state"); + + /* + * TODO Close the open connection. + */ + StreamClose(thrinfo->thr_conn->con_port->sock); + + /* + * Switch to the memory context of the main process so that we can free up + * our memory contexts easily. + * + * XXX We don't setup cleanup handlers for the main process. So this + * routine would never be called for the main process/thread + */ + MemoryContextSwitchTo(thrinfo->thr_parent_context); + + MemoryContextDelete(thrinfo->thr_message_context); + thrinfo->thr_message_context = NULL; + + MemoryContextDelete(thrinfo->thr_error_context); + thrinfo->thr_error_context = NULL; + + MemoryContextDelete(thrinfo->thr_thread_context); + thrinfo->thr_thread_context = NULL; + + /* + * TODO Now cleanup the thrinfo structure itself and remove it from the global + * array. + */ + + + /* + * Reset the thread-specific information.
This should be done only after we + * are sure that memory contexts are not required + * + * Note: elog calls need memory contexts, so no elog calls beyond this + * point. + */ + SetMyThreadInfo(NULL); + + return; +} + +/* + * A wrapper around the start routine of the thread. This helps us do any + * initialization and set up cleanup handlers before the main routine is + * started + */ +void * +GTMProxy_ThreadMainWrapper(void *argp) +{ + GTMProxy_ThreadInfo *thrinfo = (GTMProxy_ThreadInfo *)argp; + + pthread_detach(thrinfo->thr_id); + + SetMyThreadInfo(thrinfo); + MemoryContextSwitchTo(TopMemoryContext); + + pthread_cleanup_push(GTMProxy_ThreadCleanup, thrinfo); + thrinfo->thr_startroutine(thrinfo); + pthread_cleanup_pop(1); + + return thrinfo; +} + +/* + * Add the given connection info structure to a thread which is selected in a + * round-robin manner. The caller is responsible for only accepting the + * connection. Other things, including authentication, are done by the worker + * thread when it finds a new entry in the connection list.
+ * + * Return the reference to the GTMProxy_ThreadInfo structure of the thread + * which will be serving this connection + */ +GTMProxy_ThreadInfo * +GTMProxy_ThreadAddConnection(GTMProxy_ConnectionInfo *conninfo) +{ + GTMProxy_ThreadInfo *thrinfo = NULL; + + /* + * Get the next thread in the queue + */ + GTM_RWLockAcquire(&GTMProxyThreads->gt_lock, GTM_LOCKMODE_WRITE); + + /* + * Always start with thread 1 because thread 0 is the main thread + */ + if (GTMProxyThreads->gt_next_worker == 0) + GTMProxyThreads->gt_next_worker = 1; + + thrinfo = GTMProxyThreads->gt_threads[GTMProxyThreads->gt_next_worker]; + + /* + * Set the next worker thread before releasing the lock + */ + GTMProxyThreads->gt_next_worker++; + if (GTMProxyThreads->gt_next_worker == GTMProxyThreads->gt_thread_count) + GTMProxyThreads->gt_next_worker = 1; + + GTM_RWLockRelease(&GTMProxyThreads->gt_lock); + + /* + * Lock the thread info structure to safely add the new connection to the + * thread structure. The thread will see the connection when it queries the + * socket descriptor in the next cycle + */ + GTM_MutexLockAcquire(&thrinfo->thr_lock); + + if (thrinfo->thr_conn_count >= GTM_PROXY_MAX_CONNECTIONS) + { + GTM_MutexLockRelease(&thrinfo->thr_lock); + elog(ERROR, "Too many connections"); + } + + /* + * Save the array slotid in the conninfo structure. We send this to the GTM + * server as an identifier which the GTM server sends us back in the + * response. We use that information to route the response back to the + * appropriate connection + */ + conninfo->con_id = thrinfo->thr_conn_count; + thrinfo->thr_all_conns[thrinfo->thr_conn_count] = conninfo; + thrinfo->thr_conn_count++; + + /* + * Now increment the seqno since a new connection is added to the array. + * Before we do the next poll(), the fd array will be forced to be + * reconstructed.
+ */ + thrinfo->thr_seqno++; + + /* + * Signal the worker thread if it is waiting for connections to be added to + * its queue + * + * XXX Maybe we can first check the condition that this is the first + * connection in the array and also use signal instead of a bcast since + * only one thread is waiting on the cv. + */ + GTM_CVBcast(&thrinfo->thr_cv); + GTM_MutexLockRelease(&thrinfo->thr_lock); + + return thrinfo; +} + +/* + * Remove the connection from the array and compact the array + */ +int +GTMProxy_ThreadRemoveConnection(GTMProxy_ThreadInfo *thrinfo, GTMProxy_ConnectionInfo *conninfo) +{ + int ii; + + /* + * Lock the thread info structure to safely remove the connection from the + * thread structure. + */ + GTM_MutexLockAcquire(&thrinfo->thr_lock); + + for (ii = 0; ii < thrinfo->thr_conn_count; ii++) + { + if (thrinfo->thr_all_conns[ii] == conninfo) + break; + } + + if (ii >= thrinfo->thr_conn_count) + { + GTM_MutexLockRelease(&thrinfo->thr_lock); + elog(ERROR, "No such connection"); + } + + /* + * Is this the last entry in the array? If not, then copy the last + * entry into this slot and mark the last slot as empty + */ + if ((ii + 1) < thrinfo->thr_conn_count) + { + /* Copy the last entry in this slot */ + thrinfo->thr_all_conns[ii] = thrinfo->thr_all_conns[thrinfo->thr_conn_count - 1]; + + /* Mark the last slot free */ + thrinfo->thr_all_conns[thrinfo->thr_conn_count - 1] = NULL; + + /* Adjust the con_id to reflect the current slot in the array */ + thrinfo->thr_all_conns[ii]->con_id = ii; + } + else + { + /* This is the last entry in the array. Just mark it free */ + thrinfo->thr_all_conns[ii] = NULL; + } + + thrinfo->thr_conn_count--; + + /* + * Increment the seqno to ensure that the next time before we poll, the fd + * array is reconstructed.
+ */ + thrinfo->thr_seqno++; + GTM_MutexLockRelease(&thrinfo->thr_lock); + + return 0; +} diff --git a/src/include/access/gtm.h b/src/include/access/gtm.h new file mode 100644 index 0000000000..66ca3f12c6 --- /dev/null +++ b/src/include/access/gtm.h @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * gtm.h + * + * Module interfacing with GTM definitions + * + * + *------------------------------------------------------------------------- + */ +#ifndef ACCESS_GTM_H +#define ACCESS_GTM_H + +#include "gtm/gtm_c.h" + +/* Configuration variables */ +extern char *GtmHost; +extern int GtmPort; +extern int GtmCoordinatorId; + +extern bool IsGTMConnected(void); +extern void InitGTM(void); +extern void CloseGTM(void); +extern GlobalTransactionId BeginTranGTM(void); +extern GlobalTransactionId BeginTranAutovacuumGTM(void); +extern int CommitTranGTM(GlobalTransactionId gxid); +extern int RollbackTranGTM(GlobalTransactionId gxid); +extern GTM_Snapshot GetSnapshotGTM(GlobalTransactionId gxid, bool canbe_grouped); +extern GTM_Sequence GetNextValGTM(char *seqname); +extern int CreateSequenceGTM(char *seqname, GTM_Sequence increment, + GTM_Sequence minval, GTM_Sequence maxval, GTM_Sequence startval, + bool cycle); +extern int DropSequenceGTM(char *seqname); +#endif /* ACCESS_GTM_H */ diff --git a/src/include/access/transam.h b/src/include/access/transam.h index b23a663c53..a7a8230595 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -6,6 +6,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * $PostgreSQL: pgsql/src/include/access/transam.h,v 1.68 2009/05/08 03:21:35 momjian Exp $ * @@ -152,6 +153,11 @@ extern TransactionId TransactionIdLatest(TransactionId mainxid, extern XLogRecPtr TransactionIdGetCommitLSN(TransactionId xid); /* 
in transam/varsup.c */ +#ifdef PGXC /* PGXC_DATANODE */ +extern void SetNextTransactionId(TransactionId xid); +extern void SetForceXidFromGTM(bool value); +extern bool GetForceXidFromGTM(void); +#endif /* PGXC */ extern TransactionId GetNewTransactionId(bool isSubXact); extern TransactionId ReadNewTransactionId(void); extern void SetTransactionIdLimit(TransactionId oldest_datfrozenxid, diff --git a/src/include/access/xact.h b/src/include/access/xact.h index 880b41b707..7cd8e165ec 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -6,6 +6,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * $PostgreSQL: pgsql/src/include/access/xact.h,v 1.98 2009/06/11 14:49:09 momjian Exp $ * @@ -18,7 +19,9 @@ #include "nodes/pg_list.h" #include "storage/relfilenode.h" #include "utils/timestamp.h" - +#ifdef PGXC /* PGXC_COORD */ +#include "gtm/gtm_c.h" +#endif /* * Xact isolation levels @@ -145,6 +148,9 @@ extern TransactionId GetTopTransactionId(void); extern TransactionId GetTopTransactionIdIfAny(void); extern TransactionId GetCurrentTransactionId(void); extern TransactionId GetCurrentTransactionIdIfAny(void); +#ifdef PGXC /* PGXC_COORD */ +extern GlobalTransactionId GetCurrentGlobalTransactionId(void); +#endif extern SubTransactionId GetCurrentSubTransactionId(void); extern CommandId GetCurrentCommandId(bool used); extern TimestampTz GetCurrentTransactionStartTimestamp(void); diff --git a/src/include/bootstrap/bootstrap.h b/src/include/bootstrap/bootstrap.h index ab549eabb1..e8f96604ad 100644 --- a/src/include/bootstrap/bootstrap.h +++ b/src/include/bootstrap/bootstrap.h @@ -6,6 +6,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and 
Telephone Corporation * * $PostgreSQL: pgsql/src/include/bootstrap/bootstrap.h,v 1.51 2009/01/01 17:23:56 momjian Exp $ * @@ -71,6 +72,9 @@ typedef enum StartupProcess, BgWriterProcess, WalWriterProcess +#ifdef PGXC + ,PoolerProcess +#endif } AuxProcType; #endif /* BOOTSTRAP_H */ diff --git a/src/include/catalog/dependency.h b/src/include/catalog/dependency.h index fe04aab964..b2af292585 100644 --- a/src/include/catalog/dependency.h +++ b/src/include/catalog/dependency.h @@ -6,6 +6,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * $PostgreSQL: pgsql/src/include/catalog/dependency.h,v 1.40 2009/06/11 14:49:09 momjian Exp $ * @@ -146,6 +147,9 @@ typedef enum ObjectClass OCLASS_FDW, /* pg_foreign_data_wrapper */ OCLASS_FOREIGN_SERVER, /* pg_foreign_server */ OCLASS_USER_MAPPING, /* pg_user_mapping */ +#ifdef PGXC + OCLASS_PGXC_CLASS, /* pgxc_class */ +#endif MAX_OCLASS /* MUST BE LAST */ } ObjectClass; diff --git a/src/include/catalog/heap.h b/src/include/catalog/heap.h index 2d6eb3c34a..baa9ecaf49 100644 --- a/src/include/catalog/heap.h +++ b/src/include/catalog/heap.h @@ -6,6 +6,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * $PostgreSQL: pgsql/src/include/catalog/heap.h,v 1.91 2009/06/11 14:49:09 momjian Exp $ * @@ -107,4 +108,11 @@ extern void CheckAttributeNamesTypes(TupleDesc tupdesc, char relkind); extern void CheckAttributeType(const char *attname, Oid atttypid); +#ifdef PGXC +extern void AddRelationDistribution (Oid relid, + DistributeBy *distributeby, + List *parentOids, + TupleDesc descriptor); +#endif + #endif /* HEAP_H */ diff --git a/src/include/catalog/indexing.h b/src/include/catalog/indexing.h 
index ce117a8eec..5557021e30 100644 --- a/src/include/catalog/indexing.h +++ b/src/include/catalog/indexing.h @@ -7,6 +7,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * $PostgreSQL: pgsql/src/include/catalog/indexing.h,v 1.108 2009/06/11 14:49:09 momjian Exp $ * @@ -267,6 +268,11 @@ DECLARE_UNIQUE_INDEX(pg_user_mapping_oid_index, 174, on pg_user_mapping using bt DECLARE_UNIQUE_INDEX(pg_user_mapping_user_server_index, 175, on pg_user_mapping using btree(umuser oid_ops, umserver oid_ops)); #define UserMappingUserServerIndexId 175 +#ifdef PGXC +DECLARE_UNIQUE_INDEX(pgxc_class_pcrelid_index, 9002, on pgxc_class using btree(pcrelid oid_ops)); +#define PgxcClassPgxcRelIdIndexId 9002 +#endif + /* last step of initialization script: build the indexes declared above */ BUILD_INDICES diff --git a/src/include/catalog/pgxc_class.h b/src/include/catalog/pgxc_class.h new file mode 100644 index 0000000000..2104e53e42 --- /dev/null +++ b/src/include/catalog/pgxc_class.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2004-2007 EnterpriseDB Corporation. All Rights Reserved. 
+ */ +#ifndef PGXC_CLASS_H +#define PGXC_CLASS_H + +#include "nodes/parsenodes.h" + +#define PgxcClassRelationId 9001 + +CATALOG(pgxc_class,9001) BKI_WITHOUT_OIDS +{ + Oid pcrelid; + char pclocatortype; + int2 pcattnum; + int2 pchashalgorithm; + int2 pchashbuckets; +} FormData_pgxc_class; + +typedef FormData_pgxc_class *Form_pgxc_class; + +#define Natts_pgxc_class 5 + +#define Anum_pgxc_class_pcrelid 1 +#define Anum_pgxc_class_pclocatortype 2 +#define Anum_pgxc_class_pcattnum 3 +#define Anum_pgxc_class_pchashalgorithm 4 +#define Anum_pgxc_class_pchashbuckets 5 + +extern void PgxcClassCreate(Oid pcrelid, + char pclocatortype, + int pcattnum, + int pchashalgorithm, + int pchashbuckets); + +extern void RemovePgxcClass(Oid pcrelid); + +#endif /* PGXC_CLASS_H */ + diff --git a/src/include/gtm/assert.h b/src/include/gtm/assert.h new file mode 100644 index 0000000000..5c71363832 --- /dev/null +++ b/src/include/gtm/assert.h @@ -0,0 +1,72 @@ +/*------------------------------------------------------------------------- + * + * assert.h + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ +#ifndef GTM_ASSERT_H +#define GTM_ASSERT_H + +extern bool assert_enabled; + +/* + * USE_ASSERT_CHECKING, if defined, turns on all the assertions. + * - plai 9/5/90 + * + * It should _NOT_ be defined in releases or in benchmark copies + */ + +/* + * Trap + * Generates an exception if the given condition is true. 
+ */ +#define Trap(condition, errorType) \ + do { \ + if ((assert_enabled) && (condition)) \ + ExceptionalCondition(CppAsString(condition), (errorType), \ + __FILE__, __LINE__); \ + } while (0) + +/* + * TrapMacro is the same as Trap but it's intended for use in macros: + * + * #define foo(x) (AssertMacro(x != 0) && bar(x)) + * + * Isn't CPP fun? + */ +#define TrapMacro(condition, errorType) \ + ((bool) ((! assert_enabled) || ! (condition) || \ + (ExceptionalCondition(CppAsString(condition), (errorType), \ + __FILE__, __LINE__)))) + +#ifndef USE_ASSERT_CHECKING +#define Assert(condition) +#define AssertMacro(condition) ((void)true) +#define AssertArg(condition) +#define AssertState(condition) +#else +#define Assert(condition) \ + Trap(!(condition), "FailedAssertion") + +#define AssertMacro(condition) \ + ((void) TrapMacro(!(condition), "FailedAssertion")) + +#define AssertArg(condition) \ + Trap(!(condition), "BadArgument") + +#define AssertState(condition) \ + Trap(!(condition), "BadState") +#endif /* USE_ASSERT_CHECKING */ + +extern int ExceptionalCondition(const char *conditionName, + const char *errorType, + const char *fileName, int lineNumber); + +#endif diff --git a/src/include/gtm/elog.h b/src/include/gtm/elog.h new file mode 100644 index 0000000000..49c463fa3e --- /dev/null +++ b/src/include/gtm/elog.h @@ -0,0 +1,253 @@ +/*------------------------------------------------------------------------- + * + * elog.h + * POSTGRES error reporting/logging definitions. 
+ * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL: pgsql/src/include/utils/elog.h,v 1.98 2009/01/01 17:24:02 momjian Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef ELOG_H +#define ELOG_H + +/* Error level codes */ +#define DEBUG5 10 /* Debugging messages, in categories of + * decreasing detail. */ +#define DEBUG4 11 +#define DEBUG3 12 +#define DEBUG2 13 +#define DEBUG1 14 /* used by GUC debug_* variables */ +#define LOG 15 /* Server operational messages; sent only to + * server log by default. */ +#define COMMERROR 16 /* Client communication problems; same as LOG + * for server reporting, but never sent to + * client. */ +#define INFO 17 /* Messages specifically requested by user + * (eg VACUUM VERBOSE output); always sent to + * client regardless of client_min_messages, + * but by default not sent to server log. */ +#define NOTICE 18 /* Helpful messages to users about query + * operation; sent to client and server log + * by default. */ +#define WARNING 19 /* Warnings. NOTICE is for expected messages + * like implicit sequence creation by SERIAL. + * WARNING is for unexpected messages. */ +#define ERROR 20 /* user error - abort transaction; return to + * known state */ +#define ERROR2 21 /* user error - only send error message to the + * client */ +#define FATAL 22 /* fatal error - abort process */ +#define PANIC 23 /* take down the other backends with me */ + + /* #define DEBUG DEBUG1 */ /* Backward compatibility with pre-7.3 */ + + +/* Which __func__ symbol do we have, if any? 
*/ +#ifdef HAVE_FUNCNAME__FUNC +#define PG_FUNCNAME_MACRO __func__ +#else +#ifdef HAVE_FUNCNAME__FUNCTION +#define PG_FUNCNAME_MACRO __FUNCTION__ +#else +#define PG_FUNCNAME_MACRO NULL +#endif +#endif + +/* + * ErrorData holds the data accumulated during any one ereport() cycle. + * Any non-NULL pointers must point to palloc'd data. + * (The const pointers are an exception; we assume they point at non-freeable + * constant strings.) + */ +typedef struct ErrorData +{ + int elevel; /* error level */ + bool output_to_server; /* will report to server log? */ + bool output_to_client; /* will report to client? */ + bool show_funcname; /* true to force funcname inclusion */ + const char *filename; /* __FILE__ of ereport() call */ + int lineno; /* __LINE__ of ereport() call */ + const char *funcname; /* __func__ of ereport() call */ + const char *domain; /* message domain */ + char *message; /* primary error message */ + char *detail; /* detail error message */ + char *detail_log; /* detail error message for server log only */ + char *hint; /* hint message */ + char *context; /* context message */ + int saved_errno; /* errno at entry */ +} ErrorData; + + +/*---------- + * New-style error reporting API: to be used in this way: + * ereport(ERROR, + * (errcode(ERRCODE_UNDEFINED_CURSOR), + * errmsg("portal \"%s\" not found", stmt->portalname), + * ... other errxxx() fields as needed ...)); + * + * The error level is required, and so is a primary error message (errmsg + * or errmsg_internal). All else is optional. errcode() defaults to + * ERRCODE_INTERNAL_ERROR if elevel is ERROR or more, ERRCODE_WARNING + * if elevel is WARNING, or ERRCODE_SUCCESSFUL_COMPLETION if elevel is + * NOTICE or below. + * + * ereport_domain() allows a message domain to be specified, for modules that + * wish to use a different message catalog from the backend's. 
To avoid having + * one copy of the default text domain per .o file, we define it as NULL here + * and have errstart insert the default text domain. Modules can either use + * ereport_domain() directly, or preferably they can override the TEXTDOMAIN + * macro. + *---------- + */ +#define TEXTDOMAIN "GTM" + +#define ereport_domain(elevel, domain, rest) \ + (errstart(elevel, __FILE__, __LINE__, PG_FUNCNAME_MACRO, domain) ? \ + (errfinish rest) : (void) 0) + +#define ereport(level, rest) \ + ereport_domain(level, TEXTDOMAIN, rest) + + +#define PG_RE_THROW() pg_re_throw() + +extern bool errstart(int elevel, const char *filename, int lineno, + const char *funcname, const char *domain); +extern void errfinish(int dummy,...); + +extern int +errmsg(const char *fmt,...) +/* This extension allows gcc to check the format string for consistency with + the supplied arguments. */ +__attribute__((format(printf, 1, 2))); + +extern int +errmsg_internal(const char *fmt,...) +/* This extension allows gcc to check the format string for consistency with + the supplied arguments. */ +__attribute__((format(printf, 1, 2))); + +extern int +errdetail(const char *fmt,...) +/* This extension allows gcc to check the format string for consistency with + the supplied arguments. */ +__attribute__((format(printf, 1, 2))); + +extern int +errdetail_log(const char *fmt,...) +/* This extension allows gcc to check the format string for consistency with + the supplied arguments. */ +__attribute__((format(printf, 1, 2))); + +extern int +errhint(const char *fmt,...) +/* This extension allows gcc to check the format string for consistency with + the supplied arguments. 
*/ +__attribute__((format(printf, 1, 2))); + +/*---------- + * Old-style error reporting API: to be used in this way: + * elog(ERROR, "portal \"%s\" not found", stmt->portalname); + *---------- + */ +#define elog elog_start(__FILE__, __LINE__, PG_FUNCNAME_MACRO), elog_finish + +extern void elog_start(const char *filename, int lineno, const char *funcname); +extern void +elog_finish(int elevel, const char *fmt,...) +/* This extension allows gcc to check the format string for consistency with + the supplied arguments. */ +__attribute__((format(printf, 2, 3))); + +/*---------- + * API for catching ereport(ERROR) exits. Use these macros like so: + * + * PG_TRY(); + * { + * ... code that might throw ereport(ERROR) ... + * } + * PG_CATCH(); + * { + * ... error recovery code ... + * } + * PG_END_TRY(); + * + * (The braces are not actually necessary, but are recommended so that + * pg_indent will indent the construct nicely.) The error recovery code + * can optionally do PG_RE_THROW() to propagate the same error outwards. + * + * Note: while the system will correctly propagate any new ereport(ERROR) + * occurring in the recovery section, there is a small limit on the number + * of levels this will work for. It's best to keep the error recovery + * section simple enough that it can't generate any new errors, at least + * not before popping the error stack. + * + * Note: an ereport(FATAL) will not be caught by this construct; control will + * exit straight through proc_exit(). Therefore, do NOT put any cleanup + * of non-process-local resources into the error recovery section, at least + * not without taking thought for what will happen during ereport(FATAL). + * The PG_ENSURE_ERROR_CLEANUP macros provided by storage/ipc.h may be + * helpful in such cases. 
+ *---------- + */ +#define PG_TRY() \ + do { \ + sigjmp_buf *save_exception_stack = PG_exception_stack; \ + sigjmp_buf local_sigjmp_buf; \ + if (sigsetjmp(local_sigjmp_buf, 0) == 0) \ + { \ + PG_exception_stack = &local_sigjmp_buf + +#define PG_CATCH() \ + } \ + else \ + { \ + PG_exception_stack = save_exception_stack; \ + +#define PG_END_TRY() \ + } \ + PG_exception_stack = save_exception_stack; \ + } while (0) + +int errfunction(const char *funcname); + +extern void EmitErrorReport(void *port); + +/* GUC-configurable parameters */ + +typedef enum +{ + PGERROR_TERSE, /* single-line error messages */ + PGERROR_DEFAULT, /* recommended style */ + PGERROR_VERBOSE /* all the facts, ma'am */ +} PGErrorVerbosity; + +/* Log destination bitmap */ +#define LOG_DESTINATION_STDERR 1 +#define LOG_DESTINATION_SYSLOG 2 +#define LOG_DESTINATION_EVENTLOG 4 +#define LOG_DESTINATION_CSVLOG 8 + +/* Other exported functions */ +extern void pg_re_throw(void); +extern void DebugFileOpen(void); +extern void FlushErrorState(void); + + +/* + * Write errors to stderr (or by equal means when stderr is + * not available). Used before ereport/elog can be used + * safely (memory context, GUC load etc) + */ +extern void +write_stderr(const char *fmt,...) +/* This extension allows gcc to check the format string for consistency with + the supplied arguments. 
*/ +__attribute__((format(printf, 1, 2))); + +#endif /* ELOG_H */ diff --git a/src/include/gtm/gtm.h b/src/include/gtm/gtm.h new file mode 100644 index 0000000000..37e23a7ffa --- /dev/null +++ b/src/include/gtm/gtm.h @@ -0,0 +1,140 @@ +/*------------------------------------------------------------------------- + * + * gtm.h + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ +#ifndef _GTM_H +#define _GTM_H + +#include <setjmp.h> + +#include "gtm/gtm_c.h" +#include "gtm/palloc.h" +#include "gtm/gtm_lock.h" +#include "gtm/gtm_conn.h" +#include "gtm/elog.h" +#include "gtm/gtm_list.h" + +extern char *GTMLogFile; + +typedef enum GTM_ThreadStatus +{ + GTM_THREAD_STARTING, + GTM_THREAD_RUNNING, + GTM_THREAD_EXITING, + /* Must be the last */ + GTM_THREAD_INVALID +} GTM_ThreadStatus; + +struct GTM_ConnectionInfo; + +#define ERRORDATA_STACK_SIZE 5 + +typedef struct GTM_ThreadInfo +{ + /* + * Thread specific information such as connection(s) served by it + */ + GTM_ThreadID thr_id; + uint32 thr_localid; + void * (* thr_startroutine)(void *); + + MemoryContext thr_thread_context; + MemoryContext thr_message_context; + MemoryContext thr_current_context; + MemoryContext thr_error_context; + MemoryContext thr_parent_context; + + sigjmp_buf *thr_sigjmp_buf; + + ErrorData thr_error_data[ERRORDATA_STACK_SIZE]; + int thr_error_stack_depth; + int thr_error_recursion_depth; + int thr_criticalsec_count; + + GTM_ThreadStatus thr_status; + GTM_ConnectionInfo *thr_conn; + + GTM_RWLock thr_lock; + List *thr_cached_txninfo; + +} GTM_ThreadInfo; + +typedef struct GTM_Threads +{ + uint32 gt_thread_count; + uint32 gt_array_size; + GTM_ThreadInfo **gt_threads; + GTM_RWLock gt_lock; +} GTM_Threads; + +extern 
GTM_Threads *GTMThreads; + +int GTM_ThreadAdd(GTM_ThreadInfo *thrinfo); +int GTM_ThreadRemove(GTM_ThreadInfo *thrinfo); +int GTM_ThreadJoin(GTM_ThreadInfo *thrinfo); +void GTM_ThreadExit(void); +void ConnFree(Port *port); + +GTM_ThreadInfo *GTM_ThreadCreate(GTM_ConnectionInfo *conninfo, + void *(* startroutine)(void *)); +GTM_ThreadInfo * GTM_GetThreadInfo(GTM_ThreadID thrid); + +/* + * pthread keys to get thread specific information + */ +extern pthread_key_t threadinfo_key; +extern MemoryContext TopMostMemoryContext; + +#define SetMyThreadInfo(thrinfo) pthread_setspecific(threadinfo_key, (thrinfo)) +#define GetMyThreadInfo ((GTM_ThreadInfo *)pthread_getspecific(threadinfo_key)) + +#define TopMemoryContext (GetMyThreadInfo->thr_thread_context) +#define ThreadTopContext (GetMyThreadInfo->thr_thread_context) +#define MessageContext (GetMyThreadInfo->thr_message_context) +#define CurrentMemoryContext (GetMyThreadInfo->thr_current_context) +#define ErrorContext (GetMyThreadInfo->thr_error_context) +#define errordata (GetMyThreadInfo->thr_error_data) +#define recursion_depth (GetMyThreadInfo->thr_error_recursion_depth) +#define errordata_stack_depth (GetMyThreadInfo->thr_error_stack_depth) +#define CritSectionCount (GetMyThreadInfo->thr_criticalsec_count) + +#define PG_exception_stack (GetMyThreadInfo->thr_sigjmp_buf) +#define MyConnection (GetMyThreadInfo->thr_conn) +#define MyPort ((GetMyThreadInfo->thr_conn != NULL) ? 
\ + GetMyThreadInfo->thr_conn->con_port : \ + NULL) +#define MyThreadID (GetMyThreadInfo->thr_id) + +#define GTM_CachedTransInfo (GetMyThreadInfo->thr_cached_txninfo) +#define GTM_HaveFreeCachedTransInfo() (list_length(GTM_CachedTransInfo)) + +#define GTM_MAX_CACHED_TRANSINFO 0 +#define GTM_HaveEnoughCachedTransInfo() (list_length(GTM_CachedTransInfo) >= GTM_MAX_CACHED_TRANSINFO) + +#define START_CRIT_SECTION() (CritSectionCount++) + +#define END_CRIT_SECTION() \ + do { \ + Assert(CritSectionCount > 0); \ + CritSectionCount--; \ + } while(0) + + +#if 0 + +/* Coordinator registration */ +int GTM_RegisterCoordinator(GTM_CoordInfo *cinfo); +int GTM_UnregisterCoordinator(GTM_CoordinatorId cid); + +#endif + +#endif diff --git a/src/include/gtm/gtm_c.h b/src/include/gtm/gtm_c.h new file mode 100644 index 0000000000..1a04064b6d --- /dev/null +++ b/src/include/gtm/gtm_c.h @@ -0,0 +1,101 @@ +/*------------------------------------------------------------------------- + * + * c.h + * Fundamental C definitions. This is included by every .c file in + * PostgreSQL (via either postgres.h or postgres_fe.h, as appropriate). + * + * Note that the definitions here are not intended to be exposed to clients + * of the frontend interface libraries --- so we don't worry much about + * polluting the namespace with lots of stuff... 
+ * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL: pgsql/src/include/c.h,v 1.234 2009/01/01 17:23:55 momjian Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef GTM_C_H +#define GTM_C_H + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stddef.h> +#include <stdarg.h> +#ifdef HAVE_STRINGS_H +#include <strings.h> +#endif +#include <sys/types.h> + +#include <errno.h> +#include <pthread.h> +#include "c.h" + +typedef uint32 GlobalTransactionId; /* 32-bit global transaction ids */ +typedef uint32 PGXC_NodeId; +typedef uint32 GTM_CoordinatorId; +typedef int16 GTMProxy_ConnID; + +#define InvalidGTMProxyConnID -1 + +typedef pthread_t GTM_ThreadID; + +/* + * A unique handle to identify transaction at the GTM. It could just be + * an index in an array or a pointer to the structure + * + * Note: If we get rid of BEGIN transaction at the GTM, we can use GXID + * as a handle because we would never have a transaction state at the + * GTM without assigned GXID. 
+ */ +typedef int32 GTM_TransactionHandle; + +#define InvalidTransactionHandle -1 + +typedef int64 GTM_Sequence; /* a 64-bit sequence */ +typedef struct GTM_SequenceKeyData +{ + uint32 gsk_keylen; + char *gsk_key; +} GTM_SequenceKeyData; /* Counter key, set by the client */ + +typedef GTM_SequenceKeyData *GTM_SequenceKey; +#define GTM_MAX_SEQKEY_LENGTH 1024 + +#define InvalidSequenceValue 0x7fffffffffffffffLL +#define SEQVAL_IS_VALID(v) ((v) != InvalidSequenceValue) + +#define GTM_MAX_GLOBAL_TRANSACTIONS 4096 + +typedef enum GTM_IsolationLevel +{ + GTM_ISOLATION_SERIALIZABLE, /* serializable txn */ + GTM_ISOLATION_RC /* read-committed txn */ +} GTM_IsolationLevel; + +typedef struct GTM_SnapshotData +{ + GlobalTransactionId sn_xmin; + GlobalTransactionId sn_xmax; + GlobalTransactionId sn_recent_global_xmin; + uint32 sn_xcnt; + GlobalTransactionId *sn_xip; +} GTM_SnapshotData; + +typedef GTM_SnapshotData *GTM_Snapshot; + +typedef struct GTM_StartupPacket { + GTM_CoordinatorId sp_cid; + bool sp_isproxy; +} GTM_StartupPacket; + +#define InvalidGlobalTransactionId ((GlobalTransactionId) 0) + +#define GlobalTransactionIdIsValid(gxid) ((GlobalTransactionId) (gxid)) != InvalidGlobalTransactionId + +#define _(x) gettext(x) + +#endif /* GTM_C_H */ diff --git a/src/include/gtm/gtm_client.h b/src/include/gtm/gtm_client.h new file mode 100644 index 0000000000..29eeaf95f9 --- /dev/null +++ b/src/include/gtm/gtm_client.h @@ -0,0 +1,129 @@ +/*------------------------------------------------------------------------- + * + * gtm_client.h + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ +#ifndef GTM_CLIENT_H +#define GTM_CLIENT_H + +#include "gtm/gtm_c.h" +#include "gtm/gtm_msg.h" +#include 
"gtm/libpq-fe.h" + +typedef union GTM_ResultData +{ + GTM_TransactionHandle grd_txnhandle; /* TXN_BEGIN */ + GlobalTransactionId grd_gxid; /* TXN_BEGIN_GETGXID + * TXN_PREPARE + * TXN_COMMIT + * TXN_ROLLBACK + */ + + struct + { + GTM_TransactionHandle txnhandle; + GlobalTransactionId gxid; + } grd_txn; /* TXN_GET_GXID + * SNAPSHOT_GET + * SNAPSHOT_GXID_GET */ + + GTM_SequenceKeyData grd_seqkey; /* SEQUENCE_INIT + * SEQUENCE_RESET + * SEQUENCE_CLOSE */ + struct + { + GTM_SequenceKeyData seqkey; + GTM_Sequence seqval; + } grd_seq; /* SEQUENCE_GET_CURRENT + SEQUENCE_GET_NEXT */ + + struct + { + int txn_count; /* TXN_BEGIN_GETGXID_MULTI */ + GlobalTransactionId start_gxid; + } grd_txn_get_multi; + + struct + { + int txn_count; /* TXN_COMMIT_MULTI */ + int status[GTM_MAX_GLOBAL_TRANSACTIONS]; + } grd_txn_rc_multi; + + struct + { + int txn_count; /* GET_SNAPSHOT_MULTI */ + int status[GTM_MAX_GLOBAL_TRANSACTIONS]; + } grd_txn_snap_multi; + + /* + * TODO + * TXN_GET_STATUS + * TXN_GET_ALL_PREPARED + */ +} GTM_ResultData; + +typedef struct GTM_Result +{ + GTM_ResultType gr_type; + int gr_msglen; + int gr_status; + GTM_ProxyMsgHeader gr_proxyhdr; + GTM_ResultData gr_resdata; + /* + * We keep these two items outside the union to avoid repeated malloc/free + * of the xip array. 
If these items are pushed inside the union, they may + * get overwritten by other members in the union + */ + int gr_xip_size; + GTM_SnapshotData gr_snapshot; + + /* + * Similarly, keep the buffer for proxying data outside the union + */ + char *gr_proxy_data; + int gr_proxy_datalen; +} GTM_Result; + +/* + * Connection Management API + */ +GTM_Conn *connect_gtm(const char *connect_string); +void disconnect_gtm(GTM_Conn *conn); + +/* + * Transaction Management API + */ +GlobalTransactionId begin_transaction(GTM_Conn *conn, GTM_IsolationLevel isolevel); +GlobalTransactionId begin_transaction_autovacuum(GTM_Conn *conn, GTM_IsolationLevel isolevel); +int commit_transaction(GTM_Conn *conn, GlobalTransactionId gxid); +int abort_transaction(GTM_Conn *conn, GlobalTransactionId gxid); +int prepare_transaction(GTM_Conn *conn, GlobalTransactionId gxid, + int nodecnt, PGXC_NodeId nodes[]); + +/* + * Snapshot Management API + */ +GTM_SnapshotData *get_snapshot(GTM_Conn *conn, GlobalTransactionId gxid, + bool canbe_grouped); + +/* + * Sequence Management API + */ +int open_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence increment, + GTM_Sequence minval, GTM_Sequence maxval, + GTM_Sequence startval, bool cycle); +int close_sequence(GTM_Conn *conn, GTM_SequenceKey key); +GTM_Sequence get_current(GTM_Conn *conn, GTM_SequenceKey key); +GTM_Sequence get_next(GTM_Conn *conn, GTM_SequenceKey key); +int reset_sequence(GTM_Conn *conn, GTM_SequenceKey key); + + +#endif diff --git a/src/include/gtm/gtm_conn.h b/src/include/gtm/gtm_conn.h new file mode 100644 index 0000000000..911a345c4f --- /dev/null +++ b/src/include/gtm/gtm_conn.h @@ -0,0 +1,38 @@ +/*------------------------------------------------------------------------- + * + * gtm_conn.h + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * 
$PostgreSQL$ + * + *------------------------------------------------------------------------- + */ +#ifndef GTM_CONN_H +#define GTM_CONN_H + +#include "gtm/libpq-be.h" + +struct GTM_ThreadInfo; + +typedef struct GTM_ConnectionInfo +{ + /* Port contains all the vital information about this connection */ + Port *con_port; + struct GTM_ThreadInfo *con_thrinfo; + bool con_authenticated; +} GTM_ConnectionInfo; + +typedef struct GTM_Connections +{ + uint32 gc_conn_count; + uint32 gc_array_size; + GTM_ConnectionInfo *gc_connections; + GTM_RWLock gc_lock; +} GTM_Connections; + + +#endif diff --git a/src/include/gtm/gtm_ext.h b/src/include/gtm/gtm_ext.h new file mode 100644 index 0000000000..b492941779 --- /dev/null +++ b/src/include/gtm/gtm_ext.h @@ -0,0 +1,31 @@ +/*------------------------------------------------------------------------- + * + * gtm_ext.h + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ +#ifndef GTM_EXT_H +#define GTM_EXT_H + +/* + * Identifiers of error message fields. Kept here to keep common + * between frontend and backend, and also to export them to libpq + * applications. + */ +#define PG_DIAG_SEVERITY 'S' +#define PG_DIAG_MESSAGE_PRIMARY 'M' +#define PG_DIAG_MESSAGE_DETAIL 'D' +#define PG_DIAG_MESSAGE_HINT 'H' +#define PG_DIAG_SOURCE_FILE 'F' +#define PG_DIAG_SOURCE_LINE 'L' +#define PG_DIAG_SOURCE_FUNCTION 'R' + + +#endif diff --git a/src/include/gtm/gtm_ip.h b/src/include/gtm/gtm_ip.h new file mode 100644 index 0000000000..30da3081d3 --- /dev/null +++ b/src/include/gtm/gtm_ip.h @@ -0,0 +1,50 @@ +/*------------------------------------------------------------------------- + * + * ip.h + * Definitions for IPv6-aware network access. 
+ * + * These definitions are used by both frontend and backend code. Be careful + * what you include here! + * + * Copyright (c) 2003-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL: pgsql/src/include/libpq/ip.h,v 1.20 2008/01/01 19:45:58 momjian Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef IP_H +#define IP_H + +#include "gtm/pqcomm.h" + + +extern int gtm_getaddrinfo_all(const char *hostname, const char *servname, + const struct addrinfo * hintp, + struct addrinfo ** result); +extern void gtm_freeaddrinfo_all(int hint_ai_family, struct addrinfo * ai); + +extern int gtm_getnameinfo_all(const struct sockaddr_storage * addr, int salen, + char *node, int nodelen, + char *service, int servicelen, + int flags); + +extern int gtm_range_sockaddr(const struct sockaddr_storage * addr, + const struct sockaddr_storage * netaddr, + const struct sockaddr_storage * netmask); + +extern int gtm_sockaddr_cidr_mask(struct sockaddr_storage * mask, + char *numbits, int family); + +#ifdef HAVE_IPV6 +extern void gtm_promote_v4_to_v6_addr(struct sockaddr_storage * addr); +extern void gtm_promote_v4_to_v6_mask(struct sockaddr_storage * addr); +#endif + +#ifdef HAVE_UNIX_SOCKETS +#define IS_AF_UNIX(fam) ((fam) == AF_UNIX) +#else +#define IS_AF_UNIX(fam) (0) +#endif + +#endif /* IP_H */ diff --git a/src/include/gtm/gtm_list.h b/src/include/gtm/gtm_list.h new file mode 100644 index 0000000000..6a5727f36a --- /dev/null +++ b/src/include/gtm/gtm_list.h @@ -0,0 +1,280 @@ +/*------------------------------------------------------------------------- + * + * pg_list.h + * interface for PostgreSQL generic linked list package + * + * This package implements singly-linked homogeneous lists. + * + * It is important to have constant-time length, append, and prepend + * operations. To achieve this, we deal with two distinct data + * structures: + * + * 1. 
A set of "list cells": each cell contains a data field and + * a link to the next cell in the list or NULL. + * 2. A single structure containing metadata about the list: the + * type of the list, pointers to the head and tail cells, and + * the length of the list. + * + * We support three types of lists: + * + * T_List: lists of pointers + * (in practice usually pointers to Nodes, but not always; + * declared as "void *" to minimize casting annoyances) + * T_IntList: lists of integers + * T_OidList: lists of Oids + * + * (At the moment, ints and Oids are the same size, but they may not + * always be so; try to be careful to maintain the distinction.) + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL: pgsql/src/include/nodes/pg_list.h,v 1.59 2008/08/14 18:48:00 tgl Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef GTM_LIST_H +#define GTM_LIST_H + + +typedef struct ListCell ListCell; + +typedef struct List +{ + int length; + ListCell *head; + ListCell *tail; +} List; + +struct ListCell +{ + union + { + void *ptr_value; + int int_value; + } data; + ListCell *next; +}; + +/* + * The *only* valid representation of an empty list is NIL; in other + * words, a non-NIL list is guaranteed to have length >= 1 and + * head/tail != NULL + */ +#define NIL ((List *) NULL) + +/* + * These routines are used frequently. However, we can't implement + * them as macros, since we want to avoid double-evaluation of macro + * arguments. Therefore, we implement them using GCC inline functions, + * and as regular functions with non-GCC compilers. + */ +#ifdef __GNUC__ + +static __inline__ ListCell * +list_head(List *l) +{ + return l ? l->head : NULL; +} + +static __inline__ ListCell * +list_tail(List *l) +{ + return l ? 
l->tail : NULL; +} + +static __inline__ int +list_length(List *l) +{ + return l ? l->length : 0; +} +#else + +extern ListCell *list_head(List *l); +extern ListCell *list_tail(List *l); +extern int list_length(List *l); +#endif /* __GNUC__ */ + +/* + * NB: There is an unfortunate legacy from a previous incarnation of + * the List API: the macro lfirst() was used to mean "the data in this + * cons cell". To avoid changing every usage of lfirst(), that meaning + * has been kept. As a result, lfirst() takes a ListCell and returns + * the data it contains; to get the data in the first cell of a + * List, use linitial(). Worse, lsecond() is more closely related to + * linitial() than lfirst(): given a List, lsecond() returns the data + * in the second cons cell. + */ + +#define lnext(lc) ((lc)->next) +#define lfirst(lc) ((lc)->data.ptr_value) +#define lfirst_int(lc) ((lc)->data.int_value) + +#define linitial(l) lfirst(list_head(l)) +#define linitial_int(l) lfirst_int(list_head(l)) + +#define lsecond(l) lfirst(lnext(list_head(l))) +#define lsecond_int(l) lfirst_int(lnext(list_head(l))) + +#define lthird(l) lfirst(lnext(lnext(list_head(l)))) +#define lthird_int(l) lfirst_int(lnext(lnext(list_head(l)))) + +#define lfourth(l) lfirst(lnext(lnext(lnext(list_head(l))))) +#define lfourth_int(l) lfirst_int(lnext(lnext(lnext(list_head(l))))) + +#define llast(l) lfirst(list_tail(l)) +#define llast_int(l) lfirst_int(list_tail(l)) + +/* + * Convenience macros for building fixed-length lists + */ +#define list_make1(x1) lcons(x1, NIL) +#define list_make2(x1,x2) lcons(x1, list_make1(x2)) +#define list_make3(x1,x2,x3) lcons(x1, list_make2(x2, x3)) +#define list_make4(x1,x2,x3,x4) lcons(x1, list_make3(x2, x3, x4)) + +#define list_make1_int(x1) lcons_int(x1, NIL) +#define list_make2_int(x1,x2) lcons_int(x1, list_make1_int(x2)) +#define list_make3_int(x1,x2,x3) lcons_int(x1, list_make2_int(x2, x3)) +#define list_make4_int(x1,x2,x3,x4) lcons_int(x1, list_make3_int(x2, x3, x4)) + +/* + * 
foreach - + * a convenience macro which loops through the list + */ +#define foreach(cell, l) \ + for ((cell) = list_head(l); (cell) != NULL; (cell) = lnext(cell)) + +/* + * for_each_cell - + * a convenience macro which loops through a list starting from a + * specified cell + */ +#define for_each_cell(cell, initcell) \ + for ((cell) = (initcell); (cell) != NULL; (cell) = lnext(cell)) + +/* + * forboth - + * a convenience macro for advancing through two linked lists + * simultaneously. This macro loops through both lists at the same + * time, stopping when either list runs out of elements. Depending + * on the requirements of the call site, it may also be wise to + * assert that the lengths of the two lists are equal. + */ +#define forboth(cell1, list1, cell2, list2) \ + for ((cell1) = list_head(list1), (cell2) = list_head(list2); \ + (cell1) != NULL && (cell2) != NULL; \ + (cell1) = lnext(cell1), (cell2) = lnext(cell2)) + +extern List *lappend(List *list, void *datum); +extern List *lappend_int(List *list, int datum); + +extern ListCell *lappend_cell(List *list, ListCell *prev, void *datum); +extern ListCell *lappend_cell_int(List *list, ListCell *prev, int datum); + +extern List *lcons(void *datum, List *list); +extern List *lcons_int(int datum, List *list); + +extern List *list_concat(List *list1, List *list2); +extern List *list_truncate(List *list, int new_size); + +extern void *list_nth(List *list, int n); +extern int list_nth_int(List *list, int n); + +extern bool list_member(List *list, void *datum); +extern bool list_member_ptr(List *list, void *datum); +extern bool list_member_int(List *list, int datum); + +extern List *list_delete(List *list, void *datum); +extern List *list_delete_ptr(List *list, void *datum); +extern List *list_delete_int(List *list, int datum); +extern List *list_delete_first(List *list); +extern List *list_delete_cell(List *list, ListCell *cell, ListCell *prev); + +extern List *list_union(List *list1, List *list2); +extern List 
*list_union_ptr(List *list1, List *list2); +extern List *list_union_int(List *list1, List *list2); + +extern List *list_intersection(List *list1, List *list2); +/* currently, there's no need for list_intersection_int etc */ + +extern List *list_difference(List *list1, List *list2); +extern List *list_difference_ptr(List *list1, List *list2); +extern List *list_difference_int(List *list1, List *list2); + +extern List *list_append_unique(List *list, void *datum); +extern List *list_append_unique_ptr(List *list, void *datum); +extern List *list_append_unique_int(List *list, int datum); + +extern List *list_concat_unique(List *list1, List *list2); +extern List *list_concat_unique_ptr(List *list1, List *list2); +extern List *list_concat_unique_int(List *list1, List *list2); + +extern void list_free(List *list); +extern void list_free_deep(List *list); + +extern List *list_copy(List *list); +extern List *list_copy_tail(List *list, int nskip); + +/* + * To ease migration to the new list API, a set of compatibility + * macros are provided that reduce the impact of the list API changes + * as far as possible. 
Until client code has been rewritten to use the + * new list API, the ENABLE_LIST_COMPAT symbol can be defined before + * including pg_list.h + */ +#ifdef ENABLE_LIST_COMPAT + +#define lfirsti(lc) lfirst_int(lc) + +#define makeList1(x1) list_make1(x1) +#define makeList2(x1, x2) list_make2(x1, x2) +#define makeList3(x1, x2, x3) list_make3(x1, x2, x3) +#define makeList4(x1, x2, x3, x4) list_make4(x1, x2, x3, x4) + +#define makeListi1(x1) list_make1_int(x1) +#define makeListi2(x1, x2) list_make2_int(x1, x2) + +#define lconsi(datum, list) lcons_int(datum, list) + +#define lappendi(list, datum) lappend_int(list, datum) + +#define nconc(l1, l2) list_concat(l1, l2) + +#define nth(n, list) list_nth(list, n) + +#define member(datum, list) list_member(list, datum) +#define ptrMember(datum, list) list_member_ptr(list, datum) +#define intMember(datum, list) list_member_int(list, datum) + +/* + * Note that the old lremove() determined equality via pointer + * comparison, whereas the new list_delete() uses equal(); in order to + * keep the same behavior, we therefore need to map lremove() calls to + * list_delete_ptr() rather than list_delete() + */ +#define lremove(elem, list) list_delete_ptr(list, elem) +#define LispRemove(elem, list) list_delete(list, elem) +#define lremovei(elem, list) list_delete_int(list, elem) + +#define ltruncate(n, list) list_truncate(list, n) + +#define set_union(l1, l2) list_union(l1, l2) +#define set_ptrUnion(l1, l2) list_union_ptr(l1, l2) + +#define set_difference(l1, l2) list_difference(l1, l2) +#define set_ptrDifference(l1, l2) list_difference_ptr(l1, l2) + +#define equali(l1, l2) equal(l1, l2) +#define equalo(l1, l2) equal(l1, l2) + +#define freeList(list) list_free(list) + +#define listCopy(list) list_copy(list) + +extern int length(List *list); +#endif /* ENABLE_LIST_COMPAT */ + +#endif /* GTM_LIST_H */ diff --git a/src/include/gtm/gtm_lock.h b/src/include/gtm/gtm_lock.h new file mode 100644 index 0000000000..f4a5e025ba --- /dev/null +++ 
b/src/include/gtm/gtm_lock.h @@ -0,0 +1,59 @@ +/*------------------------------------------------------------------------- + * + * gtm_lock.h + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ + +#ifndef GTM_LOCK_H +#define GTM_LOCK_H + +#include <pthread.h> + +typedef struct GTM_RWLock +{ + pthread_rwlock_t lk_lock; +} GTM_RWLock; + +typedef struct GTM_MutexLock +{ + pthread_mutex_t lk_lock; +} GTM_MutexLock; + +typedef enum GTM_LockMode +{ + GTM_LOCKMODE_WRITE, + GTM_LOCKMODE_READ +} GTM_LockMode; + +typedef struct GTM_CV +{ + pthread_cond_t cv_condvar; +} GTM_CV; + +extern bool GTM_RWLockAcquire(GTM_RWLock *lock, GTM_LockMode mode); +extern bool GTM_RWLockRelease(GTM_RWLock *lock); +extern int GTM_RWLockInit(GTM_RWLock *lock); +extern int GTM_RWLockDestroy(GTM_RWLock *lock); +extern bool GTM_RWLockConditionalAcquire(GTM_RWLock *lock, GTM_LockMode mode); + +extern bool GTM_MutexLockAcquire(GTM_MutexLock *lock); +extern bool GTM_MutexLockRelease(GTM_MutexLock *lock); +extern int GTM_MutexLockInit(GTM_MutexLock *lock); +extern int GTM_MutexLockDestroy(GTM_MutexLock *lock); +extern bool GTM_MutexLockConditionalAcquire(GTM_MutexLock *lock); + +extern int GTM_CVInit(GTM_CV *cv); +extern int GTM_CVDestroy(GTM_CV *cv); +extern int GTM_CVSignal(GTM_CV *cv); +extern int GTM_CVBcast(GTM_CV *cv); +extern int GTM_CVWait(GTM_CV *cv, GTM_MutexLock *lock); + +#endif diff --git a/src/include/gtm/gtm_msg.h b/src/include/gtm/gtm_msg.h new file mode 100644 index 0000000000..cae061437d --- /dev/null +++ b/src/include/gtm/gtm_msg.h @@ -0,0 +1,88 @@ +/*------------------------------------------------------------------------- + * + * gtm_msg.h + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL 
Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ +#ifndef GTM_MSG_H +#define GTM_MSG_H + +typedef enum GTM_MessageType +{ + MSG_TYPE_INVALID, + MSG_REGISTER_COORD, /* Register a Coordinator with GTM */ + MSG_UNREGISTER_COORD, /* Unregister a Coordinator with GTM */ + MSG_TXN_BEGIN, /* Start a new transaction */ + MSG_TXN_BEGIN_GETGXID, /* Start a new transaction and get GXID */ + MSG_TXN_BEGIN_GETGXID_MULTI, /* Start multiple new transactions and get GXIDs */ + MSG_TXN_PREPARE, /* Prepare a transaction for commit */ + MSG_TXN_COMMIT, /* Commit a running or prepared transaction */ + MSG_TXN_COMMIT_MULTI, /* Commit multiple running or prepared transactions */ + MSG_TXN_ROLLBACK, /* Rollback a transaction */ + MSG_TXN_ROLLBACK_MULTI, /* Rollback multiple transactions */ + MSG_TXN_GET_GXID, /* Get a GXID for a transaction */ + MSG_SNAPSHOT_GET, /* Get a global snapshot */ + MSG_SNAPSHOT_GET_MULTI, /* Get multiple global snapshots */ + MSG_SNAPSHOT_GXID_GET, /* Get GXID and snapshot together */ + MSG_SEQUENCE_INIT, /* Initialize a new global sequence */ + MSG_SEQUENCE_GET_CURRENT,/* Get the current value of sequence */ + MSG_SEQUENCE_GET_NEXT, /* Get the next sequence value of sequence */ + MSG_SEQUENCE_RESET, /* Reset the sequence */ + MSG_SEQUENCE_CLOSE, /* Close a previously inited sequence */ + MSG_TXN_GET_STATUS, /* Get status of a given transaction */ + MSG_TXN_GET_ALL_PREPARED, /* Get information about all outstanding + * prepared transactions */ + MSG_TXN_BEGIN_GETGXID_AUTOVACUUM, /* Start a new transaction and get GXID for autovacuum */ + MSG_DATA_FLUSH, /* flush pending data */ + MSG_BACKEND_DISCONNECT, /* tell GTM that the backend disconnected from the proxy */ + + /* + * Must be at the end + */ + MSG_TYPE_COUNT /* A dummy entry just
to count the message types */ +} GTM_MessageType; + +typedef enum GTM_ResultType +{ + TXN_BEGIN_RESULT, + TXN_BEGIN_GETGXID_RESULT, + TXN_BEGIN_GETGXID_MULTI_RESULT, + TXN_PREPARE_RESULT, + TXN_COMMIT_RESULT, + TXN_COMMIT_MULTI_RESULT, + TXN_ROLLBACK_RESULT, + TXN_ROLLBACK_MULTI_RESULT, + TXN_GET_GXID_RESULT, + SNAPSHOT_GET_RESULT, + SNAPSHOT_GET_MULTI_RESULT, + SNAPSHOT_GXID_GET_RESULT, + SEQUENCE_INIT_RESULT, + SEQUENCE_GET_CURRENT_RESULT, + SEQUENCE_GET_NEXT_RESULT, + SEQUENCE_RESET_RESULT, + SEQUENCE_CLOSE_RESULT, + TXN_GET_STATUS_RESULT, + TXN_GET_ALL_PREPARED_RESULT, + TXN_BEGIN_GETGXID_AUTOVACUUM_RESULT, +} GTM_ResultType; + +/* + * Special message header for the messages exchanged between the GTM server and + * the proxy. + * + * ph_conid: connection identifier which is used to route + * the messages to the right backend. + */ +typedef struct GTM_ProxyMsgHeader +{ + GTMProxy_ConnID ph_conid; +} GTM_ProxyMsgHeader; + +#endif diff --git a/src/include/gtm/gtm_proxy.h b/src/include/gtm/gtm_proxy.h new file mode 100644 index 0000000000..8dc16bca0e --- /dev/null +++ b/src/include/gtm/gtm_proxy.h @@ -0,0 +1,221 @@ +/*------------------------------------------------------------------------- + * + * gtm_proxy.h + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ +#ifndef _GTM_PROXY_H +#define _GTM_PROXY_H + +#include <setjmp.h> +#include <poll.h> + +#include "gtm/gtm_c.h" +#include "gtm/palloc.h" +#include "gtm/gtm_lock.h" +#include "gtm/gtm_conn.h" +#include "gtm/elog.h" +#include "gtm/gtm_list.h" +#include "gtm/gtm_msg.h" +#include "gtm/libpq-fe.h" + +extern char *GTMProxyLogFile; + +typedef enum GTMProxy_ThreadStatus +{ + GTM_PROXY_THREAD_STARTING, + GTM_PROXY_THREAD_RUNNING, +
GTM_PROXY_THREAD_EXITING, + /* Must be the last */ + GTM_PROXY_THREAD_INVALID +} GTMProxy_ThreadStatus; + +typedef struct GTMProxy_ConnectionInfo +{ + /* Port contains all the vital information about this connection */ + Port *con_port; + struct GTMProxy_ThreadInfo *con_thrinfo; + bool con_authenticated; + bool con_disconnected; + GTMProxy_ConnID con_id; + + GTM_MessageType con_pending_msg; + GlobalTransactionId con_txid; + GTM_TransactionHandle con_handle; +} GTMProxy_ConnectionInfo; + +typedef struct GTMProxy_Connections +{ + uint32 gc_conn_count; + uint32 gc_array_size; + GTMProxy_ConnectionInfo *gc_connections; + GTM_RWLock gc_lock; +} GTMProxy_Connections; + +#define ERRORDATA_STACK_SIZE 5 +#define GTM_PROXY_MAX_CONNECTIONS 1024 + +typedef struct GTMProxy_ThreadInfo +{ + /* + * Thread specific information such as connection(s) served by it + */ + GTM_ThreadID thr_id; + uint32 thr_localid; + void * (* thr_startroutine)(void *); + + MemoryContext thr_thread_context; + MemoryContext thr_message_context; + MemoryContext thr_current_context; + MemoryContext thr_error_context; + MemoryContext thr_parent_context; + + sigjmp_buf *thr_sigjmp_buf; + + ErrorData thr_error_data[ERRORDATA_STACK_SIZE]; + int thr_error_stack_depth; + int thr_error_recursion_depth; + int thr_criticalsec_count; + + GTMProxy_ThreadStatus thr_status; + GTMProxy_ConnectionInfo *thr_conn; /* Current active */ + + /* + * The structure member type/sequence up to this point must match the + * GTM_ThreadInfo structure in gtm.h since they are shared in some common + * library routines such as elog.c. Keeping them in sync helps us use the + * same library for the proxy as well as the server. + */ + GTM_MutexLock thr_lock; + GTM_CV thr_cv; + + /* + * We use a sequence number to track the state of connection/fd array. + * Whenever a new connection is added or an existing connection is deleted + * from the connection array, the sequence number is incremented.
The + * thread main routine can then reconstruct the fd array again. + */ + int32 thr_seqno; + + /* number of connections served by this thread */ + uint32 thr_conn_count; + + /* connection array */ + GTMProxy_ConnectionInfo *thr_all_conns[GTM_PROXY_MAX_CONNECTIONS]; + struct pollfd thr_poll_fds[GTM_PROXY_MAX_CONNECTIONS]; + List *thr_processed_commands; + List *thr_pending_commands[MSG_TYPE_COUNT]; + + GTM_Conn *thr_gtm_conn; + +} GTMProxy_ThreadInfo; + +typedef struct GTMProxy_Threads +{ + uint32 gt_thread_count; + uint32 gt_array_size; + uint32 gt_next_worker; + GTMProxy_ThreadInfo **gt_threads; + GTM_RWLock gt_lock; +} GTMProxy_Threads; + +extern GTMProxy_Threads *GTMProxyThreads; + +int GTMProxy_ThreadAdd(GTMProxy_ThreadInfo *thrinfo); +int GTMProxy_ThreadRemove(GTMProxy_ThreadInfo *thrinfo); +int GTMProxy_ThreadJoin(GTMProxy_ThreadInfo *thrinfo); +void GTMProxy_ThreadExit(void); + +extern GTMProxy_ThreadInfo *GTMProxy_ThreadCreate(void *(* startroutine)(void *)); +extern GTMProxy_ThreadInfo * GTMProxy_GetThreadInfo(GTM_ThreadID thrid); +extern GTMProxy_ThreadInfo *GTMProxy_ThreadAddConnection(GTMProxy_ConnectionInfo *conninfo); +extern int GTMProxy_ThreadRemoveConnection(GTMProxy_ThreadInfo *thrinfo, + GTMProxy_ConnectionInfo *conninfo); + +/* + * Command data - the only relevant information right now is the XID + */ +typedef union GTMProxy_CommandData +{ + struct + { + bool rdonly; + GTM_IsolationLevel iso_level; + } cd_beg; + + struct + { + bool isgxid; + GlobalTransactionId gxid; + GTM_TransactionHandle handle; + } cd_rc; + + struct + { + bool isgxid; + GlobalTransactionId gxid; + GTM_TransactionHandle handle; + } cd_snap; +} GTMProxy_CommandData; + +/* + * Structures to be used for message proxying. There will be one such entry for + * each pending command from a backend. To keep it simple, we have a separate + * entry even if the commands are grouped together.
+ * + * An array of these entries is maintained which is sorted by the order in + * which the commands are sent to the GTM server. We expect the GTM server to + * respond back in the same order and the sorted array helps us in + * matching/confirming the responses. + */ +typedef struct GTMProxy_CommandInfo +{ + GTM_MessageType ci_mtype; + int ci_res_index; + GTMProxy_CommandData ci_data; + GTMProxy_ConnectionInfo *ci_conn; +} GTMProxy_CommandInfo; + +/* + * pthread keys to get thread specific information + */ +extern pthread_key_t threadinfo_key; +extern MemoryContext TopMostMemoryContext; +extern char *GTMLogFile; + +#define SetMyThreadInfo(thrinfo) pthread_setspecific(threadinfo_key, (thrinfo)) +#define GetMyThreadInfo ((GTMProxy_ThreadInfo *)pthread_getspecific(threadinfo_key)) + +#define TopMemoryContext (GetMyThreadInfo->thr_thread_context) +#define ThreadTopContext (GetMyThreadInfo->thr_thread_context) +#define MessageContext (GetMyThreadInfo->thr_message_context) +#define CurrentMemoryContext (GetMyThreadInfo->thr_current_context) +#define ErrorContext (GetMyThreadInfo->thr_error_context) +#define errordata (GetMyThreadInfo->thr_error_data) +#define recursion_depth (GetMyThreadInfo->thr_error_recursion_depth) +#define errordata_stack_depth (GetMyThreadInfo->thr_error_stack_depth) +#define CritSectionCount (GetMyThreadInfo->thr_criticalsec_count) + +#define PG_exception_stack (GetMyThreadInfo->thr_sigjmp_buf) +#define MyConnection (GetMyThreadInfo->thr_conn) +#define MyPort ((GetMyThreadInfo->thr_conn != NULL) ? 
\ + GetMyThreadInfo->thr_conn->con_port : \ + NULL) +#define MyThreadID (GetMyThreadInfo->thr_id) + +#define START_CRIT_SECTION() (CritSectionCount++) + +#define END_CRIT_SECTION() \ + do { \ + Assert(CritSectionCount > 0); \ + CritSectionCount--; \ + } while(0) + +#endif diff --git a/src/include/gtm/gtm_seq.h b/src/include/gtm/gtm_seq.h new file mode 100644 index 0000000000..6cb8cb3027 --- /dev/null +++ b/src/include/gtm/gtm_seq.h @@ -0,0 +1,75 @@ +/*------------------------------------------------------------------------- + * + * gtm_seq.h + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ +#ifndef GTM_SEQ_H +#define GTM_SEQ_H + +#include "gtm/stringinfo.h" + +/* Global sequence related structures */ + +typedef struct GTM_SeqInfo +{ + GTM_SequenceKey gs_key; + GTM_Sequence gs_value; + GTM_Sequence gs_init_value; + GTM_Sequence gs_increment_by; + GTM_Sequence gs_min_value; + GTM_Sequence gs_max_value; + bool gs_cycle; + bool gs_called; + + int32 gs_ref_count; + int32 gs_state; + GTM_RWLock gs_lock; +} GTM_SeqInfo; + +#define SEQ_STATE_ACTIVE 1 +#define SEQ_STATE_DELETED 2 + +#define SEQ_IS_ASCENDING(s) ((s)->gs_increment_by > 0) +#define SEQ_IS_CYCLE(s) ((s)->gs_cycle) +#define SEQ_IS_CALLED(s) ((s)->gs_called) + +#define SEQ_DEF_MAX_SEQVAL_ASCEND 0x7ffffffffffffffeLL +#define SEQ_DEF_MIN_SEQVAL_ASCEND 0x1 + +#define SEQ_DEF_MAX_SEQVAL_DESCEND -0x1 +#define SEQ_DEF_MIN_SEQVAL_DESCEND -0x7ffffffffffffffeLL + +#define SEQ_MAX_REFCOUNT 1024 + +/* SEQUENCE Management */ +void GTM_InitSeqManager(void); +int GTM_SeqOpen(GTM_SequenceKey seqkey, + GTM_Sequence increment_by, + GTM_Sequence minval, + GTM_Sequence maxval, + GTM_Sequence startval, + bool cycle); +int GTM_SeqClose(GTM_SequenceKey 
sqkey); +GTM_Sequence GTM_SeqGetNext(GTM_SequenceKey seqkey); +GTM_Sequence GTM_SeqGetCurrent(GTM_SequenceKey seqkey); +int GTM_SeqReset(GTM_SequenceKey seqkey); + + +void ProcessSequenceInitCommand(Port *myport, StringInfo message); +void ProcessSequenceGetCurrentCommand(Port *myport, StringInfo message); +void ProcessSequenceGetNextCommand(Port *myport, StringInfo message); +void ProcessSequenceResetCommand(Port *myport, StringInfo message); +void ProcessSequenceCloseCommand(Port *myport, StringInfo message); + +void GTM_SaveSeqInfo(int ctlfd); +void GTM_RestoreSeqInfo(int ctlfd); + +#endif diff --git a/src/include/gtm/gtm_txn.h b/src/include/gtm/gtm_txn.h new file mode 100644 index 0000000000..2d789463f7 --- /dev/null +++ b/src/include/gtm/gtm_txn.h @@ -0,0 +1,235 @@ +/*------------------------------------------------------------------------- + * + * gtm_txn.h + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ +#ifndef _GTM_TXN_H +#define _GTM_TXN_H + +#include "gtm/gtm_c.h" +#include "gtm/gtm_lock.h" +#include "gtm/gtm_list.h" +#include "gtm/stringinfo.h" + +/* ---------------- + * Special transaction ID values + * + * BootstrapGlobalTransactionId is the XID for "bootstrap" operations, and + * FrozenGlobalTransactionId is used for very old tuples. Both should + * always be considered valid. + * + * FirstNormalGlobalTransactionId is the first "normal" transaction id. + * Note: if you need to change it, you must change pg_class.h as well. 
+ * ---------------- + */ +#define BootstrapGlobalTransactionId ((GlobalTransactionId) 1) +#define FrozenGlobalTransactionId ((GlobalTransactionId) 2) +#define FirstNormalGlobalTransactionId ((GlobalTransactionId) 3) +#define MaxGlobalTransactionId ((GlobalTransactionId) 0xFFFFFFFF) + +/* ---------------- + * transaction ID manipulation macros + * ---------------- + */ +#define GlobalTransactionIdIsNormal(xid) ((xid) >= FirstNormalGlobalTransactionId) +#define GlobalTransactionIdEquals(id1, id2) ((id1) == (id2)) +#define GlobalTransactionIdStore(xid, dest) (*(dest) = (xid)) +#define StoreInvalidGlobalTransactionId(dest) (*(dest) = InvalidGlobalTransactionId) + +/* advance a transaction ID variable, handling wraparound correctly */ +#define GlobalTransactionIdAdvance(dest) \ + do { \ + (dest)++; \ + if ((dest) < FirstNormalGlobalTransactionId) \ + (dest) = FirstNormalGlobalTransactionId; \ + } while(0) + +/* back up a transaction ID variable, handling wraparound correctly */ +#define GlobalTransactionIdRetreat(dest) \ + do { \ + (dest)--; \ + } while ((dest) < FirstNormalGlobalTransactionId) + +typedef int XidStatus; + +#define TRANSACTION_STATUS_IN_PROGRESS 0x00 +#define TRANSACTION_STATUS_COMMITTED 0x01 +#define TRANSACTION_STATUS_ABORTED 0x02 + +/* + * prototypes for functions in transam/transam.c + */ +extern bool GlobalTransactionIdDidCommit(GlobalTransactionId transactionId); +extern bool GlobalTransactionIdDidAbort(GlobalTransactionId transactionId); +extern void GlobalTransactionIdAbort(GlobalTransactionId transactionId); +extern bool GlobalTransactionIdPrecedes(GlobalTransactionId id1, GlobalTransactionId id2); +extern bool GlobalTransactionIdPrecedesOrEquals(GlobalTransactionId id1, GlobalTransactionId id2); +extern bool GlobalTransactionIdFollows(GlobalTransactionId id1, GlobalTransactionId id2); +extern bool GlobalTransactionIdFollowsOrEquals(GlobalTransactionId id1, GlobalTransactionId id2); + +/* in transam/varsup.c */ +extern GlobalTransactionId 
GTM_GetGlobalTransactionId(GTM_TransactionHandle handle); +extern GlobalTransactionId GTM_GetGlobalTransactionIdMulti(GTM_TransactionHandle handle[], int txn_count); +extern GlobalTransactionId ReadNewGlobalTransactionId(void); +extern void SetGlobalTransactionIdLimit(GlobalTransactionId oldest_datfrozenxid); +extern void SetNextGlobalTransactionId(GlobalTransactionId gxid); +extern void GTM_SetShuttingDown(void); + +typedef enum GTM_States +{ + GTM_STARTING, + GTM_RUNNING, + GTM_SHUTTING_DOWN +} GTM_States; + +/* Global transaction states at the GTM */ +typedef enum GTM_TransactionStates +{ + GTM_TXN_STARTING, + GTM_TXN_IN_PROGRESS, + GTM_TXN_PREPARE_IN_PROGRESS, + GTM_TXN_PREPARED, + GTM_TXN_COMMIT_IN_PROGRESS, + GTM_TXN_COMMITTED, + GTM_TXN_ABORT_IN_PROGRESS, + GTM_TXN_ABORTED +} GTM_TransactionStates; + +typedef struct GTM_TransactionInfo +{ + GTM_TransactionHandle gti_handle; + GTM_ThreadID gti_thread_id; + + bool gti_in_use; + GlobalTransactionId gti_gxid; + GTM_TransactionStates gti_state; + PGXC_NodeId gti_coordid; + GlobalTransactionId gti_xmin; + GTM_IsolationLevel gti_isolevel; + bool gti_readonly; + GTMProxy_ConnID gti_backend_id; + uint32 gti_nodecount; + PGXC_NodeId *gti_nodes; + + GTM_SnapshotData gti_current_snapshot; + bool gti_snapshot_set; + + GTM_RWLock gti_lock; + bool gti_vacuum; +} GTM_TransactionInfo; + +#define GTM_MAX_2PC_NODES 16 +#define GTM_CheckTransactionHandle(x) ((x) >= 0 && (x) < GTM_MAX_GLOBAL_TRANSACTIONS) +#define GTM_IsTransSerializable(x) ((x)->gti_isolevel == GTM_ISOLATION_SERIALIZABLE) + +typedef struct GTM_Transactions +{ + uint32 gt_txn_count; + GTM_States gt_gtm_state; + + GTM_RWLock gt_XidGenLock; + + /* + * These fields are protected by XidGenLock + */ + GlobalTransactionId gt_nextXid; /* next XID to assign */ + + GlobalTransactionId gt_oldestXid; /* cluster-wide minimum datfrozenxid */ + GlobalTransactionId gt_xidVacLimit; /* start forcing autovacuums here */ + GlobalTransactionId gt_xidWarnLimit; /* start complaining 
here */ + GlobalTransactionId gt_xidStopLimit; /* refuse to advance nextXid beyond here */ + GlobalTransactionId gt_xidWrapLimit; /* where the world ends */ + + /* + * These fields are protected by TransArrayLock. + */ + GlobalTransactionId gt_latestCompletedXid; /* newest XID that has committed or + * aborted */ + + GlobalTransactionId gt_recent_global_xmin; + + int32 gt_lastslot; + GTM_TransactionInfo gt_transactions_array[GTM_MAX_GLOBAL_TRANSACTIONS]; + List *gt_open_transactions; + + GTM_RWLock gt_TransArrayLock; +} GTM_Transactions; + +extern GTM_Transactions GTMTransactions; + +#define GTM_CountOpenTransactions() (list_length(GTMTransactions.gt_open_transactions)) + +/* + * Two hash tables will be maintained to quickly find the + * GTM_TransactionInfo block given either the GXID or the GTM_TransactionHandle. + */ + +GTM_TransactionInfo *GTM_HandleToTransactionInfo(GTM_TransactionHandle handle); +GTM_TransactionHandle GTM_GXIDToHandle(GlobalTransactionId gxid); + +/* Transaction Control */ +void GTM_InitTxnManager(void); +GTM_TransactionHandle GTM_BeginTransaction(GTM_CoordinatorId coord_id, + GTM_IsolationLevel isolevel, + bool readonly); +int GTM_BeginTransactionMulti(GTM_CoordinatorId coord_id, + GTM_IsolationLevel isolevel[], + bool readonly[], + GTMProxy_ConnID connid[], + int txn_count, + GTM_TransactionHandle txns[]); +int GTM_RollbackTransaction(GTM_TransactionHandle txn); +int GTM_RollbackTransactionMulti(GTM_TransactionHandle txn[], int txn_count, int status[]); +int GTM_RollbackTransactionGXID(GlobalTransactionId gxid); +int GTM_CommitTransaction(GTM_TransactionHandle txn); +int GTM_CommitTransactionMulti(GTM_TransactionHandle txn[], int txn_count, int status[]); +int GTM_CommitTransactionGXID(GlobalTransactionId gxid); +int GTM_PrepareTransaction(GTM_TransactionHandle txn, + uint32 nodecnt, + PGXC_NodeId nodes[]); +int GTM_PrepareTransactionGXID(GlobalTransactionId gxid, + uint32 nodecnt, + PGXC_NodeId nodes[]); +uint32 
GTM_GetAllPrepared(GlobalTransactionId gxids[], uint32 gxidcnt); +GTM_TransactionStates GTM_GetStatus(GTM_TransactionHandle txn); +GTM_TransactionStates GTM_GetStatusGXID(GlobalTransactionId gxid); +int GTM_GetAllTransactions(GTM_TransactionInfo txninfo[], uint32 txncnt); +void GTM_RemoveAllTransInfos(int backend_id); + +GTM_Snapshot GTM_GetSnapshotData(GTM_TransactionInfo *my_txninfo, + GTM_Snapshot snapshot); +GTM_Snapshot GTM_GetTransactionSnapshot(GTM_TransactionHandle handle[], + int txn_count, int *status); +void GTM_FreeCachedTransInfo(void); + +void ProcessBeginTransactionCommand(Port *myport, StringInfo message); +void ProcessBeginTransactionCommandMulti(Port *myport, StringInfo message); +void ProcessBeginTransactionGetGXIDCommand(Port *myport, StringInfo message); +void ProcessCommitTransactionCommand(Port *myport, StringInfo message); +void ProcessRollbackTransactionCommand(Port *myport, StringInfo message); +void ProcessPrepareTransactionCommand(Port *myport, StringInfo message); +void ProcessGetGXIDTransactionCommand(Port *myport, StringInfo message); + +void ProcessBeginTransactionGetGXIDAutovacuumCommand(Port *myport, StringInfo message); +void ProcessBeginTransactionGetGXIDCommandMulti(Port *myport, StringInfo message); +void ProcessCommitTransactionCommandMulti(Port *myport, StringInfo message); +void ProcessRollbackTransactionCommandMulti(Port *myport, StringInfo message) ; + +void GTM_SaveTxnInfo(int ctlfd); +void GTM_RestoreTxnInfo(int ctlfd, GlobalTransactionId next_gxid); + +/* + * In gtm_snap.c + */ +void ProcessGetSnapshotCommand(Port *myport, StringInfo message, bool get_gxid); +void ProcessGetSnapshotCommandMulti(Port *myport, StringInfo message); +void GTM_FreeSnapshotData(GTM_Snapshot snapshot); +#endif diff --git a/src/include/gtm/ip.h b/src/include/gtm/ip.h new file mode 100644 index 0000000000..c5d975298b --- /dev/null +++ b/src/include/gtm/ip.h @@ -0,0 +1,50 @@ 
+/*------------------------------------------------------------------------- + * + * ip.h + * Definitions for IPv6-aware network access. + * + * These definitions are used by both frontend and backend code. Be careful + * what you include here! + * + * Copyright (c) 2003-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL: pgsql/src/include/libpq/ip.h,v 1.20 2008/01/01 19:45:58 momjian Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef IP_H +#define IP_H + +#include "gtm/pqcomm.h" + + +extern int pg_getaddrinfo_all(const char *hostname, const char *servname, + const struct addrinfo * hintp, + struct addrinfo ** result); +extern void pg_freeaddrinfo_all(int hint_ai_family, struct addrinfo * ai); + +extern int pg_getnameinfo_all(const struct sockaddr_storage * addr, int salen, + char *node, int nodelen, + char *service, int servicelen, + int flags); + +extern int pg_range_sockaddr(const struct sockaddr_storage * addr, + const struct sockaddr_storage * netaddr, + const struct sockaddr_storage * netmask); + +extern int pg_sockaddr_cidr_mask(struct sockaddr_storage * mask, + char *numbits, int family); + +#ifdef HAVE_IPV6 +extern void pg_promote_v4_to_v6_addr(struct sockaddr_storage * addr); +extern void pg_promote_v4_to_v6_mask(struct sockaddr_storage * addr); +#endif + +#ifdef HAVE_UNIX_SOCKETS +#define IS_AF_UNIX(fam) ((fam) == AF_UNIX) +#else +#define IS_AF_UNIX(fam) (0) +#endif + +#endif /* IP_H */ diff --git a/src/include/gtm/libpq-be.h b/src/include/gtm/libpq-be.h new file mode 100644 index 0000000000..0a795def67 --- /dev/null +++ b/src/include/gtm/libpq-be.h @@ -0,0 +1,86 @@ +/*------------------------------------------------------------------------- + * + * libpq_be.h + * This file contains definitions for structures and externs used + * by the postmaster during client authentication. 
+ * + * Note that this is backend-internal and is NOT exported to clients. + * Structs that need to be client-visible are in pqcomm.h. + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL: pgsql/src/include/libpq/libpq-be.h,v 1.69 2009/01/01 17:23:59 momjian Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef LIBPQ_BE_H +#define LIBPQ_BE_H + +#ifdef HAVE_SYS_TIME_H +#include <sys/time.h> +#endif +#ifdef HAVE_NETINET_TCP_H +#include <netinet/tcp.h> +#endif + +#include "gtm/pqcomm.h" + +/* + * This is used by the postmaster in its communication with frontends. It + * contains all state information needed during this communication before the + * backend is run. The Port structure is kept in malloc'd memory and is + * still available when a backend is running (see MyProcPort). The data + * it points to must also be malloc'd, or else palloc'd in TopMostMemoryContext, + * so that it survives into GTM_ThreadMain execution! + */ + +typedef struct Port +{ + int sock; /* File descriptor */ + SockAddr laddr; /* local addr (postmaster) */ + SockAddr raddr; /* remote addr (client) */ + char *remote_host; /* name (or ip addr) of remote host */ + char *remote_port; /* text rep of remote port */ + + GTMProxy_ConnID conn_id; /* RequestID of this command */ + + GTM_CoordinatorId coordinator_id; /* Coordinator ID */ + bool is_proxy; /* Is this a connection from GTM proxy ? */ +#define PQ_BUFFER_SIZE 8192 + + char PqSendBuffer[PQ_BUFFER_SIZE]; + int PqSendPointer; /* Next index to store a byte in PqSendBuffer */ + + char PqRecvBuffer[PQ_BUFFER_SIZE]; + int PqRecvPointer; /* Next index to read a byte from PqRecvBuffer */ + int PqRecvLength; /* End of data available in PqRecvBuffer */ + + /* + * TCP keepalive settings. 
+ * + * default values are 0 if AF_UNIX or not yet known; current values are 0 + * if AF_UNIX or using the default. Also, -1 in a default value means we + * were unable to find out the default (getsockopt failed). + */ + int default_keepalives_idle; + int default_keepalives_interval; + int default_keepalives_count; + int keepalives_idle; + int keepalives_interval; + int keepalives_count; +} Port; + +/* TCP keepalives configuration. These are no-ops on an AF_UNIX socket. */ + +extern int pq_getkeepalivesidle(Port *port); +extern int pq_getkeepalivesinterval(Port *port); +extern int pq_getkeepalivescount(Port *port); + +extern int pq_setkeepalivesidle(int idle, Port *port); +extern int pq_setkeepalivesinterval(int interval, Port *port); +extern int pq_setkeepalivescount(int count, Port *port); + +#endif /* LIBPQ_BE_H */ diff --git a/src/include/gtm/libpq-fe.h b/src/include/gtm/libpq-fe.h new file mode 100644 index 0000000000..2c5c2c4e04 --- /dev/null +++ b/src/include/gtm/libpq-fe.h @@ -0,0 +1,138 @@ +/*------------------------------------------------------------------------- + * + * libpq-fe.h + * This file contains definitions for structures and + * externs for functions used by frontend postgres applications. + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL: pgsql/src/interfaces/libpq/libpq-fe.h,v 1.145 2009/01/01 17:24:03 momjian Exp $ + * + *------------------------------------------------------------------------- + */ + +#ifndef LIBPQ_FE_H +#define LIBPQ_FE_H + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include <stdio.h> + +/* + * postgres_ext.h defines the backend's externally visible types, + * such as Oid. 
+ */ +#include "gtm/gtm_ext.h" + +/* + * Option flags for PQcopyResult + */ +#define PG_COPYRES_ATTRS 0x01 +#define PG_COPYRES_TUPLES 0x02 /* Implies PG_COPYRES_ATTRS */ +#define PG_COPYRES_EVENTS 0x04 +#define PG_COPYRES_NOTICEHOOKS 0x08 + +/* Application-visible enum types */ + +typedef enum +{ + /* + * Although it is okay to add to this list, values which become unused + * should never be removed, nor should constants be redefined - that would + * break compatibility with existing code. + */ + CONNECTION_OK, + CONNECTION_BAD, + /* Non-blocking mode only below here */ + + /* + * The existence of these should never be relied upon - they should only + * be used for user feedback or similar purposes. + */ + CONNECTION_STARTED, /* Waiting for connection to be made. */ + CONNECTION_MADE, /* Connection OK; waiting to send. */ + CONNECTION_AWAITING_RESPONSE, /* Waiting for a response from the + * postmaster. */ + CONNECTION_AUTH_OK, /* Received authentication; waiting for + * backend startup. */ + CONNECTION_SETENV, /* Negotiating environment. */ + CONNECTION_SSL_STARTUP, /* Negotiating SSL. */ + CONNECTION_NEEDED /* Internal state: connect() needed */ +} ConnStatusType; + +typedef enum +{ + PGRES_POLLING_FAILED = 0, + PGRES_POLLING_READING, /* These two indicate that one may */ + PGRES_POLLING_WRITING, /* use select before polling again. */ + PGRES_POLLING_OK, + PGRES_POLLING_ACTIVE /* unused; keep for awhile for backwards + * compatibility */ +} GTMClientPollingStatusType; + +/* ---------------- + * Structure for the conninfo parameter definitions returned by PQconndefaults + * or GTMPQconninfoParse. + * + * All fields except "val" point at static strings which must not be altered. + * "val" is either NULL or a malloc'd current-value string. GTMPQconninfoFree() + * will release both the val strings and the GTMPQconninfoOption array itself. 
+ * ---------------- + */ +typedef struct _GTMPQconninfoOption +{ + char *keyword; /* The keyword of the option */ + char *val; /* Option's current value, or NULL */ +} GTMPQconninfoOption; + +typedef struct gtm_conn GTM_Conn; + +/* ---------------- + * Exported functions of libpq + * ---------------- + */ + +/* === in fe-connect.c === */ + +/* make a new client connection to the backend */ +/* Asynchronous (non-blocking) */ +extern GTM_Conn *PQconnectGTMStart(const char *conninfo); +extern GTMClientPollingStatusType GTMPQconnectPoll(GTM_Conn *conn); + +/* Synchronous (blocking) */ +extern GTM_Conn *PQconnectGTM(const char *conninfo); + +/* close the current connection and free the GTM_Conn data structure */ +extern void GTMPQfinish(GTM_Conn *conn); + +/* parse connection options in same way as PQconnectGTM */ +extern GTMPQconninfoOption *GTMPQconninfoParse(const char *conninfo, char **errmsg); + +/* free the data structure returned by PQconndefaults() or GTMPQconninfoParse() */ +extern void GTMPQconninfoFree(GTMPQconninfoOption *connOptions); + +extern char *GTMPQhost(const GTM_Conn *conn); +extern char *GTMPQport(const GTM_Conn *conn); +extern ConnStatusType GTMPQstatus(const GTM_Conn *conn); +extern char *GTMPQerrorMessage(const GTM_Conn *conn); +extern int GTMPQsocket(const GTM_Conn *conn); + +/* Enable/disable tracing */ +extern void GTMPQtrace(GTM_Conn *conn, FILE *debug_port); +extern void GTMPQuntrace(GTM_Conn *conn); + +/* Force the write buffer to be written (or at least try) */ +extern int PQflush(GTM_Conn *conn); + +#define libpq_gettext(x) x + +#ifdef __cplusplus +} +#endif + +#endif /* LIBPQ_FE_H */ diff --git a/src/include/gtm/libpq-int.h b/src/include/gtm/libpq-int.h new file mode 100644 index 0000000000..5956de8ff2 --- /dev/null +++ b/src/include/gtm/libpq-int.h @@ -0,0 +1,129 @@ +/*------------------------------------------------------------------------- + * + * libpq-int.h + * This file contains internal definitions meant to be used only by + * 
the frontend libpq library, not by applications that call it. + * + * An application can include this file if it wants to bypass the + * official API defined by libpq-fe.h, but code that does so is much + * more likely to break across PostgreSQL releases than code that uses + * only the official API. + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL: pgsql/src/interfaces/libpq/libpq-int.h,v 1.139 2009/01/01 17:24:03 momjian Exp $ + * + *------------------------------------------------------------------------- + */ + +#ifndef LIBPQ_INT_H +#define LIBPQ_INT_H + +#include <time.h> +#include <sys/types.h> +#include <sys/time.h> +#include "gtm/pqcomm.h" +#include "gtm/pqexpbuffer.h" +#include "gtm/gtm_client.h" + +/* + * GTM_Conn stores all the state data associated with a single connection + * to a backend. + */ +struct gtm_conn +{ + /* Saved values of connection options */ + char *pghost; /* the machine on which the server is running */ + char *pghostaddr; /* the IPv4 address of the machine on which + * the server is running, in IPv4 + * numbers-and-dots notation. Takes precedence + * over above. */ + char *pgport; /* the server's communication port */ + char *connect_timeout; /* connection timeout (numeric string) */ + char *coordinator_id; /* coordinator id */ + int is_proxy; /* is this a connection to/from a proxy ? 
*/ + + /* Optional file to write trace info to */ + FILE *Pfdebug; + + /* Status indicators */ + ConnStatusType status; + + /* Connection data */ + int sock; /* Unix FD for socket, -1 if not connected */ + SockAddr laddr; /* Local address */ + SockAddr raddr; /* Remote address */ + + /* Transient state needed while establishing connection */ + struct addrinfo *addrlist; /* list of possible backend addresses */ + struct addrinfo *addr_cur; /* the one currently being tried */ + int addrlist_family; /* needed to know how to free addrlist */ + + /* Buffer for data received from backend and not yet processed */ + char *inBuffer; /* currently allocated buffer */ + int inBufSize; /* allocated size of buffer */ + int inStart; /* offset to first unconsumed data in buffer */ + int inCursor; /* next byte to tentatively consume */ + int inEnd; /* offset to first position after avail data */ + + /* Buffer for data not yet sent to backend */ + char *outBuffer; /* currently allocated buffer */ + int outBufSize; /* allocated size of buffer */ + int outCount; /* number of chars waiting in buffer */ + + /* State for constructing messages in outBuffer */ + int outMsgStart; /* offset to msg start (length word); if -1, + * msg has no length word */ + int outMsgEnd; /* offset to msg end (so far) */ + + /* Buffer for current error message */ + PQExpBufferData errorMessage; /* expansible string */ + + /* Buffer for receiving various parts of messages */ + PQExpBufferData workBuffer; /* expansible string */ + + /* Pointer to the result of last operation */ + GTM_Result *result; +}; + +/* === in fe-misc.c === */ + + /* + * "Get" and "Put" routines return 0 if successful, EOF if not. Note that for + * Get, EOF merely means the buffer is exhausted, not that there is + * necessarily any error. 
+ */ +extern int gtmpqCheckOutBufferSpace(size_t bytes_needed, GTM_Conn *conn); +extern int gtmpqCheckInBufferSpace(size_t bytes_needed, GTM_Conn *conn); +extern int gtmpqGetc(char *result, GTM_Conn *conn); +extern int gtmpqPutc(char c, GTM_Conn *conn); +extern int gtmpqGets(PQExpBuffer buf, GTM_Conn *conn); +extern int gtmpqGets_append(PQExpBuffer buf, GTM_Conn *conn); +extern int gtmpqPuts(const char *s, GTM_Conn *conn); +extern int gtmpqGetnchar(char *s, size_t len, GTM_Conn *conn); +extern int gtmpqPutnchar(const char *s, size_t len, GTM_Conn *conn); +extern int gtmpqGetInt(int *result, size_t bytes, GTM_Conn *conn); +extern int gtmpqPutInt(int value, size_t bytes, GTM_Conn *conn); +extern int gtmpqPutMsgStart(char msg_type, bool force_len, GTM_Conn *conn); +extern int gtmpqPutMsgEnd(GTM_Conn *conn); +extern int gtmpqReadData(GTM_Conn *conn); +extern int gtmpqFlush(GTM_Conn *conn); +extern int gtmpqWait(int forRead, int forWrite, GTM_Conn *conn); +extern int gtmpqWaitTimed(int forRead, int forWrite, GTM_Conn *conn, + time_t finish_time); +extern int gtmpqReadReady(GTM_Conn *conn); +extern int gtmpqWriteReady(GTM_Conn *conn); + +/* + * In fe-protocol.c (declared extern like every other prototype in this header) + */ +extern GTM_Result *GTMPQgetResult(GTM_Conn *conn); +extern int gtmpqGetError(GTM_Conn *conn, GTM_Result *result); +extern void gtmpqFreeResultData(GTM_Result *result, bool is_proxy); + +#define SOCK_ERRNO errno +#define SOCK_ERRNO_SET(e) (errno = (e)) + +#endif /* LIBPQ_INT_H */ diff --git a/src/include/gtm/libpq.h b/src/include/gtm/libpq.h new file mode 100644 index 0000000000..29621a43c4 --- /dev/null +++ b/src/include/gtm/libpq.h @@ -0,0 +1,47 @@ +/*------------------------------------------------------------------------- + * + * libpq.h + * POSTGRES LIBPQ buffer structure definitions. 
+ * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL: pgsql/src/include/libpq/libpq.h,v 1.70 2008/11/20 09:29:36 mha Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef LIBPQ_H +#define LIBPQ_H + +#include <sys/types.h> +#include <netinet/in.h> + +#include "gtm/stringinfo.h" +#include "gtm/libpq-be.h" + +/* + * External functions. + */ + +/* + * prototypes for functions in pqcomm.c + */ +extern int StreamServerPort(int family, char *hostName, + unsigned short portNumber, int ListenSocket[], + int MaxListen); +extern int StreamConnection(int server_fd, Port *port); +extern void StreamClose(int sock); +extern void TouchSocketFile(void); +extern void pq_comm_reset(void); +extern int pq_getbytes(Port *myport, char *s, size_t len); +extern int pq_getstring(Port *myport, StringInfo s); +extern int pq_getmessage(Port *myport, StringInfo s, int maxlen); +extern int pq_getbyte(Port *myport); +extern int pq_peekbyte(Port *myport); +extern int pq_putbytes(Port *myport, const char *s, size_t len); +extern int pq_flush(Port *myport); +extern int pq_putmessage(Port *myport, char msgtype, const char *s, size_t len); + +#endif /* LIBPQ_H */ diff --git a/src/include/gtm/memnodes.h b/src/include/gtm/memnodes.h new file mode 100644 index 0000000000..dea51b2bbd --- /dev/null +++ b/src/include/gtm/memnodes.h @@ -0,0 +1,79 @@ +/*------------------------------------------------------------------------- + * + * memnodes.h + * POSTGRES memory context node definitions. 
+ * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL: pgsql/src/include/nodes/memnodes.h,v 1.34 2008/01/01 19:45:58 momjian Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef MEMNODES_H +#define MEMNODES_H + +#include "gtm/gtm_lock.h" + +/* + * MemoryContext + * A logical context in which memory allocations occur. + * + * MemoryContext itself is an abstract type that can have multiple + * implementations, though for now we have only AllocSetContext. + * The function pointers in MemoryContextMethods define one specific + * implementation of MemoryContext --- they are a virtual function table + * in C++ terms. + * + * Note: for largely historical reasons, typedef MemoryContext is a pointer + * to the context struct rather than the struct type itself. 
+ */ + +typedef struct MemoryContextMethods +{ + void *(*alloc) (MemoryContext context, Size size); + /* call this free_p in case someone #define's free() */ + void (*free_p) (MemoryContext context, void *pointer); + void *(*realloc) (MemoryContext context, void *pointer, Size size); + void (*init) (MemoryContext context); + void (*reset) (MemoryContext context); + void (*delete) (MemoryContext context); + Size (*get_chunk_space) (MemoryContext context, void *pointer); + bool (*is_empty) (MemoryContext context); + void (*stats) (MemoryContext context, int level); +#ifdef MEMORY_CONTEXT_CHECKING + void (*check) (MemoryContext context); +#endif +} MemoryContextMethods; + + +typedef struct MemoryContextData +{ + MemoryContextMethods *methods; /* virtual function table */ + MemoryContext parent; /* NULL if no parent (toplevel context) */ + MemoryContext firstchild; /* head of linked list of children */ + MemoryContext nextchild; /* next child of same parent */ + char *name; /* context name (just for debugging) */ + bool is_shared; /* context is shared by threads */ + GTM_RWLock lock; /* lock to protect members if the context is shared */ +} MemoryContextData; + +#define MemoryContextIsShared(context) \ + (((MemoryContextData *)(context))->is_shared) + +#define MemoryContextLock(context) \ + (GTM_RWLockAcquire(&((MemoryContextData *)(context))->lock, GTM_LOCKMODE_WRITE)) +#define MemoryContextUnlock(context) \ + (GTM_RWLockRelease(&((MemoryContextData *)(context))->lock)) +/* + * MemoryContextIsValid + * True iff memory context is valid. + * + * Add new context types to the set accepted by this macro. 
+ */ +#define MemoryContextIsValid(context) \ + ((context) != NULL) + +#endif /* MEMNODES_H */ diff --git a/src/include/gtm/memutils.h b/src/include/gtm/memutils.h new file mode 100644 index 0000000000..5d89995d4d --- /dev/null +++ b/src/include/gtm/memutils.h @@ -0,0 +1,123 @@ +/*------------------------------------------------------------------------- + * + * memutils.h + * This file contains declarations for memory allocation utility + * functions. These are functions that are not quite widely used + * enough to justify going in gtm/palloc.h, but are still part + * of the API of the memory management subsystem. + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL: pgsql/src/include/utils/memutils.h,v 1.64 2008/01/01 19:45:59 momjian Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef MEMUTILS_H +#define MEMUTILS_H + +#include "gtm/gtm_c.h" +#include "gtm/palloc.h" +#include "gtm/memnodes.h" + +/* + * MaxAllocSize + * Quasi-arbitrary limit on size of allocations. + * + * Note: + * There is no guarantee that allocations smaller than MaxAllocSize + * will succeed. Allocation requests larger than MaxAllocSize will + * be summarily denied. + * + * XXX This is deliberately chosen to correspond to the limiting size + * of varlena objects under TOAST. See VARATT_MASK_SIZE in postgres.h. + * + * XXX Also, various places in aset.c assume they can compute twice an + * allocation's size without overflow, so beware of raising this. + */ +#define MaxAllocSize ((Size) 0x3fffffff) /* 1 gigabyte - 1 */ + +#define AllocSizeIsValid(size) ((Size) (size) <= MaxAllocSize) + +/* + * All chunks allocated by any memory context manager are required to be + * preceded by a StandardChunkHeader at a spacing of STANDARDCHUNKHEADERSIZE. 
+ * A currently-allocated chunk must contain a backpointer to its owning + * context as well as the allocated size of the chunk. The backpointer is + * used by pfree() and repalloc() to find the context to call. The allocated + * size is not absolutely essential, but it's expected to be needed by any + * reasonable implementation. + */ +typedef struct StandardChunkHeader +{ + MemoryContext context; /* owning context */ + Size size; /* size of data space allocated in chunk */ +#ifdef MEMORY_CONTEXT_CHECKING + /* when debugging memory usage, also store actual requested size */ + Size requested_size; +#endif +} StandardChunkHeader; + +#define STANDARDCHUNKHEADERSIZE MAXALIGN(sizeof(StandardChunkHeader)) + +/* + * Memory-context-type-independent functions in mcxt.c + */ +extern void MemoryContextInit(void); +extern void MemoryContextReset(MemoryContext context); +extern void MemoryContextDelete(MemoryContext context); +extern void MemoryContextResetChildren(MemoryContext context); +extern void MemoryContextDeleteChildren(MemoryContext context); +extern void MemoryContextResetAndDeleteChildren(MemoryContext context); +extern Size GetMemoryChunkSpace(void *pointer); +extern MemoryContext GetMemoryChunkContext(void *pointer); +extern bool MemoryContextIsEmpty(MemoryContext context); +extern void MemoryContextStats(MemoryContext context); + +#ifdef MEMORY_CONTEXT_CHECKING +extern void MemoryContextCheck(MemoryContext context); +#endif +extern bool MemoryContextContains(MemoryContext context, void *pointer); + +/* + * This routine handles the context-type-independent part of memory + * context creation. It's intended to be called from context-type- + * specific creation routines, and noplace else. 
+ */ +extern MemoryContext MemoryContextCreate(Size size, + MemoryContextMethods *methods, + MemoryContext parent, + const char *name); + + +/* + * Memory-context-type-specific functions + */ + +/* aset.c */ +extern MemoryContext AllocSetContextCreate(MemoryContext parent, + const char *name, + Size minContextSize, + Size initBlockSize, + Size maxBlockSize, + bool isShared); + +/* + * Recommended default alloc parameters, suitable for "ordinary" contexts + * that might hold quite a lot of data. + */ +#define ALLOCSET_DEFAULT_MINSIZE 0 +#define ALLOCSET_DEFAULT_INITSIZE (8 * 1024) +#define ALLOCSET_DEFAULT_MAXSIZE (8 * 1024 * 1024) + +/* + * Recommended alloc parameters for "small" contexts that are not expected + * to contain much data (for example, a context to contain a query plan). + */ +#define ALLOCSET_SMALL_MINSIZE 0 +#define ALLOCSET_SMALL_INITSIZE (1 * 1024) +#define ALLOCSET_SMALL_MAXSIZE (8 * 1024) + +#endif /* MEMUTILS_H */ diff --git a/src/include/gtm/palloc.h b/src/include/gtm/palloc.h new file mode 100644 index 0000000000..380e280694 --- /dev/null +++ b/src/include/gtm/palloc.h @@ -0,0 +1,90 @@ +/*------------------------------------------------------------------------- + * + * palloc.h + * POSTGRES memory allocator definitions. + * + * This file contains the basic memory allocation interface that is + * needed by almost every backend module. It is included directly by + * postgres.h, so the definitions here are automatically available + * everywhere. Keep it lean! + * + * Memory allocation occurs within "contexts". Every chunk obtained from + * palloc()/MemoryContextAlloc() is allocated within a specific context. + * The entire contents of a context can be freed easily and quickly by + * resetting or deleting the context --- this is both faster and less + * prone to memory-leakage bugs than releasing chunks individually. 
+ * We organize contexts into context trees to allow fine-grain control + * over chunk lifetime while preserving the certainty that we will free + * everything that should be freed. See utils/mmgr/README for more info. + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL: pgsql/src/include/utils/palloc.h,v 1.40 2008/06/28 16:45:22 tgl Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef PALLOC_H +#define PALLOC_H + +/* + * Type MemoryContextData is declared in gtm/memnodes.h. Most users + * of memory allocation should just treat it as an abstract type, so we + * do not provide the struct contents here. + */ +typedef struct MemoryContextData *MemoryContext; + +/* + * Fundamental memory-allocation operations (more are in gtm/memutils.h) + */ +extern void *MemoryContextAlloc(MemoryContext context, Size size); +extern void *MemoryContextAllocZero(MemoryContext context, Size size); +extern void *MemoryContextAllocZeroAligned(MemoryContext context, Size size); + +#define palloc(sz) MemoryContextAlloc(CurrentMemoryContext, (sz)) + +#define palloc0(sz) MemoryContextAllocZero(CurrentMemoryContext, (sz)) + +/* + * The result of palloc() is always word-aligned, so we can skip testing + * alignment of the pointer when deciding which MemSet variant to use. + * Note that this variant does not offer any advantage, and should not be + * used, unless its "sz" argument is a compile-time constant; therefore, the + * issue that it evaluates the argument multiple times isn't a problem in + * practice. + */ +#define palloc0fast(sz) \ + ( MemSetTest(0, sz) ? 
\ + MemoryContextAllocZeroAligned(CurrentMemoryContext, sz) : \ + MemoryContextAllocZero(CurrentMemoryContext, sz) ) + +extern void pfree(void *pointer); + +extern void *repalloc(void *pointer, Size size); + +/* + * MemoryContextSwitchTo can't be a macro in standard C compilers, + * so this header declares it as an ordinary (non-inline) function. + */ + +extern MemoryContext MemoryContextSwitchTo(MemoryContext context); + +/* + * These are like standard strdup() except the copied string is + * allocated in a context, not with malloc(). + */ +extern char *MemoryContextStrdup(MemoryContext context, const char *string); + +#define pstrdup(str) MemoryContextStrdup(CurrentMemoryContext, (str)) + +extern char *pnstrdup(const char *in, Size len); + +#if defined(WIN32) || defined(__CYGWIN__) +extern void *pgport_palloc(Size sz); +extern char *pgport_pstrdup(const char *str); +extern void pgport_pfree(void *pointer); +#endif + +#endif /* PALLOC_H */ diff --git a/src/include/gtm/path.h b/src/include/gtm/path.h new file mode 100644 index 0000000000..624fd183c9 --- /dev/null +++ b/src/include/gtm/path.h @@ -0,0 +1,16 @@ +/*------------------------------------------------------------------------- + * + * path.h + * Declaration of canonicalize_path(). + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ +#include "gtm/gtm_c.h" + +extern void canonicalize_path(char *path); diff --git a/src/include/gtm/pqcomm.h b/src/include/gtm/pqcomm.h new file mode 100644 index 0000000000..cdae6ca284 --- /dev/null +++ b/src/include/gtm/pqcomm.h @@ -0,0 +1,57 @@ +/*------------------------------------------------------------------------- + * + * pqcomm.h + * Definitions common to frontends and backends. 
+ * + * NOTE: for historical reasons, this does not correspond to pqcomm.c. + * pqcomm.c's routines are declared in libpq.h. + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL: pgsql/src/include/libpq/pqcomm.h,v 1.109 2008/10/28 12:10:44 mha Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef PQCOMM_H +#define PQCOMM_H + +#include <sys/socket.h> +#include <netdb.h> +#ifdef HAVE_SYS_UN_H +#include <sys/un.h> +#endif +#include <netinet/in.h> + +typedef struct +{ + struct sockaddr_storage addr; + size_t salen; +} SockAddr; + +/* Configure the UNIX socket location for the well known port. */ + +#define UNIXSOCK_PATH(path, port, sockdir) \ + snprintf(path, sizeof(path), "%s/.s.PGSQL.%d", \ + ((sockdir) && *(sockdir) != '\0') ? (sockdir) : \ + DEFAULT_PGSOCKET_DIR, \ + (port)) + +/* + * Packet lengths are 4 bytes in network byte order. + * + * The initial length is omitted from the packet layouts appearing below. + */ + +typedef uint32 PacketLen; + +/* + * In protocol 3.0 and later, the startup packet length is not fixed, but + * we set an arbitrary limit on it anyway. This is just to prevent simple + * denial-of-service attacks via sending enough data to run the server + * out of memory. + */ +#define MAX_STARTUP_PACKET_LENGTH 10000 + +#endif /* PQCOMM_H */ diff --git a/src/include/gtm/pqexpbuffer.h b/src/include/gtm/pqexpbuffer.h new file mode 100644 index 0000000000..7ae0411423 --- /dev/null +++ b/src/include/gtm/pqexpbuffer.h @@ -0,0 +1,181 @@ +/*------------------------------------------------------------------------- + * + * pqexpbuffer.h + * Declarations/definitions for "PQExpBuffer" functions. + * + * PQExpBuffer provides an indefinitely-extensible string data type. 
+ * It can be used to buffer either ordinary C strings (null-terminated text) + * or arbitrary binary data. All storage is allocated with malloc(). + * + * This module is essentially the same as the backend's StringInfo data type, + * but it is intended for use in frontend libpq and client applications. + * Thus, it does not rely on palloc() nor elog(). + * + * It does rely on vsnprintf(); if configure finds that libc doesn't provide + * a usable vsnprintf(), then a copy of our own implementation of it will + * be linked into libpq. + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL: pgsql/src/interfaces/libpq/pqexpbuffer.h,v 1.21 2008/11/26 16:23:11 tgl Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef PQEXPBUFFER_H +#define PQEXPBUFFER_H + +/*------------------------- + * PQExpBufferData holds information about an extensible string. + * data is the current buffer for the string (allocated with malloc). + * len is the current string length. There is guaranteed to be + * a terminating '\0' at data[len], although this is not very + * useful when the string holds binary data rather than text. + * maxlen is the allocated size in bytes of 'data', i.e. the maximum + * string size (including the terminating '\0' char) that we can + * currently store in 'data' without having to reallocate + * more space. We must always have maxlen > len. + * + * An exception occurs if we failed to allocate enough memory for the string + * buffer. In that case data points to a statically allocated empty string, + * and len = maxlen = 0. 
+ *------------------------- + */ +typedef struct PQExpBufferData +{ + char *data; + size_t len; + size_t maxlen; +} PQExpBufferData; + +typedef PQExpBufferData *PQExpBuffer; + +/*------------------------ + * Test for a broken (out of memory) PQExpBuffer. + * When a buffer is "broken", all operations except resetting or deleting it + * are no-ops. + *------------------------ + */ +#define PQExpBufferBroken(str) \ + ((str) == NULL || (str)->maxlen == 0) + +/*------------------------ + * Initial size of the data buffer in a PQExpBuffer. + * NB: this must be large enough to hold error messages that might + * be returned by PQrequestCancel(). + *------------------------ + */ +#define INITIAL_EXPBUFFER_SIZE 256 + +/*------------------------ + * There are two ways to create a PQExpBuffer object initially: + * + * PQExpBuffer stringptr = createGTMPQExpBuffer(); + * Both the PQExpBufferData and the data buffer are malloc'd. + * + * PQExpBufferData string; + * initGTMPQExpBuffer(&string); + * The data buffer is malloc'd but the PQExpBufferData is presupplied. + * This is appropriate if the PQExpBufferData is a field of another + * struct. + *------------------------- + */ + +/*------------------------ + * createGTMPQExpBuffer + * Create an empty 'PQExpBufferData' & return a pointer to it. + */ +extern PQExpBuffer createGTMPQExpBuffer(void); + +/*------------------------ + * initGTMPQExpBuffer + * Initialize a PQExpBufferData struct (with previously undefined contents) + * to describe an empty string. + */ +extern void initGTMPQExpBuffer(PQExpBuffer str); + +/*------------------------ + * To destroy a PQExpBuffer, use either: + * + * destroyGTMPQExpBuffer(str); + * free()s both the data buffer and the PQExpBufferData. + * This is the inverse of createGTMPQExpBuffer(). + * + * termGTMPQExpBuffer(str) + * free()s the data buffer but not the PQExpBufferData itself. + * This is the inverse of initGTMPQExpBuffer(). 
+ * + * NOTE: some routines build up a string using PQExpBuffer, and then + * release the PQExpBufferData but return the data string itself to their + * caller. At that point the data string looks like a plain malloc'd + * string. + */ +extern void destroyGTMPQExpBuffer(PQExpBuffer str); +extern void termGTMPQExpBuffer(PQExpBuffer str); + +/*------------------------ + * resetGTMPQExpBuffer + * Reset a PQExpBuffer to empty + * + * Note: if possible, a "broken" PQExpBuffer is returned to normal. + */ +extern void resetGTMPQExpBuffer(PQExpBuffer str); + +/*------------------------ + * enlargeGTMPQExpBuffer + * Make sure there is enough space for 'needed' more bytes in the buffer + * ('needed' does not include the terminating null). + * + * Returns 1 if OK, 0 if failed to enlarge buffer. (In the latter case + * the buffer is left in "broken" state.) + */ +extern int enlargeGTMPQExpBuffer(PQExpBuffer str, size_t needed); + +/*------------------------ + * printfGTMPQExpBuffer + * Format text data under the control of fmt (an sprintf-like format string) + * and insert it into str. More space is allocated to str if necessary. + * This is a convenience routine that does the same thing as + * resetGTMPQExpBuffer() followed by appendGTMPQExpBuffer(). + */ +extern void +printfGTMPQExpBuffer(PQExpBuffer str, const char *fmt,...) +/* This extension allows gcc to check the format string */ +__attribute__((format(printf, 2, 3))); + +/*------------------------ + * appendGTMPQExpBuffer + * Format text data under the control of fmt (an sprintf-like format string) + * and append it to whatever is already in str. More space is allocated + * to str if necessary. This is sort of like a combination of sprintf and + * strcat. + */ +extern void +appendGTMPQExpBuffer(PQExpBuffer str, const char *fmt,...) 
+/* This extension allows gcc to check the format string */ +__attribute__((format(printf, 2, 3))); + +/*------------------------ + * appendGTMPQExpBufferStr + * Append the given string to a PQExpBuffer, allocating more space + * if necessary. + */ +extern void appendGTMPQExpBufferStr(PQExpBuffer str, const char *data); + +/*------------------------ + * appendGTMPQExpBufferChar + * Append a single byte to str. + * Like appendGTMPQExpBuffer(str, "%c", ch) but much faster. + */ +extern void appendGTMPQExpBufferChar(PQExpBuffer str, char ch); + +/*------------------------ + * appendBinaryGTMPQExpBuffer + * Append arbitrary binary data to a PQExpBuffer, allocating more space + * if necessary. + */ +extern void appendBinaryGTMPQExpBuffer(PQExpBuffer str, + const char *data, size_t datalen); + +#endif /* PQEXPBUFFER_H */ diff --git a/src/include/gtm/pqformat.h b/src/include/gtm/pqformat.h new file mode 100644 index 0000000000..3febf2cf2e --- /dev/null +++ b/src/include/gtm/pqformat.h @@ -0,0 +1,48 @@ +/*------------------------------------------------------------------------- + * + * pqformat.h + * Definitions for formatting and parsing frontend/backend messages + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL: pgsql/src/include/libpq/pqformat.h,v 1.27 2009/01/01 17:23:59 momjian Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef PQFORMAT_H +#define PQFORMAT_H + +#include "gtm/stringinfo.h" + +extern void pq_beginmessage(StringInfo buf, char msgtype); +extern void pq_sendbyte(StringInfo buf, int byt); +extern void pq_sendbytes(StringInfo buf, const char *data, int datalen); +extern void pq_sendcountedtext(StringInfo buf, const char *str, int slen, + bool countincludesself); +extern void pq_sendtext(StringInfo buf, const char 
*str, int slen); +extern void pq_sendstring(StringInfo buf, const char *str); +extern void pq_send_ascii_string(StringInfo buf, const char *str); +extern void pq_sendint(StringInfo buf, int i, int b); +extern void pq_sendint64(StringInfo buf, int64 i); +extern void pq_sendfloat4(StringInfo buf, float4 f); +extern void pq_sendfloat8(StringInfo buf, float8 f); +extern void pq_endmessage(Port *myport, StringInfo buf); + +extern void pq_puttextmessage(Port *myport, char msgtype, const char *str); +extern void pq_putemptymessage(Port *myport, char msgtype); + +extern int pq_getmsgbyte(StringInfo msg); +extern unsigned int pq_getmsgint(StringInfo msg, int b); +extern int64 pq_getmsgint64(StringInfo msg); +extern float4 pq_getmsgfloat4(StringInfo msg); +extern float8 pq_getmsgfloat8(StringInfo msg); +extern const char *pq_getmsgbytes(StringInfo msg, int datalen); +extern void pq_copymsgbytes(StringInfo msg, char *buf, int datalen); +extern char *pq_getmsgtext(StringInfo msg, int rawbytes, int *nbytes); +extern const char *pq_getmsgstring(StringInfo msg); +extern void pq_getmsgend(StringInfo msg); +extern int pq_getmsgunreadlen(StringInfo msg); + +#endif /* PQFORMAT_H */ diff --git a/src/include/gtm/pqsignal.h b/src/include/gtm/pqsignal.h new file mode 100644 index 0000000000..e3a53dc3ed --- /dev/null +++ b/src/include/gtm/pqsignal.h @@ -0,0 +1,49 @@ +/*------------------------------------------------------------------------- + * + * pqsignal.h + * prototypes for the reliable BSD-style signal(2) routine. + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL: pgsql/src/include/libpq/pqsignal.h,v 1.32 2008/01/01 19:45:58 momjian Exp $ + * + * NOTES + * This shouldn't be in libpq, but the monitor and some other + * things need it... 
+ * + *------------------------------------------------------------------------- + */ +#ifndef PQSIGNAL_H +#define PQSIGNAL_H + +#include <signal.h> + +#ifdef HAVE_SIGPROCMASK +extern sigset_t UnBlockSig, + BlockSig, + AuthBlockSig; + +#define PG_SETMASK(mask) sigprocmask(SIG_SETMASK, mask, NULL) +#else +extern int UnBlockSig, + BlockSig, + AuthBlockSig; + +#ifndef WIN32 +#define PG_SETMASK(mask) sigsetmask(*((int*)(mask))) +#else +#define PG_SETMASK(mask) pqsigsetmask(*((int*)(mask))) +int pqsigsetmask(int mask); +#endif +#endif + +typedef void (*pqsigfunc) (int); + +extern void pqinitmask(void); + +extern pqsigfunc pqsignal(int signo, pqsigfunc func); + +#endif /* PQSIGNAL_H */ diff --git a/src/include/gtm/stringinfo.h b/src/include/gtm/stringinfo.h new file mode 100644 index 0000000000..197aa877a1 --- /dev/null +++ b/src/include/gtm/stringinfo.h @@ -0,0 +1,149 @@ +/*------------------------------------------------------------------------- + * + * stringinfo.h + * Declarations/definitions for "StringInfo" functions. + * + * StringInfo provides an indefinitely-extensible string data type. + * It can be used to buffer either ordinary C strings (null-terminated text) + * or arbitrary binary data. All storage is allocated with palloc(). + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * $PostgreSQL: pgsql/src/include/lib/stringinfo.h,v 1.35 2008/01/01 19:45:57 momjian Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef STRINGINFO_H +#define STRINGINFO_H + +/*------------------------- + * StringInfoData holds information about an extensible string. + * data is the current buffer for the string (allocated with palloc). + * len is the current string length. 
There is guaranteed to be + * a terminating '\0' at data[len], although this is not very + * useful when the string holds binary data rather than text. + * maxlen is the allocated size in bytes of 'data', i.e. the maximum + * string size (including the terminating '\0' char) that we can + * currently store in 'data' without having to reallocate + * more space. We must always have maxlen > len. + * cursor is initialized to zero by makeStringInfo or initStringInfo, + * but is not otherwise touched by the stringinfo.c routines. + * Some routines use it to scan through a StringInfo. + *------------------------- + */ +typedef struct StringInfoData +{ + char *data; + int len; + int maxlen; + int cursor; +} StringInfoData; + +typedef StringInfoData *StringInfo; + + +/*------------------------ + * There are two ways to create a StringInfo object initially: + * + * StringInfo stringptr = makeStringInfo(); + * Both the StringInfoData and the data buffer are palloc'd. + * + * StringInfoData string; + * initStringInfo(&string); + * The data buffer is palloc'd but the StringInfoData is just local. + * This is the easiest approach for a StringInfo object that will + * only live as long as the current routine. + * + * To destroy a StringInfo, pfree() the data buffer, and then pfree() the + * StringInfoData if it was palloc'd. There's no special support for this. + * + * NOTE: some routines build up a string using StringInfo, and then + * release the StringInfoData but return the data string itself to their + * caller. At that point the data string looks like a plain palloc'd + * string. + *------------------------- + */ + +/*------------------------ + * makeStringInfo + * Create an empty 'StringInfoData' & return a pointer to it. + */ +extern StringInfo makeStringInfo(void); + +/*------------------------ + * initStringInfo + * Initialize a StringInfoData struct (with previously undefined contents) + * to describe an empty string. 
+ */ +extern void initStringInfo(StringInfo str); + +/*------------------------ + * resetStringInfo + * Clears the current content of the StringInfo, if any. The + * StringInfo remains valid. + */ +extern void resetStringInfo(StringInfo str); + +/*------------------------ + * appendStringInfo + * Format text data under the control of fmt (an sprintf-style format string) + * and append it to whatever is already in str. More space is allocated + * to str if necessary. This is sort of like a combination of sprintf and + * strcat. + */ +extern void +appendStringInfo(StringInfo str, const char *fmt,...) +/* This extension allows gcc to check the format string */ +__attribute__((format(printf, 2, 3))); + +/*------------------------ + * appendStringInfoVA + * Attempt to format text data under the control of fmt (an sprintf-style + * format string) and append it to whatever is already in str. If successful + * return true; if not (because there's not enough space), return false + * without modifying str. Typically the caller would enlarge str and retry + * on false return --- see appendStringInfo for standard usage pattern. + */ +extern bool appendStringInfoVA(StringInfo str, const char *fmt, va_list args); + +/*------------------------ + * appendStringInfoString + * Append a null-terminated string to str. + * Like appendStringInfo(str, "%s", s) but faster. + */ +extern void appendStringInfoString(StringInfo str, const char *s); + +/*------------------------ + * appendStringInfoChar + * Append a single byte to str. + * Like appendStringInfo(str, "%c", ch) but much faster. + */ +extern void appendStringInfoChar(StringInfo str, char ch); + +/*------------------------ + * appendStringInfoCharMacro + * As above, but a macro for even more speed where it matters. + * Caution: str argument will be evaluated multiple times. + */ +#define appendStringInfoCharMacro(str,ch) \ + (((str)->len + 1 >= (str)->maxlen) ? 
\ + appendStringInfoChar(str, ch) : \ + (void)((str)->data[(str)->len] = (ch), (str)->data[++(str)->len] = '\0')) + +/*------------------------ + * appendBinaryStringInfo + * Append arbitrary binary data to a StringInfo, allocating more space + * if necessary. + */ +extern void appendBinaryStringInfo(StringInfo str, + const char *data, int datalen); + +/*------------------------ + * enlargeStringInfo + * Make sure a StringInfo's buffer can hold at least 'needed' more bytes. + */ +extern void enlargeStringInfo(StringInfo str, int needed); + +#endif /* STRINGINFO_H */ diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index f255c44d1c..078b6733e7 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -6,6 +6,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * $PostgreSQL: pgsql/src/include/nodes/nodes.h,v 1.223 2009/06/11 14:49:11 momjian Exp $ * @@ -157,6 +158,9 @@ typedef enum NodeTag T_JoinExpr, T_FromExpr, T_IntoClause, +#ifdef PGXC + T_DistributeBy, +#endif /* * TAGS FOR EXPRESSION STATE NODES (execnodes.h) @@ -337,6 +341,7 @@ typedef enum NodeTag T_CreateUserMappingStmt, T_AlterUserMappingStmt, T_DropUserMappingStmt, + T_ExecDirectStmt, /* * TAGS FOR PARSE TREE NODES (parsenodes.h) diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 7793f66f20..e0515ba95d 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -12,6 +12,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * $PostgreSQL: pgsql/src/include/nodes/parsenodes.h,v 1.395 2009/06/18 01:27:02 tgl Exp $ * @@ -1335,6 +1336,9 @@ typedef struct CreateStmt List *options; 
/* options from WITH clause */ OnCommitAction oncommit; /* what do we do at COMMIT? */ char *tablespacename; /* table space to use, or NULL */ +#ifdef PGXC + DistributeBy *distributeby; /* distribution to use, or NULL */ +#endif } CreateStmt; /* ---------- @@ -2389,4 +2393,17 @@ typedef struct AlterTSConfigurationStmt bool missing_ok; /* for DROP - skip error if missing? */ } AlterTSConfigurationStmt; +/* PGXC_BEGIN */ +/* + * EXECUTE DIRECT statement + */ +typedef struct ExecDirectStmt +{ + NodeTag type; + bool coordinator; + List *nodes; + char *query; +} ExecDirectStmt; +/* PGXC_END */ + #endif /* PARSENODES_H */ diff --git a/src/include/nodes/primnodes.h b/src/include/nodes/primnodes.h index a41b0e2f7d..36c5e6e633 100644 --- a/src/include/nodes/primnodes.h +++ b/src/include/nodes/primnodes.h @@ -9,6 +9,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * $PostgreSQL: pgsql/src/include/nodes/primnodes.h,v 1.149 2009/06/11 14:49:11 momjian Exp $ * @@ -1174,4 +1175,30 @@ typedef struct FromExpr Node *quals; /* qualifiers on join, if any */ } FromExpr; +#ifdef PGXC +/*---------- + * DistributionType - how to distribute the data + * + *---------- + */ +typedef enum DistributionType +{ + DISTTYPE_REPLICATION, /* Replicated */ + DISTTYPE_HASH, /* Hash partitioned */ + DISTTYPE_ROUNDROBIN /* Round Robin */ +} DistributionType; + +/*---------- + * DistributeBy - represents a DISTRIBUTE BY clause in a CREATE TABLE statement + * + *---------- + */ +typedef struct DistributeBy +{ + NodeTag type; + DistributionType disttype; /* Distribution type */ + char *colname; /* Distribution column name */ +} DistributeBy; +#endif + #endif /* PRIMNODES_H */ diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h index 23f5d87a7a..aec7b6b3d9 100644 --- a/src/include/parser/kwlist.h +++ 
b/src/include/parser/kwlist.h @@ -9,6 +9,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * IDENTIFICATION * $PostgreSQL: pgsql/src/include/parser/kwlist.h,v 1.2 2009/04/06 08:42:53 heikki Exp $ @@ -90,6 +91,7 @@ PG_KEYWORD("constraints", CONSTRAINTS, UNRESERVED_KEYWORD) PG_KEYWORD("content", CONTENT_P, UNRESERVED_KEYWORD) PG_KEYWORD("continue", CONTINUE_P, UNRESERVED_KEYWORD) PG_KEYWORD("conversion", CONVERSION_P, UNRESERVED_KEYWORD) +PG_KEYWORD("coordinator", COORDINATOR, UNRESERVED_KEYWORD) PG_KEYWORD("copy", COPY, UNRESERVED_KEYWORD) PG_KEYWORD("cost", COST, UNRESERVED_KEYWORD) PG_KEYWORD("create", CREATE, RESERVED_KEYWORD) @@ -125,9 +127,13 @@ PG_KEYWORD("delimiter", DELIMITER, UNRESERVED_KEYWORD) PG_KEYWORD("delimiters", DELIMITERS, UNRESERVED_KEYWORD) PG_KEYWORD("desc", DESC, RESERVED_KEYWORD) PG_KEYWORD("dictionary", DICTIONARY, UNRESERVED_KEYWORD) +PG_KEYWORD("direct", DIRECT, UNRESERVED_KEYWORD) PG_KEYWORD("disable", DISABLE_P, UNRESERVED_KEYWORD) PG_KEYWORD("discard", DISCARD, UNRESERVED_KEYWORD) PG_KEYWORD("distinct", DISTINCT, RESERVED_KEYWORD) +#ifdef PGXC +PG_KEYWORD("distribute", DISTRIBUTE, UNRESERVED_KEYWORD) +#endif PG_KEYWORD("do", DO, RESERVED_KEYWORD) PG_KEYWORD("document", DOCUMENT_P, UNRESERVED_KEYWORD) PG_KEYWORD("domain", DOMAIN_P, UNRESERVED_KEYWORD) @@ -169,6 +175,9 @@ PG_KEYWORD("granted", GRANTED, UNRESERVED_KEYWORD) PG_KEYWORD("greatest", GREATEST, COL_NAME_KEYWORD) PG_KEYWORD("group", GROUP_P, RESERVED_KEYWORD) PG_KEYWORD("handler", HANDLER, UNRESERVED_KEYWORD) +#ifdef PGXC +PG_KEYWORD("hash", HASH, UNRESERVED_KEYWORD) +#endif PG_KEYWORD("having", HAVING, RESERVED_KEYWORD) PG_KEYWORD("header", HEADER_P, UNRESERVED_KEYWORD) PG_KEYWORD("hold", HOLD, UNRESERVED_KEYWORD) @@ -243,6 +252,7 @@ PG_KEYWORD("no", NO, UNRESERVED_KEYWORD) PG_KEYWORD("nocreatedb", 
NOCREATEDB, UNRESERVED_KEYWORD) PG_KEYWORD("nocreaterole", NOCREATEROLE, UNRESERVED_KEYWORD) PG_KEYWORD("nocreateuser", NOCREATEUSER, UNRESERVED_KEYWORD) +PG_KEYWORD("node", NODE, UNRESERVED_KEYWORD) PG_KEYWORD("noinherit", NOINHERIT, UNRESERVED_KEYWORD) PG_KEYWORD("nologin", NOLOGIN_P, UNRESERVED_KEYWORD) PG_KEYWORD("none", NONE, COL_NAME_KEYWORD) @@ -308,6 +318,9 @@ PG_KEYWORD("rename", RENAME, UNRESERVED_KEYWORD) PG_KEYWORD("repeatable", REPEATABLE, UNRESERVED_KEYWORD) PG_KEYWORD("replace", REPLACE, UNRESERVED_KEYWORD) PG_KEYWORD("replica", REPLICA, UNRESERVED_KEYWORD) +#ifdef PGXC +PG_KEYWORD("replication", REPLICATION, UNRESERVED_KEYWORD) +#endif PG_KEYWORD("reset", RESET, UNRESERVED_KEYWORD) PG_KEYWORD("restart", RESTART, UNRESERVED_KEYWORD) PG_KEYWORD("restrict", RESTRICT, UNRESERVED_KEYWORD) @@ -315,8 +328,14 @@ PG_KEYWORD("returning", RETURNING, RESERVED_KEYWORD) PG_KEYWORD("returns", RETURNS, UNRESERVED_KEYWORD) PG_KEYWORD("revoke", REVOKE, UNRESERVED_KEYWORD) PG_KEYWORD("right", RIGHT, TYPE_FUNC_NAME_KEYWORD) +#ifdef PGXC +PG_KEYWORD("robin", ROBIN, UNRESERVED_KEYWORD) +#endif PG_KEYWORD("role", ROLE, UNRESERVED_KEYWORD) PG_KEYWORD("rollback", ROLLBACK, UNRESERVED_KEYWORD) +#ifdef PGXC +PG_KEYWORD("round", ROUND, UNRESERVED_KEYWORD) +#endif PG_KEYWORD("row", ROW, COL_NAME_KEYWORD) PG_KEYWORD("rows", ROWS, UNRESERVED_KEYWORD) PG_KEYWORD("rule", RULE, UNRESERVED_KEYWORD) diff --git a/src/include/parser/parse_utilcmd.h b/src/include/parser/parse_utilcmd.h index 089c907c0e..319699381d 100644 --- a/src/include/parser/parse_utilcmd.h +++ b/src/include/parser/parse_utilcmd.h @@ -6,6 +6,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * $PostgreSQL: pgsql/src/include/parser/parse_utilcmd.h,v 1.4 2009/01/01 17:24:00 momjian Exp $ * @@ -24,5 +25,8 @@ extern IndexStmt 
*transformIndexStmt(IndexStmt *stmt, const char *queryString); extern void transformRuleStmt(RuleStmt *stmt, const char *queryString, List **actions, Node **whereClause); extern List *transformCreateSchemaStmt(CreateSchemaStmt *stmt); +#ifdef PGXC +extern bool CheckLocalIndexColumn (char loctype, char *partcolname, char *indexcolname); +#endif #endif /* PARSE_UTILCMD_H */ diff --git a/src/include/pgxc/combiner.h b/src/include/pgxc/combiner.h new file mode 100644 index 0000000000..8c02627b57 --- /dev/null +++ b/src/include/pgxc/combiner.h @@ -0,0 +1,63 @@ +/*------------------------------------------------------------------------- + * + * combiner.h + * + * Combine responses from multiple Data Nodes + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group ? + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * IDENTIFICATION + * $$ + * + *------------------------------------------------------------------------- + */ + +#ifndef COMBINER_H +#define COMBINER_H + +#include "postgres.h" +#include "tcop/dest.h" + +typedef enum +{ + COMBINE_TYPE_NONE, /* it is known that no row count, do not parse */ + COMBINE_TYPE_SUM, /* sum row counts (partitioned, round robin) */ + COMBINE_TYPE_AVG /* calculate average (replicated) */ +} CombineType; + +typedef enum +{ + REQUEST_TYPE_NOT_DEFINED, /* not determined yet */ + REQUEST_TYPE_COMMAND, /* OK or row count response */ + REQUEST_TYPE_QUERY, /* Row description response */ + REQUEST_TYPE_COPY_IN, /* Copy In response */ + REQUEST_TYPE_COPY_OUT /* Copy Out response */ +} RequestType; + + +typedef struct +{ + int node_count; + CombineType combine_type; + CommandDest dest; + int command_complete_count; + int row_count; + RequestType request_type; + int description_count; + List *simple_aggregates; +} ResponseCombinerData; + + +typedef ResponseCombinerData *ResponseCombiner; + +extern ResponseCombiner CreateResponseCombiner(int node_count, + CombineType combine_type, CommandDest 
dest); +extern int CombineResponse(ResponseCombiner combiner, char msg_type, + char *msg_body, size_t len); +extern bool ValidateAndCloseCombiner(ResponseCombiner combiner); +extern bool ValidateAndResetCombiner(ResponseCombiner combiner); +extern void AssignCombinerAggregates(ResponseCombiner combiner, List *simple_aggregates); + +#endif /* COMBINER_H */ diff --git a/src/include/pgxc/datanode.h b/src/include/pgxc/datanode.h new file mode 100644 index 0000000000..e140445a28 --- /dev/null +++ b/src/include/pgxc/datanode.h @@ -0,0 +1,76 @@ +/*------------------------------------------------------------------------- + * + * datanode.h + * + * Utility functions to communicate to Data Node + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group ? + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * IDENTIFICATION + * $$ + * + *------------------------------------------------------------------------- + */ + +#ifndef DATANODE_H +#define DATANODE_H +#include "combiner.h" +#include "nodes/pg_list.h" +#include "utils/snapshot.h" +#include <unistd.h> + +/* Connection to data node maintained by Pool Manager */ +typedef struct PGconn NODE_CONNECTION; + +/* Helper structure to access data node from Session */ +typedef enum +{ + DN_CONNECTION_STATE_IDLE, + DN_CONNECTION_STATE_BUSY, + DN_CONNECTION_STATE_COMPLETED, + DN_CONNECTION_STATE_ERROR + +} DNConnectionState; + +struct data_node_handle +{ + /* fd of the connection */ + int sock; + /* Connection state */ + char transaction_status; + DNConnectionState state; + char *error; + /* Output buffer */ + char *outBuffer; + size_t outSize; + size_t outEnd; + /* Input buffer */ + char *inBuffer; + size_t inSize; + size_t inStart; + size_t inEnd; + size_t inCursor; +}; +typedef struct data_node_handle DataNodeHandle; + +extern void InitMultinodeExecutor(void); + +/* Open/close connection routines (invoked from Pool Manager) */ +extern char *DataNodeConnStr(char *host, char *port, 
char *dbname, char *user, + char *password); +extern NODE_CONNECTION *DataNodeConnect(char *connstr); +extern void DataNodeClose(NODE_CONNECTION * conn); +extern int DataNodeConnected(NODE_CONNECTION * conn); +extern int DataNodeConnClean(NODE_CONNECTION * conn); +extern void DataNodeCleanAndRelease(int code, Datum arg); + +/* Multinode Executor */ +extern void DataNodeBegin(void); +extern int DataNodeCommit(CommandDest dest); +extern int DataNodeRollback(CommandDest dest); + +extern int DataNodeExec(const char *query, List *nodelist, CommandDest dest, Snapshot snapshot, bool force_autocommit, List *simple_aggregates, bool is_read_only); + +#endif diff --git a/src/include/pgxc/locator.h b/src/include/pgxc/locator.h new file mode 100644 index 0000000000..1320b3c6f6 --- /dev/null +++ b/src/include/pgxc/locator.h @@ -0,0 +1,66 @@ +/*------------------------------------------------------------------------- + * + * locator.h + * Externally declared locator functions + * + * + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * + * IDENTIFICATION + * $$ + * + *------------------------------------------------------------------------- + */ +#ifndef LOCATOR_H +#define LOCATOR_H + +#define LOCATOR_TYPE_REPLICATED 'R' +#define LOCATOR_TYPE_HASH 'H' +#define LOCATOR_TYPE_RANGE 'G' +#define LOCATOR_TYPE_SINGLE 'S' +#define LOCATOR_TYPE_RROBIN 'N' +#define LOCATOR_TYPE_CUSTOM 'C' + +#define HASH_SIZE 4096 +#define HASH_MASK 0x00000FFF /* HASH_SIZE - 1; no trailing ';' so the macro can be used inside expressions */ + +#include "utils/relcache.h" + + +typedef int PartAttrNumber; + +typedef struct +{ + Oid relid; + char locatorType; + PartAttrNumber partAttrNum; /* if partitioned */ + char *partAttrName; /* if partitioned */ + int nodeCount; + List *nodeList; + ListCell *roundRobinNode; /* points to next one to use */ +} RelationLocInfo; + + +extern char *PreferredDataNodes; + +extern void InitRelationLocInfo(); +extern char GetLocatorType(Oid relid); +extern char ConvertToLocatorType(int disttype); + +extern char 
*GetRelationHashColumn(RelationLocInfo * rel_loc_info); +extern RelationLocInfo *GetRelationLocInfo(Oid relid); +extern RelationLocInfo *CopyRelationLocInfo(RelationLocInfo * src_info); +extern List *GetRelationNodes(RelationLocInfo * rel_loc_info, long *partValue, + int isRead); +extern bool IsHashColumn(RelationLocInfo * rel_loc_info, char *part_col_name); +extern bool IsHashColumnForRelId(Oid relid, char *part_col_name); +extern int GetRoundRobinNode(Oid relid); + +extern bool IsHashDistributable(Oid col_type); +extern List *GetAllNodes(void); +extern int GetAnyDataNode(void); +extern void RelationBuildLocator(Relation rel); +extern void FreeRelationLocInfo(RelationLocInfo * relationLocInfo); + +#endif /* LOCATOR_H */ diff --git a/src/include/pgxc/pgxc.h b/src/include/pgxc/pgxc.h new file mode 100644 index 0000000000..09ff2c0ada --- /dev/null +++ b/src/include/pgxc/pgxc.h @@ -0,0 +1,23 @@ +/*------------------------------------------------------------------------- + * + * pgxc.h + * PG-XC + * + * + * Portions Copyright (c) 1996-2010 PostgreSQL Global Development Group + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * IDENTIFICATION + * $$ + * + *------------------------------------------------------------------------- + */ +#ifdef PGXC + +extern bool isPGXCCoordinator; +extern bool isPGXCDataNode; + +#define IS_PGXC_COORDINATOR isPGXCCoordinator +#define IS_PGXC_DATANODE isPGXCDataNode + +#endif /* PGXC */ diff --git a/src/include/pgxc/planner.h b/src/include/pgxc/planner.h new file mode 100644 index 0000000000..eda25a72bb --- /dev/null +++ b/src/include/pgxc/planner.h @@ -0,0 +1,86 @@ +/*------------------------------------------------------------------------- + * + * planner.h + * Externally declared planner functions + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group ? 
+ * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * IDENTIFICATION + * $$ + * + *------------------------------------------------------------------------- + */ +#ifndef PGXCPLANNER_H +#define PGXCPLANNER_H + +/* for Query_Plan.exec_loc_type can have these OR'ed*/ +#define EXEC_ON_COORD 0x1 +#define EXEC_ON_DATA_NODES 0x2 + +/* Contains instructions on processing a step of a query. + * In the prototype this will be simple, but it will eventually + * evolve into a GridSQL-style QueryStep. + */ +typedef struct +{ + char *sql_statement; + List *nodelist; + List *simple_aggregates; /* simple aggregate to combine on this + * step */ +} Query_Step; + + +/* + * The PGXC plan to execute. + * In the prototype this will be simple, and queryStepList will + * contain just one step. + */ +typedef struct +{ + int exec_loc_type; + bool force_autocommit; /* For CREATE DATABASE */ + List *query_step_list; /* List of QuerySteps */ +} Query_Plan; + + +/* For handling simple aggregates (no group by present) + * For now, only MAX will be supported. 
+ */ +typedef enum +{ + AGG_TYPE_MAX, + AGG_TYPE_MIN, + AGG_TYPE_COUNT, + AGG_TYPE_SUM, + AGG_TYPE_AVG +} SimpleAggType; + + +/* For handling simple aggregates */ +/* For now, only support int/long types */ +typedef struct +{ + int agg_type; /* SimpleAggType enum */ + int column_pos; /* Only use 1 for now */ + unsigned long ulong_value; + /* Datum agg_value; PGXCTODO - use Datum, support more types */ + int data_len; + int agg_data_type; + int response_count; +} SimpleAgg; + +/* forbid SQL if unsafe, useful to turn off for development */ +extern bool StrictStatementChecking; + +/* forbid SELECT even multi-node ORDER BY */ +extern bool StrictSelectChecking; + +extern Query_Plan * + GetQueryPlan(Node *parsetree, const char *sql_statement, List *querytree_list); +extern void + FreeQueryPlan(Query_Plan * query_plan); +extern bool IsHashDistributable(Oid col_type); + +#endif /* PGXCPLANNER_H */ diff --git a/src/include/pgxc/poolcomm.h b/src/include/pgxc/poolcomm.h new file mode 100644 index 0000000000..3c62f0662e --- /dev/null +++ b/src/include/pgxc/poolcomm.h @@ -0,0 +1,49 @@ +/*------------------------------------------------------------------------- + * + * poolcomm.h + * + * Definitions for the Pooler-Session communications. 
+ * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * IDENTIFICATION + * $$ + * + *------------------------------------------------------------------------- + */ +#ifndef POOLCOMM_H +#define POOLCOMM_H + +#include "lib/stringinfo.h" + +#define POOL_BUFFER_SIZE 1024 +#define Socket(port) (port).fdsock + +typedef struct +{ + /* file descriptors */ + int fdsock; + /* receive buffer */ + int RecvLength; + int RecvPointer; + char RecvBuffer[POOL_BUFFER_SIZE]; + /* send buffer */ + int SendPointer; + char SendBuffer[POOL_BUFFER_SIZE]; +} PoolPort; + +extern int pool_listen(unsigned short port, const char *unixSocketName); +extern int pool_connect(unsigned short port, const char *unixSocketName); +extern int pool_getbyte(PoolPort * port); +extern int pool_pollbyte(PoolPort * port); +extern int pool_getmessage(PoolPort * port, StringInfo s, int maxlen); +extern int pool_getbytes(PoolPort * port, char *s, size_t len); +extern int pool_putmessage(PoolPort * port, char msgtype, const char *s, size_t len); +extern int pool_putbytes(PoolPort * port, const char *s, size_t len); +extern int pool_flush(PoolPort * port); +extern int pool_sendfds(PoolPort * port, int *fds, int count); +extern int pool_recvfds(PoolPort * port, int *fds, int count); + +#endif /* POOLCOMM_H */ diff --git a/src/include/pgxc/poolmgr.h b/src/include/pgxc/poolmgr.h new file mode 100644 index 0000000000..6e88fca3bc --- /dev/null +++ b/src/include/pgxc/poolmgr.h @@ -0,0 +1,130 @@ +/*------------------------------------------------------------------------- + * + * poolmgr.h + * + * Definitions for the data nodes connection pool. 
+ * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation + * + * IDENTIFICATION + * $$ + * + *------------------------------------------------------------------------- + */ + +#ifndef POOLMGR_H +#define POOLMGR_H +#include <sys/time.h> +#include "datanode.h" +#include "poolcomm.h" +#include "storage/pmsignal.h" + +#define MAX_IDLE_TIME 60 + +/* TODO move? */ +typedef struct +{ + char *host; + char *port; + char *uname; + char *password; +} DataNodeConnectionInfo; + +/* Connection pool entry */ +typedef struct +{ + struct timeval released; + NODE_CONNECTION *conn; +} DataNodePoolSlot; + +/* Pool of connections to specified data nodes */ +typedef struct +{ + char *connstr; + int freeSize; /* available connections */ + int size; /* total pool size */ + DataNodePoolSlot **slot; +} DataNodePool; + +/* All pools for specified database */ +typedef struct databasepool +{ + Oid databaseId; + char *database; + DataNodePool **nodePools; /* one for each data node */ + struct databasepool *next; +} DatabasePool; + +/* Agent of client session (Pool Manager side) + * Acts as a session manager, grouping connections together + */ +typedef struct +{ + /* communication channel */ + PoolPort port; + DatabasePool *pool; + DataNodePoolSlot **connections; /* one for each data node */ +} PoolAgent; + +/* Handle to the pool manager (Session's side) */ +typedef struct +{ + /* communication channel */ + PoolPort port; +} PoolHandle; + +extern int NumDataNodes; +extern int MinPoolSize; +extern int MaxPoolSize; +extern int PoolerPort; + +extern bool PersistentConnections; + +extern char *DataNodeHosts; +extern char *DataNodePorts; +extern char *DataNodeUsers; +extern char *DataNodePwds; + +/* Initialize internal structures */ +extern int PoolManagerInit(void); + +/* Destroy internal structures */ +extern int PoolManagerDestroy(void); + +/* + * Get handle to pool manager. 
This function should be called just before + * forking off new session. It creates PoolHandle, PoolAgent and a pipe between + * them. PoolAgent is stored within Postmaster's memory context and Session + * closes it later. PoolHandle is returned and should be stored in a local + * variable. After forking off it can be stored in global memory, so it will + * only be accessible by the process running the session. + */ +extern PoolHandle *GetPoolManagerHandle(void); + +/* + * Called from Postmaster(Coordinator) after fork. Close one end of the pipe and + * free memory occupied by PoolHandle + */ +extern void PoolManagerCloseHandle(PoolHandle * handle); + +/* + * Gracefully close connection to the PoolManager + */ +extern void PoolManagerDisconnect(PoolHandle * handle); + +/* + * Called from Session process after fork(). Associate handle with session + * for subsequent calls. Associate session with specified database and + * initialize respective connection pool + */ +extern void PoolManagerConnect(PoolHandle * handle, const char *database, List *nodes); + +/* Get pooled connections */ +extern int *PoolManagerGetConnections(List *nodelist); + +/* Return connections back to the pool */ +extern void PoolManagerReleaseConnections(void); + +#endif diff --git a/src/include/postgres.h b/src/include/postgres.h index c1e4f77386..e8bfd5a391 100644 --- a/src/include/postgres.h +++ b/src/include/postgres.h @@ -9,6 +9,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1995, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * $PostgreSQL: pgsql/src/include/postgres.h,v 1.92 2009/01/01 17:23:55 momjian Exp $ * @@ -693,4 +694,7 @@ extern int ExceptionalCondition(const char *conditionName, const char *errorType, const char *fileName, int lineNumber); +//#define PGXC_COORD // for PGXC coordinator compiling +//#define PGXC_DATANODE // for PGXC data node compiling + 
#endif /* POSTGRES_H */ diff --git a/src/include/postmaster/autovacuum.h b/src/include/postmaster/autovacuum.h index 3175487af3..952291bcb0 100644 --- a/src/include/postmaster/autovacuum.h +++ b/src/include/postmaster/autovacuum.h @@ -6,6 +6,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * $PostgreSQL: pgsql/src/include/postmaster/autovacuum.h,v 1.15 2009/01/01 17:24:01 momjian Exp $ * @@ -60,4 +61,8 @@ extern void AutovacuumLauncherIAm(void); extern Size AutoVacuumShmemSize(void); extern void AutoVacuumShmemInit(void); +#ifdef PGXC /* PGXC_DATANODE */ +bool IsAutoVacuumWorkerProcess(void); +#endif + #endif /* AUTOVACUUM_H */ diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index b250d3f0f2..66a920ded0 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -143,8 +143,9 @@ typedef struct PROC_HDR * normal operation. Startup process also consumes one slot, but WAL * writer and autovacuum launcher are launched only after it has * exited. 
+ * The pool manager process is also added */ -#define NUM_AUXILIARY_PROCS 3 +#define NUM_AUXILIARY_PROCS 4 /* configurable options */ diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index fab84ee1a0..4431e1bc54 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -6,6 +6,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * $PostgreSQL: pgsql/src/include/storage/procarray.h,v 1.26 2009/06/11 14:49:12 momjian Exp $ * @@ -26,6 +27,10 @@ extern void ProcArrayRemove(PGPROC *proc, TransactionId latestXid); extern void ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid); extern void ProcArrayClearTransaction(PGPROC *proc); +#ifdef PGXC /* PGXC_DATANODE */ +extern void SetGlobalSnapshotData(int xmin, int xmax, int xcnt, int *xip); +extern void UnsetGlobalSnapshotData(void); +#endif /* PGXC */ extern Snapshot GetSnapshotData(Snapshot snapshot); extern bool TransactionIdIsInProgress(TransactionId xid); diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h index b50944a547..9c87386288 100644 --- a/src/include/utils/guc_tables.h +++ b/src/include/utils/guc_tables.h @@ -76,7 +76,9 @@ enum config_group COMPAT_OPTIONS_CLIENT, PRESET_OPTIONS, CUSTOM_OPTIONS, - DEVELOPER_OPTIONS + DEVELOPER_OPTIONS, + DATA_NODES, + GTM }; /* diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index ca9913bda3..5f3a482877 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -6,6 +6,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.114 2009/06/11 14:49:13 momjian Exp $ * @@ -20,6 +21,9 
@@ #include "catalog/pg_index.h" #include "fmgr.h" #include "nodes/bitmapset.h" +#ifdef PGXC +#include "pgxc/locator.h" +#endif #include "rewrite/prs2lock.h" #include "storage/block.h" #include "storage/relfilenode.h" @@ -205,6 +209,9 @@ typedef struct RelationData /* use "struct" here to avoid needing to include pgstat.h: */ struct PgStat_TableStatus *pgstat_info; /* statistics collection area */ +#ifdef PGXC + RelationLocInfo *rd_locator_info; +#endif } RelationData; /* diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index e5003b669a..835ba95291 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -5,6 +5,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * $PostgreSQL: pgsql/src/include/utils/snapshot.h,v 1.5 2009/06/11 14:49:13 momjian Exp $ * @@ -46,7 +47,11 @@ typedef struct SnapshotData */ TransactionId xmin; /* all XID < xmin are visible to me */ TransactionId xmax; /* all XID >= xmax are invisible to me */ + TransactionId recent_global_xmin; uint32 xcnt; /* # of xact ids in xip[] */ +#ifdef PGXC /* PGXC_COORD */ + uint32 max_xcnt; /* Max # of xact in xip[] */ +#endif TransactionId *xip; /* array of xact IDs in progress */ /* note: all ids in xip[] satisfy xmin <= xip[i] < xmax */ int32 subxcnt; /* # of xact ids in subxip[], -1 if overflow */ diff --git a/src/include/utils/syscache.h b/src/include/utils/syscache.h index 1428b28d15..e038041519 100644 --- a/src/include/utils/syscache.h +++ b/src/include/utils/syscache.h @@ -8,6 +8,7 @@ * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010 Nippon Telegraph and Telephone Corporation * * $PostgreSQL: pgsql/src/include/utils/syscache.h,v 1.74 2009/01/01 17:24:02 
momjian Exp $ * @@ -64,6 +65,9 @@ enum SysCacheIdentifier OPEROID, OPFAMILYAMNAMENSP, OPFAMILYOID, +#ifdef PGXC + PGXCCLASSRELID, +#endif PROCNAMEARGSNSP, PROCOID, RELNAMENSP, |